melihcatal committed
Commit 179d9b2 · verified · 1 Parent(s): e426ee7

Add files using upload-large-folder tool

Files changed (50)
  1. deepseek-coder-6.7b/base/adapter/README.md +207 -0
  2. deepseek-coder-6.7b/base/adapter/adapter_config.json +46 -0
  3. deepseek-coder-6.7b/base/audit_results.json +137 -0
  4. deepseek-coder-6.7b/base/canary_meta.json +0 -0
  5. deepseek-coder-6.7b/base/codecarbon.csv +2 -0
  6. deepseek-coder-6.7b/base/epochs/epoch_001/adapter/README.md +207 -0
  7. deepseek-coder-6.7b/base/epochs/epoch_001/adapter/adapter_config.json +46 -0
  8. deepseek-coder-6.7b/base/epochs/epoch_001/audit_results.json +137 -0
  9. deepseek-coder-6.7b/base/epochs/epoch_002/adapter/adapter_config.json +46 -0
  10. deepseek-coder-6.7b/base/metrics.jsonl +55 -0
  11. deepseek-coder-6.7b/base/resolved_config.yaml +100 -0
  12. deepseek-coder-6.7b/base/scalars.csv +591 -0
  13. deepseek-coder-6.7b/base/summary.json +71 -0
  14. deepseek-coder-6.7b/base/tokenizer/chat_template.jinja +26 -0
  15. deepseek-coder-6.7b/base/tokenizer/tokenizer.json +0 -0
  16. deepseek-coder-6.7b/base/tokenizer/tokenizer_config.json +516 -0
  17. deepseek-coder-6.7b/base/train.log +49 -0
  18. deepseek-coder-6.7b/dp3/adapter/adapter_config.json +46 -0
  19. deepseek-coder-6.7b/dp3/audit_results.json +137 -0
  20. deepseek-coder-6.7b/dp3/canary_meta.json +0 -0
  21. deepseek-coder-6.7b/dp3/codecarbon.csv +2 -0
  22. deepseek-coder-6.7b/dp3/metrics.jsonl +30 -0
  23. deepseek-coder-6.7b/dp3/resolved_config.yaml +101 -0
  24. deepseek-coder-6.7b/dp3/scalars.csv +386 -0
  25. deepseek-coder-6.7b/dp3/summary.json +72 -0
  26. deepseek-coder-6.7b/dp3/tokenizer/tokenizer_config.json +516 -0
  27. deepseek-coder-6.7b/dp3/train.log +24 -0
  28. deepseek-coder-6.7b/dp8/audit_results.json +137 -0
  29. deepseek-coder-6.7b/dp8/canary_meta.json +0 -0
  30. deepseek-coder-6.7b/dp8/codecarbon.csv +2 -0
  31. deepseek-coder-6.7b/dp8/epochs/epoch_001/adapter/README.md +207 -0
  32. deepseek-coder-6.7b/dp8/epochs/epoch_001/adapter/adapter_config.json +46 -0
  33. deepseek-coder-6.7b/dp8/epochs/epoch_001/audit_results.json +137 -0
  34. deepseek-coder-6.7b/dp8/epochs/epoch_002/adapter/README.md +207 -0
  35. deepseek-coder-6.7b/dp8/epochs/epoch_002/adapter/adapter_config.json +46 -0
  36. deepseek-coder-6.7b/dp8/epochs/epoch_002/audit_results.json +137 -0
  37. deepseek-coder-6.7b/dp8/metrics.jsonl +30 -0
  38. deepseek-coder-6.7b/dp8/resolved_config.yaml +101 -0
  39. deepseek-coder-6.7b/dp8/scalars.csv +386 -0
  40. deepseek-coder-6.7b/dp8/summary.json +72 -0
  41. deepseek-coder-6.7b/dp8/tokenizer/tokenizer.json +0 -0
  42. deepseek-coder-6.7b/dp8/train.log +24 -0
  43. granite-4.0-h-tiny/base/canary_meta.json +0 -0
  44. granite-4.0-h-tiny/base/resolved_config.yaml +110 -0
  45. granite-4.0-h-tiny/base/scalars.csv +35 -0
  46. granite-4.0-h-tiny/base/summary.json +14 -0
  47. granite-4.0-h-tiny/base/tokenizer/chat_template.jinja +118 -0
  48. granite-4.0-h-tiny/base/tokenizer/tokenizer.json +0 -0
  49. granite-4.0-h-tiny/base/tokenizer/tokenizer_config.json +516 -0
  50. granite-4.0-h-tiny/base/train.log +3 -0
deepseek-coder-6.7b/base/adapter/README.md ADDED
@@ -0,0 +1,207 @@
1
+ ---
2
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
deepseek-coder-6.7b/base/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": true,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [
25
+ "lm_head",
26
+ "embed_tokens"
27
+ ],
28
+ "peft_type": "LORA",
29
+ "peft_version": "0.18.1",
30
+ "qalora_group_size": 16,
31
+ "r": 16,
32
+ "rank_pattern": {},
33
+ "revision": null,
34
+ "target_modules": [
35
+ "k_proj",
36
+ "v_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
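
The `adapter_config.json` above describes a LoRA adapter over the attention projections (r=16, alpha=32, dropout 0.05 on `q_proj`/`k_proj`/`v_proj`/`o_proj`), with `lm_head` and `embed_tokens` kept as fully trained modules via `modules_to_save`. As a hedged illustration — not part of this commit — the sketch below shows one way to load such an adapter with `peft` and `transformers`, assuming a local clone of this repo and that the adapter weights (`adapter_model.safetensors`) sit next to this config:

```python
# Minimal sketch (assumption: repo cloned locally, adapter weights present).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

ADAPTER_DIR = "deepseek-coder-6.7b/base/adapter"              # path inside this repo
BASE_MODEL = "deepseek-ai/deepseek-coder-6.7b-instruct"       # from adapter_config.json

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16)

# adapter_config.json lists lm_head/embed_tokens under modules_to_save, so the
# checkpoint carries full copies of those modules in addition to the LoRA deltas.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```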
deepseek-coder-6.7b/base/audit_results.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "delta": 1e-05,
3
+ "num_canaries": 500,
4
+ "num_members": 250,
5
+ "paper_guess_fraction": 0.2,
6
+ "paper_guess_steps": 20,
7
+ "loss": {
8
+ "auc": 0.957184,
9
+ "empirical_epsilon": {
10
+ "0.05": 3.4791953936219215,
11
+ "0.01": 3.023197554051876
12
+ },
13
+ "empirical_epsilon_details": {
14
+ "0.05": {
15
+ "epsilon": 3.4791953936219215,
16
+ "num_guesses": 100,
17
+ "correct_guesses": 100,
18
+ "candidate_num_guesses": [
19
+ 5,
20
+ 10,
21
+ 15,
22
+ 20,
23
+ 25,
24
+ 30,
25
+ 35,
26
+ 40,
27
+ 45,
28
+ 50,
29
+ 55,
30
+ 60,
31
+ 65,
32
+ 70,
33
+ 75,
34
+ 80,
35
+ 85,
36
+ 90,
37
+ 95,
38
+ 100
39
+ ],
40
+ "direction": "lower"
41
+ },
42
+ "0.01": {
43
+ "epsilon": 3.023197554051876,
44
+ "num_guesses": 100,
45
+ "correct_guesses": 100,
46
+ "candidate_num_guesses": [
47
+ 5,
48
+ 10,
49
+ 15,
50
+ 20,
51
+ 25,
52
+ 30,
53
+ 35,
54
+ 40,
55
+ 45,
56
+ 50,
57
+ 55,
58
+ 60,
59
+ 65,
60
+ 70,
61
+ 75,
62
+ 80,
63
+ 85,
64
+ 90,
65
+ 95,
66
+ 100
67
+ ],
68
+ "direction": "lower"
69
+ }
70
+ }
71
+ },
72
+ "embedding": {
73
+ "auc": 0.968208,
74
+ "empirical_epsilon": {
75
+ "0.05": 3.4791953936219215,
76
+ "0.01": 3.023197554051876
77
+ },
78
+ "empirical_epsilon_details": {
79
+ "0.05": {
80
+ "epsilon": 3.4791953936219215,
81
+ "num_guesses": 100,
82
+ "correct_guesses": 100,
83
+ "candidate_num_guesses": [
84
+ 5,
85
+ 10,
86
+ 15,
87
+ 20,
88
+ 25,
89
+ 30,
90
+ 35,
91
+ 40,
92
+ 45,
93
+ 50,
94
+ 55,
95
+ 60,
96
+ 65,
97
+ 70,
98
+ 75,
99
+ 80,
100
+ 85,
101
+ 90,
102
+ 95,
103
+ 100
104
+ ],
105
+ "direction": "lower"
106
+ },
107
+ "0.01": {
108
+ "epsilon": 3.023197554051876,
109
+ "num_guesses": 100,
110
+ "correct_guesses": 100,
111
+ "candidate_num_guesses": [
112
+ 5,
113
+ 10,
114
+ 15,
115
+ 20,
116
+ 25,
117
+ 30,
118
+ 35,
119
+ 40,
120
+ 45,
121
+ 50,
122
+ 55,
123
+ 60,
124
+ 65,
125
+ 70,
126
+ 75,
127
+ 80,
128
+ 85,
129
+ 90,
130
+ 95,
131
+ 100
132
+ ],
133
+ "direction": "lower"
134
+ }
135
+ }
136
+ }
137
+ }
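
`audit_results.json` records a canary-based membership-inference audit: the attack AUC and an empirical epsilon estimate at two confidence levels (0.05 and 0.01), reported for both a loss-based and an embedding-based attack signal. A small, hedged sketch for summarizing such a file (assuming it is read from a local clone of this repo, with the field names exactly as shown above):

```python
# Minimal sketch: print the headline numbers from an audit_results.json file.
import json

with open("deepseek-coder-6.7b/base/audit_results.json") as f:
    audit = json.load(f)

print(f"delta={audit['delta']}, canaries={audit['num_canaries']}, members={audit['num_members']}")
for signal in ("loss", "embedding"):
    result = audit[signal]
    print(f"[{signal}] attack AUC = {result['auc']:.4f}")
    for p_value, eps in result["empirical_epsilon"].items():
        details = result["empirical_epsilon_details"][p_value]
        print(f"  p={p_value}: empirical epsilon = {eps:.3f} "
              f"({details['correct_guesses']}/{details['num_guesses']} correct guesses)")
```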
deepseek-coder-6.7b/base/canary_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
deepseek-coder-6.7b/base/codecarbon.csv ADDED
@@ -0,0 +1,2 @@
1
+ timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,cpu_utilization_percent,gpu_utilization_percent,ram_utilization_percent,ram_used_gb,on_cloud,pue,wue
2
+ 2026-03-17T21:46:21,codedp-deepseek-coder-6.7b-cpt-base,064d823c-8d0a-48ad-b578-92d6f569557c,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,1810.5507336058654,0.09709380205154217,5.362666742741399e-05,72.0230906963752,4629.388481318127,54.0,0.03488049743955748,2.3248590518302024,0.026150659639004155,2.3858902089087644,0.0,Sweden,SWE,östergötland county,,,Linux-6.8.0-94-generic-x86_64-with-glibc2.39,3.11.0,3.2.3,256,AMD EPYC 9554 64-Core Processor,8,8 x NVIDIA H200,16.1885,58.594,1511.49019241333,machine,3.485983379501395,91.87222991689751,5.226869806094248,78.96254361435317,N,1.0,0.0
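
`codecarbon.csv` holds a single CodeCarbon summary row for the run. A hedged sketch for reading it with pandas — assuming CodeCarbon's usual units (energy columns in kWh, `emissions` in kg CO2-eq, `duration` in seconds):

```python
# Minimal sketch: summarize the CodeCarbon row above (unit assumptions noted in the lead-in).
import pandas as pd

df = pd.read_csv("deepseek-coder-6.7b/base/codecarbon.csv")
row = df.iloc[0]
print(f"duration: {row['duration'] / 3600:.2f} h")
print(f"energy consumed: {row['energy_consumed']:.2f} kWh "
      f"(gpu {row['gpu_energy']:.2f}, cpu {row['cpu_energy']:.3f}, ram {row['ram_energy']:.3f})")
print(f"emissions: {row['emissions'] * 1000:.1f} g CO2-eq in {row['region']}, {row['country_name']}")
```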
deepseek-coder-6.7b/base/epochs/epoch_001/adapter/README.md ADDED
@@ -0,0 +1,207 @@
1
+ ---
2
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
deepseek-coder-6.7b/base/epochs/epoch_001/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": true,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [
25
+ "lm_head",
26
+ "embed_tokens"
27
+ ],
28
+ "peft_type": "LORA",
29
+ "peft_version": "0.18.1",
30
+ "qalora_group_size": 16,
31
+ "r": 16,
32
+ "rank_pattern": {},
33
+ "revision": null,
34
+ "target_modules": [
35
+ "k_proj",
36
+ "v_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
deepseek-coder-6.7b/base/epochs/epoch_001/audit_results.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "delta": 1e-05,
3
+ "num_canaries": 500,
4
+ "num_members": 250,
5
+ "paper_guess_fraction": 0.2,
6
+ "paper_guess_steps": 20,
7
+ "loss": {
8
+ "auc": 0.923752,
9
+ "empirical_epsilon": {
10
+ "0.05": 3.4791953936219215,
11
+ "0.01": 3.023197554051876
12
+ },
13
+ "empirical_epsilon_details": {
14
+ "0.05": {
15
+ "epsilon": 3.4791953936219215,
16
+ "num_guesses": 100,
17
+ "correct_guesses": 100,
18
+ "candidate_num_guesses": [
19
+ 5,
20
+ 10,
21
+ 15,
22
+ 20,
23
+ 25,
24
+ 30,
25
+ 35,
26
+ 40,
27
+ 45,
28
+ 50,
29
+ 55,
30
+ 60,
31
+ 65,
32
+ 70,
33
+ 75,
34
+ 80,
35
+ 85,
36
+ 90,
37
+ 95,
38
+ 100
39
+ ],
40
+ "direction": "lower"
41
+ },
42
+ "0.01": {
43
+ "epsilon": 3.023197554051876,
44
+ "num_guesses": 100,
45
+ "correct_guesses": 100,
46
+ "candidate_num_guesses": [
47
+ 5,
48
+ 10,
49
+ 15,
50
+ 20,
51
+ 25,
52
+ 30,
53
+ 35,
54
+ 40,
55
+ 45,
56
+ 50,
57
+ 55,
58
+ 60,
59
+ 65,
60
+ 70,
61
+ 75,
62
+ 80,
63
+ 85,
64
+ 90,
65
+ 95,
66
+ 100
67
+ ],
68
+ "direction": "lower"
69
+ }
70
+ }
71
+ },
72
+ "embedding": {
73
+ "auc": 0.916456,
74
+ "empirical_epsilon": {
75
+ "0.05": 3.4791953936219215,
76
+ "0.01": 3.023197554051876
77
+ },
78
+ "empirical_epsilon_details": {
79
+ "0.05": {
80
+ "epsilon": 3.4791953936219215,
81
+ "num_guesses": 100,
82
+ "correct_guesses": 100,
83
+ "candidate_num_guesses": [
84
+ 5,
85
+ 10,
86
+ 15,
87
+ 20,
88
+ 25,
89
+ 30,
90
+ 35,
91
+ 40,
92
+ 45,
93
+ 50,
94
+ 55,
95
+ 60,
96
+ 65,
97
+ 70,
98
+ 75,
99
+ 80,
100
+ 85,
101
+ 90,
102
+ 95,
103
+ 100
104
+ ],
105
+ "direction": "lower"
106
+ },
107
+ "0.01": {
108
+ "epsilon": 3.023197554051876,
109
+ "num_guesses": 100,
110
+ "correct_guesses": 100,
111
+ "candidate_num_guesses": [
112
+ 5,
113
+ 10,
114
+ 15,
115
+ 20,
116
+ 25,
117
+ 30,
118
+ 35,
119
+ 40,
120
+ 45,
121
+ 50,
122
+ 55,
123
+ 60,
124
+ 65,
125
+ 70,
126
+ 75,
127
+ 80,
128
+ 85,
129
+ 90,
130
+ 95,
131
+ 100
132
+ ],
133
+ "direction": "lower"
134
+ }
135
+ }
136
+ }
137
+ }
deepseek-coder-6.7b/base/epochs/epoch_002/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": true,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [
25
+ "lm_head",
26
+ "embed_tokens"
27
+ ],
28
+ "peft_type": "LORA",
29
+ "peft_version": "0.18.1",
30
+ "qalora_group_size": 16,
31
+ "r": 16,
32
+ "rank_pattern": {},
33
+ "revision": null,
34
+ "target_modules": [
35
+ "k_proj",
36
+ "v_proj",
37
+ "o_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
deepseek-coder-6.7b/base/metrics.jsonl ADDED
@@ -0,0 +1,55 @@
1
+ {"timestamp": 1773782272.0493639, "event": "train_step", "step": 10, "epoch": 1, "metrics": {"train/step_loss": 14.555815444273108, "train/step_real_loss": 14.770240783691406, "train/lr": 4.761904761904762e-05, "train/step_canary_loss": 11.125, "perf/step_duration_sec": 4.0931806759908795, "perf/samples_per_sec": 8.30649870879918, "perf/tokens_per_sec": 5843.6218416402235, "perf/logical_batch_size": 34.0, "perf/logical_token_count": 23919.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1168704032898}}
2
+ {"timestamp": 1773782309.7872338, "event": "train_step", "step": 20, "epoch": 1, "metrics": {"train/step_loss": 12.852383613586426, "train/step_real_loss": 12.852383613586426, "train/lr": 9.523809523809524e-05, "perf/step_duration_sec": 3.573033411987126, "perf/samples_per_sec": 8.955975584399406, "perf/tokens_per_sec": 7755.035233378848, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27709.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1168704032898}}
3
+ {"timestamp": 1773782347.6002963, "event": "train_step", "step": 30, "epoch": 1, "metrics": {"train/step_loss": 9.603991746902466, "train/step_real_loss": 9.603991746902466, "train/lr": 9.98706541985615e-05, "perf/step_duration_sec": 3.6900156908668578, "perf/samples_per_sec": 8.672049844992005, "perf/tokens_per_sec": 7146.58207694638, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26371.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1168704032898}}
4
+ {"timestamp": 1773782386.0648503, "event": "train_step", "step": 40, "epoch": 1, "metrics": {"train/step_loss": 7.74934458732605, "train/step_real_loss": 7.74934458732605, "train/lr": 9.942439201095397e-05, "perf/step_duration_sec": 4.0733352322131395, "perf/samples_per_sec": 7.855970151176003, "perf/tokens_per_sec": 7048.523719075445, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 28711.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1168704032898}}
5
+ {"timestamp": 1773782423.7351854, "event": "train_step", "step": 50, "epoch": 1, "metrics": {"train/step_loss": 7.176188588142395, "train/step_real_loss": 7.176188588142395, "train/lr": 9.866246608261724e-05, "perf/step_duration_sec": 3.6839785198681056, "perf/samples_per_sec": 8.686261287198187, "perf/tokens_per_sec": 7128.163168807011, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26260.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1168704032898}}
6
+ {"timestamp": 1773782438.1861553, "event": "eval_step", "step": 50, "epoch": 1, "metrics": {"eval/loss": 6.95498745685274, "eval/duration_sec": 14.44864141382277}}
7
+ {"timestamp": 1773782475.968484, "event": "train_step", "step": 60, "epoch": 1, "metrics": {"train/step_loss": 6.844001889228821, "train/step_real_loss": 6.844001889228821, "train/lr": 9.7589742682592e-05, "perf/step_duration_sec": 3.950888440012932, "perf/samples_per_sec": 8.099444083492081, "perf/tokens_per_sec": 7022.471128015244, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27745.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
8
+ {"timestamp": 1773782514.3752964, "event": "train_step", "step": 70, "epoch": 1, "metrics": {"train/step_loss": 6.707658648490906, "train/step_real_loss": 6.707658648490906, "train/lr": 9.621307308142384e-05, "perf/step_duration_sec": 4.038279882632196, "perf/samples_per_sec": 7.924165964232782, "perf/tokens_per_sec": 6047.624411827906, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 24422.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
9
+ {"timestamp": 1773782552.1368628, "event": "train_step", "step": 80, "epoch": 1, "metrics": {"train/step_loss": 7.028713240342982, "train/step_real_loss": 6.692617297172546, "train/lr": 9.454124979346391e-05, "train/step_canary_loss": 12.40625, "perf/step_duration_sec": 3.830359708983451, "perf/samples_per_sec": 8.876450929728307, "perf/tokens_per_sec": 6924.153869360416, "perf/logical_batch_size": 34.0, "perf/logical_token_count": 26522.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.167274475097656, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
10
+ {"timestamp": 1773782590.0828424, "event": "train_step", "step": 90, "epoch": 1, "metrics": {"train/step_loss": 6.5469324588775635, "train/step_real_loss": 6.5469324588775635, "train/lr": 9.258495042083221e-05, "perf/step_duration_sec": 3.57225156808272, "perf/samples_per_sec": 8.957935741679826, "perf/tokens_per_sec": 7531.66441030924, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26905.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
11
+ {"timestamp": 1773782627.521805, "event": "train_step", "step": 100, "epoch": 1, "metrics": {"train/step_loss": 6.427393674850464, "train/step_real_loss": 6.427393674850464, "train/lr": 9.035666945770107e-05, "perf/step_duration_sec": 3.5687404102645814, "perf/samples_per_sec": 8.966749138704534, "perf/tokens_per_sec": 8349.164291776258, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 29796.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
12
+ {"timestamp": 1773782642.0074606, "event": "eval_step", "step": 100, "epoch": 1, "metrics": {"eval/loss": 6.393013093959201, "eval/duration_sec": 14.483735092915595}}
13
+ {"timestamp": 1773782680.1220238, "event": "train_step", "step": 110, "epoch": 1, "metrics": {"train/step_loss": 6.492933461160371, "train/step_real_loss": 6.355993866920471, "train/lr": 8.787063849045118e-05, "train/step_canary_loss": 10.875, "perf/step_duration_sec": 3.834493800997734, "perf/samples_per_sec": 8.606090324468228, "perf/tokens_per_sec": 7034.044491865366, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 26972.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
14
+ {"timestamp": 1773782717.354572, "event": "train_step", "step": 120, "epoch": 1, "metrics": {"train/step_loss": 6.159674048423767, "train/step_real_loss": 6.159674048423767, "train/lr": 8.5142735303366e-05, "perf/step_duration_sec": 3.8241200419142842, "perf/samples_per_sec": 8.367938152898931, "perf/tokens_per_sec": 7346.00370597615, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 28092.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.1198000907898}}
15
+ {"timestamp": 1773782755.7000217, "event": "train_step", "step": 130, "epoch": 1, "metrics": {"train/step_loss": 6.002715587615967, "train/step_real_loss": 6.002715587615967, "train/lr": 8.219038247038819e-05, "perf/step_duration_sec": 3.574957752134651, "perf/samples_per_sec": 8.951154732078276, "perf/tokens_per_sec": 7391.13629642826, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26423.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
16
+ {"timestamp": 1773782794.0098693, "event": "train_step", "step": 140, "epoch": 1, "metrics": {"train/step_loss": 5.99733579158783, "train/step_real_loss": 5.99733579158783, "train/lr": 7.903243608061246e-05, "perf/step_duration_sec": 3.9556330870836973, "perf/samples_per_sec": 8.089729076361857, "perf/tokens_per_sec": 7194.044385188417, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 28457.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
17
+ {"timestamp": 1773782832.684074, "event": "train_step", "step": 150, "epoch": 1, "metrics": {"train/step_loss": 6.004466891288757, "train/step_real_loss": 6.004466891288757, "train/lr": 7.568906530820282e-05, "perf/step_duration_sec": 3.8291429793462157, "perf/samples_per_sec": 8.356961380811027, "perf/tokens_per_sec": 6940.456426763558, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26576.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
18
+ {"timestamp": 1773782847.266101, "event": "eval_step", "step": 150, "epoch": 1, "metrics": {"eval/loss": 5.90363540568135, "eval/duration_sec": 14.580172989983112}}
19
+ {"timestamp": 1773782884.7874682, "event": "train_step", "step": 160, "epoch": 1, "metrics": {"train/step_loss": 5.9455406665802, "train/step_real_loss": 5.9455406665802, "train/lr": 7.21816235958972e-05, "perf/step_duration_sec": 3.6964078596793115, "perf/samples_per_sec": 8.65705333793339, "perf/tokens_per_sec": 6998.416024968714, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 25869.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
20
+ {"timestamp": 1773782922.1683276, "event": "train_step", "step": 170, "epoch": 1, "metrics": {"train/step_loss": 5.978999730312463, "train/step_real_loss": 5.847484111785889, "train/lr": 6.853251227482479e-05, "train/step_canary_loss": 10.1875, "perf/step_duration_sec": 3.8291215640492737, "perf/samples_per_sec": 8.618164622880943, "perf/tokens_per_sec": 7364.352248503628, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 28199.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.167274475097656, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
21
+ {"timestamp": 1773782960.5932062, "event": "train_step", "step": 180, "epoch": 1, "metrics": {"train/step_loss": 5.780862244692716, "train/step_real_loss": 5.639248490333557, "train/lr": 6.476503749166904e-05, "train/step_canary_loss": 10.3125, "perf/step_duration_sec": 3.831100419629365, "perf/samples_per_sec": 8.61371313341678, "perf/tokens_per_sec": 7198.453963435392, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 27578.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
22
+ {"timestamp": 1773782999.2272496, "event": "train_step", "step": 190, "epoch": 1, "metrics": {"train/step_loss": 5.710304021835327, "train/step_real_loss": 5.710304021835327, "train/lr": 6.090326135695403e-05, "perf/step_duration_sec": 3.831862016580999, "perf/samples_per_sec": 8.351031394536536, "perf/tokens_per_sec": 6408.111746651394, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 24555.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
23
+ {"timestamp": 1773783037.1072268, "event": "train_step", "step": 200, "epoch": 1, "metrics": {"train/step_loss": 5.810285813880689, "train/step_real_loss": 5.673497915267944, "train/lr": 5.697184826514057e-05, "train/step_canary_loss": 10.1875, "perf/step_duration_sec": 3.822298335842788, "perf/samples_per_sec": 8.633549006509913, "perf/tokens_per_sec": 5939.881716478821, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 22704.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 90.77418279647827}}
24
+ {"timestamp": 1773783051.6295593, "event": "eval_step", "step": 200, "epoch": 1, "metrics": {"eval/loss": 5.488488009030169, "eval/duration_sec": 14.520337663125247}}
25
+ {"timestamp": 1773783092.753959, "event": "train_epoch", "step": 207, "epoch": 1, "metrics": {"train/epoch_loss": 7.408701919082488, "train/epoch_real_loss": 7.374264005589601, "train/epoch_canary_loss": 11.38703087321013, "perf/epoch_duration_sec": 846.9085897100158, "perf/epoch_samples_per_sec": 63.12959940376397, "perf/epoch_tokens_per_sec": 51777.628108618774, "perf/epoch_samples": 53465.0, "perf/epoch_tokens": 43850918.0, "system/cuda_epoch_peak_memory_gb": 90.77418279647827, "eval/loss": 5.447803701866757, "eval/duration_sec": 14.591437232214957}}
26
+ {"timestamp": 1773783101.5745099, "event": "audit_epoch", "step": 207, "epoch": 1, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.923752, "audit/loss/empirical_epsilon/0.05": 3.4791953936219215, "audit/loss/empirical_epsilon/0.01": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 100.0, "audit/embedding/auc": 0.916456, "audit/embedding/empirical_epsilon/0.05": 3.4791953936219215, "audit/embedding/empirical_epsilon/0.01": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 100.0, "perf/audit_duration_sec": 6.246651589404792}}
27
+ {"timestamp": 1773783113.496209, "event": "train_step", "step": 210, "epoch": 2, "metrics": {"train/step_loss": 5.665921092033386, "train/step_real_loss": 5.665921092033386, "train/lr": 5.29959073680547e-05, "perf/step_duration_sec": 3.699097161181271, "perf/samples_per_sec": 8.650759524732544, "perf/tokens_per_sec": 6618.371708890693, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 24482.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 77.32090139389038}}
28
+ {"timestamp": 1773783152.2505124, "event": "train_step", "step": 220, "epoch": 2, "metrics": {"train/step_loss": 5.316019058227539, "train/step_real_loss": 5.316019058227539, "train/lr": 4.9000832207739676e-05, "perf/step_duration_sec": 3.963131622876972, "perf/samples_per_sec": 8.074422715430812, "perf/tokens_per_sec": 6834.746502902325, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27087.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 77.32090139389038}}
29
+ {"timestamp": 1773783190.084273, "event": "train_step", "step": 230, "epoch": 2, "metrics": {"train/step_loss": 5.287669539451599, "train/step_real_loss": 5.287669539451599, "train/lr": 4.501213853296425e-05, "perf/step_duration_sec": 3.7102178959175944, "perf/samples_per_sec": 8.624830373226882, "perf/tokens_per_sec": 7261.298596409544, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26941.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 77.32090139389038}}
30
+ {"timestamp": 1773783228.4977129, "event": "train_step", "step": 240, "epoch": 2, "metrics": {"train/step_loss": 5.252316236495972, "train/step_real_loss": 5.252316236495972, "train/lr": 4.1055301335220955e-05, "perf/step_duration_sec": 3.832394160795957, "perf/samples_per_sec": 8.349871818339755, "perf/tokens_per_sec": 6669.981981887527, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 25562.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
31
+ {"timestamp": 1773783266.2993257, "event": "train_step", "step": 250, "epoch": 2, "metrics": {"train/step_loss": 5.295487284660339, "train/step_real_loss": 5.295487284660339, "train/lr": 3.715559214503298e-05, "perf/step_duration_sec": 3.704376870766282, "perf/samples_per_sec": 8.63842992124625, "perf/tokens_per_sec": 6620.006779960055, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 24523.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
32
+ {"timestamp": 1773783280.9439437, "event": "eval_step", "step": 250, "epoch": 2, "metrics": {"eval/loss": 5.170890462669458, "eval/duration_sec": 14.642520225141197}}
33
+ {"timestamp": 1773783318.7159903, "event": "train_step", "step": 260, "epoch": 2, "metrics": {"train/step_loss": 5.102391600608826, "train/step_real_loss": 5.102391600608826, "train/lr": 3.33379176277258e-05, "perf/step_duration_sec": 3.9554404942318797, "perf/samples_per_sec": 8.090122970289858, "perf/tokens_per_sec": 6473.109641603173, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 25604.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
34
+ {"timestamp": 1773783356.4853346, "event": "train_step", "step": 270, "epoch": 2, "metrics": {"train/step_loss": 5.203338623046875, "train/step_real_loss": 5.203338623046875, "train/lr": 2.962666050951997e-05, "perf/step_duration_sec": 3.9644229151308537, "perf/samples_per_sec": 8.071792713604516, "perf/tokens_per_sec": 7155.896484132703, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 28369.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
35
+ {"timestamp": 1773783394.5975869, "event": "train_step", "step": 280, "epoch": 2, "metrics": {"train/step_loss": 4.966692328453064, "train/step_real_loss": 4.966692328453064, "train/lr": 2.604552384991855e-05, "perf/step_duration_sec": 3.573012210894376, "perf/samples_per_sec": 8.956028726246625, "perf/tokens_per_sec": 7867.31148421227, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 28110.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
36
+ {"timestamp": 1773783432.839679, "event": "train_step", "step": 290, "epoch": 2, "metrics": {"train/step_loss": 5.069774425390995, "train/step_real_loss": 4.921564221382141, "train/lr": 2.2617379654990623e-05, "train/step_canary_loss": 9.8125, "perf/step_duration_sec": 3.8366584139876068, "perf/samples_per_sec": 8.601234834899378, "perf/tokens_per_sec": 7269.607296369047, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 27891.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
37
+ {"timestamp": 1773783471.2908254, "event": "train_step", "step": 300, "epoch": 2, "metrics": {"train/step_loss": 5.246372468543775, "train/step_real_loss": 5.078290581703186, "train/lr": 1.936412279842705e-05, "train/step_canary_loss": 10.625, "perf/step_duration_sec": 4.068281149957329, "perf/samples_per_sec": 8.111533786288621, "perf/tokens_per_sec": 6243.177170846809, "perf/logical_batch_size": 33.0, "perf/logical_token_count": 25399.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
38
+ {"timestamp": 1773783485.8945208, "event": "eval_step", "step": 300, "epoch": 2, "metrics": {"eval/loss": 4.965634505857121, "eval/duration_sec": 14.601576885208488}}
39
+ {"timestamp": 1773783524.0127125, "event": "train_step", "step": 310, "epoch": 2, "metrics": {"train/step_loss": 5.0469924211502075, "train/step_real_loss": 5.0469924211502075, "train/lr": 1.6306531183346385e-05, "perf/step_duration_sec": 3.7034485950134695, "perf/samples_per_sec": 8.64059515854671, "perf/tokens_per_sec": 6898.975196902139, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 25550.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
40
+ {"timestamp": 1773783561.666821, "event": "train_step", "step": 320, "epoch": 2, "metrics": {"train/step_loss": 5.018897533416748, "train/step_real_loss": 5.018897533416748, "train/lr": 1.3464133037968912e-05, "perf/step_duration_sec": 3.5697252051904798, "perf/samples_per_sec": 8.964275444359446, "perf/tokens_per_sec": 7121.5565733283065, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 25422.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
41
+ {"timestamp": 1773783600.0391417, "event": "train_step", "step": 330, "epoch": 2, "metrics": {"train/step_loss": 5.0458667278289795, "train/step_real_loss": 5.0458667278289795, "train/lr": 1.0855082192715294e-05, "perf/step_duration_sec": 3.95492945285514, "perf/samples_per_sec": 8.091168346099975, "perf/tokens_per_sec": 6913.6505027316125, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27343.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
42
+ {"timestamp": 1773783637.8100953, "event": "train_step", "step": 340, "epoch": 2, "metrics": {"train/step_loss": 5.095871806144714, "train/step_real_loss": 5.095871806144714, "train/lr": 8.49604213531004e-06, "perf/step_duration_sec": 3.57716670492664, "perf/samples_per_sec": 8.945627263031415, "perf/tokens_per_sec": 7446.396043917744, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26637.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
43
+ {"timestamp": 1773783676.205935, "event": "train_step", "step": 350, "epoch": 2, "metrics": {"train/step_loss": 5.241442084312439, "train/step_real_loss": 5.241442084312439, "train/lr": 6.402079584406673e-06, "perf/step_duration_sec": 3.8292608251795173, "perf/samples_per_sec": 8.356704194601273, "perf/tokens_per_sec": 5819.13873650938, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 22283.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
44
+ {"timestamp": 1773783690.815634, "event": "eval_step", "step": 350, "epoch": 2, "metrics": {"eval/loss": 4.8690267882563845, "eval/duration_sec": 14.607622059062123}}
45
+ {"timestamp": 1773783728.3614442, "event": "train_step", "step": 360, "epoch": 2, "metrics": {"train/step_loss": 4.913779258728027, "train/step_real_loss": 4.913779258728027, "train/lr": 4.586568261458729e-06, "perf/step_duration_sec": 3.8354132031090558, "perf/samples_per_sec": 8.343299223682136, "perf/tokens_per_sec": 7633.075877266194, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 29276.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
46
+ {"timestamp": 1773783766.5169628, "event": "train_step", "step": 370, "epoch": 2, "metrics": {"train/step_loss": 5.0288437604904175, "train/step_real_loss": 5.0288437604904175, "train/lr": 3.06110347542643e-06, "perf/step_duration_sec": 3.820353894960135, "perf/samples_per_sec": 8.376187358510125, "perf/tokens_per_sec": 7220.011747180773, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27583.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
47
+ {"timestamp": 1773783804.5456383, "event": "train_step", "step": 380, "epoch": 2, "metrics": {"train/step_loss": 4.826860070228577, "train/step_real_loss": 4.826860070228577, "train/lr": 1.8354280658494649e-06, "perf/step_duration_sec": 3.825985827948898, "perf/samples_per_sec": 8.363857431525073, "perf/tokens_per_sec": 7291.454086476718, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27897.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
48
+ {"timestamp": 1773783843.007424, "event": "train_step", "step": 390, "epoch": 2, "metrics": {"train/step_loss": 4.915419220924377, "train/step_real_loss": 4.915419220924377, "train/lr": 9.17370177272775e-07, "perf/step_duration_sec": 3.7134576980024576, "perf/samples_per_sec": 8.617305649452646, "perf/tokens_per_sec": 7082.61737144544, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 26301.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
49
+ {"timestamp": 1773783881.5532339, "event": "train_step", "step": 400, "epoch": 2, "metrics": {"train/step_loss": 4.916181445121765, "train/step_real_loss": 4.916181445121765, "train/lr": 3.127932624475638e-07, "perf/step_duration_sec": 3.831934977322817, "perf/samples_per_sec": 8.350872389373583, "perf/tokens_per_sec": 7147.302906255116, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 27388.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
50
+ {"timestamp": 1773783896.2133112, "event": "eval_step", "step": 400, "epoch": 2, "metrics": {"eval/loss": 4.840054991570385, "eval/duration_sec": 14.658054957631975}}
51
+ {"timestamp": 1773783934.3512156, "event": "train_step", "step": 410, "epoch": 2, "metrics": {"train/step_loss": 4.948451399803162, "train/step_real_loss": 4.948451399803162, "train/lr": 2.5558633627303928e-08, "perf/step_duration_sec": 3.8317640791647136, "perf/samples_per_sec": 8.35124484150801, "perf/tokens_per_sec": 6492.570911469883, "perf/logical_batch_size": 32.0, "perf/logical_token_count": 24878.0, "perf/gradient_accumulation_steps": 4.0, "system/cuda_memory_allocated_gb": 15.10503625869751, "system/cuda_max_memory_allocated_gb": 84.09655332565308}}
52
+ {"timestamp": 1773783964.370691, "event": "train_epoch", "step": 414, "epoch": 2, "metrics": {"train/epoch_loss": 5.128793139947108, "train/epoch_real_loss": 5.082210323228928, "train/epoch_canary_loss": 10.210301459293396, "perf/epoch_duration_sec": 848.1400936129503, "perf/epoch_samples_per_sec": 63.06976925490229, "perf/epoch_tokens_per_sec": 51703.841535419684, "perf/epoch_samples": 53492.0, "perf/epoch_tokens": 43852101.0, "system/cuda_epoch_peak_memory_gb": 84.09655332565308, "eval/loss": 4.839725652878935, "eval/duration_sec": 14.596055098809302}}
53
+ {"timestamp": 1773783972.8350441, "event": "audit_epoch", "step": 414, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.957184, "audit/loss/empirical_epsilon/0.05": 3.4791953936219215, "audit/loss/empirical_epsilon/0.01": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 100.0, "audit/embedding/auc": 0.968208, "audit/embedding/empirical_epsilon/0.05": 3.4791953936219215, "audit/embedding/empirical_epsilon/0.01": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 100.0, "perf/audit_duration_sec": 5.8983861412853}}
54
+ {"timestamp": 1773783981.1943111, "event": "audit_final", "step": 414, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.957184, "audit/loss/empirical_epsilon/0.05": 3.4791953936219215, "audit/loss/empirical_epsilon/0.01": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 100.0, "audit/embedding/auc": 0.968208, "audit/embedding/empirical_epsilon/0.05": 3.4791953936219215, "audit/embedding/empirical_epsilon/0.01": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 3.023197554051876, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 100.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 100.0}}
55
+ {"timestamp": 1773783981.7373376, "event": "energy_final", "step": 414, "epoch": null, "metrics": {"energy/codecarbon/duration": 1810.5507336058654, "energy/codecarbon/emissions": 0.09709380205154217, "energy/codecarbon/emissions_rate": 5.362666742741399e-05, "energy/codecarbon/cpu_power": 72.0230906963752, "energy/codecarbon/gpu_power": 4629.388481318127, "energy/codecarbon/ram_power": 54.0, "energy/codecarbon/cpu_energy": 0.03488049743955748, "energy/codecarbon/gpu_energy": 2.3248590518302024, "energy/codecarbon/ram_energy": 0.026150659639004155, "energy/codecarbon/energy_consumed": 2.3858902089087644, "energy/codecarbon/water_consumed": 0.0, "energy/codecarbon/cpu_count": 256.0, "energy/codecarbon/gpu_count": 8.0, "energy/codecarbon/longitude": 16.1885, "energy/codecarbon/latitude": 58.594, "energy/codecarbon/ram_total_size": 1511.49019241333, "energy/codecarbon/cpu_utilization_percent": 3.485983379501395, "energy/codecarbon/gpu_utilization_percent": 91.87222991689751, "energy/codecarbon/ram_utilization_percent": 5.226869806094248, "energy/codecarbon/ram_used_gb": 78.96254361435317, "energy/codecarbon/pue": 1.0, "energy/codecarbon/wue": 0.0}}
deepseek-coder-6.7b/base/resolved_config.yaml ADDED
@@ -0,0 +1,100 @@
1
+ model:
2
+ name: deepseek-ai/deepseek-coder-6.7b-instruct
3
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
4
+ max_length: 1024
5
+ dtype: bfloat16
6
+ trust_remote_code: true
7
+ use_fast_tokenizer: true
8
+ cache_dir: null
9
+ local_files_only: false
10
+ low_cpu_mem_usage: true
11
+ tie_word_embeddings: true
12
+ gradient_checkpointing: false
13
+ use_chat_template: false
14
+ dataset:
15
+ name: melihcatal/codedp-cpt
16
+ split: train
17
+ mode: cpt
18
+ text_column: text
19
+ validation_ratio: 0.05
20
+ max_samples: -1
21
+ lora:
22
+ enabled: true
23
+ r: 16
24
+ alpha: 32
25
+ dropout: 0.05
26
+ target_modules:
27
+ - q_proj
28
+ - k_proj
29
+ - v_proj
30
+ - o_proj
31
+ modules_to_save:
32
+ - lm_head
33
+ bias: none
34
+ training:
35
+ seed: 42
36
+ epochs: 2
37
+ warmup_steps: null
38
+ warmup_ratio: 0.05
39
+ mixed_precision: false
40
+ mixed_precision_dtype: bfloat16
41
+ batch_size: 8
42
+ eval_batch_size: 8
43
+ eval_every_steps: 50
44
+ eval_every_epochs: 1
45
+ learning_rate: 0.0001
46
+ optimizer: adamw
47
+ lr_scheduler: cosine
48
+ adam_beta1: 0.9
49
+ adam_beta2: 0.999
50
+ adam_epsilon: 1.0e-08
51
+ sgd_momentum: 0.9
52
+ weight_decay: 0.01
53
+ max_grad_norm: 1.0
54
+ log_every: 10
55
+ gradient_accumulation_steps: 4
56
+ num_workers: 4
57
+ output_dir: runs/cpt/deepseek-coder-6.7b/base
58
+ distributed:
59
+ strategy: dpddp
60
+ backend: nccl
61
+ devices: null
62
+ dp:
63
+ module_validator: auto
64
+ target_delta: 1.0e-05
65
+ noise_multiplier: null
66
+ max_grad_norm: 1.0
67
+ grad_sample_mode: ghost
68
+ secure_mode: false
69
+ enabled: false
70
+ target_epsilon: 8.0
71
+ audit:
72
+ enabled: true
73
+ run_every_epoch: true
74
+ epoch_device: cuda
75
+ q_canary: auto
76
+ num_canaries: 500
77
+ prefix_length: 49
78
+ num_digits: 12
79
+ batch_size: 32
80
+ delta: 1.0e-05
81
+ p_values:
82
+ - 0.05
83
+ - 0.01
84
+ paper_guess_fraction: 0.2
85
+ paper_guess_steps: 20
86
+ enable_holdout_empirical_epsilon: false
87
+ holdout_seed: 42
88
+ tie_seed: 42
89
+ tracking:
90
+ enabled: true
91
+ tensorboard: true
92
+ wandb: false
93
+ wandb_project: codedp-finetune-h200-audit
94
+ wandb_run_name: deepseek-coder-6.7b-cpt-base
95
+ wandb_mode: online
96
+ codecarbon: true
97
+ codecarbon_output_file: codecarbon.csv
98
+ codecarbon_measure_power_secs: 15
99
+ codecarbon_country_iso_code: null
100
+ codecarbon_project_name: codedp-deepseek-coder-6.7b-cpt-base
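
A small sketch, assuming PyYAML is installed, showing how the `resolved_config.yaml` above could be loaded; the printed values simply echo what the file records for this non-DP "base" run.

```python
# Sketch: load the resolved training config (PyYAML assumed to be available).
import yaml

with open("deepseek-coder-6.7b/base/resolved_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["name"])                    # deepseek-ai/deepseek-coder-6.7b-instruct
print(cfg["lora"]["r"], cfg["lora"]["alpha"])  # 16 32
print(cfg["dp"]["enabled"])                    # False for this base (non-DP) run
```
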
deepseek-coder-6.7b/base/scalars.csv ADDED
@@ -0,0 +1,591 @@
1
+ timestamp,event,step,epoch,key,value
2
+ 1773782272.0493639,train_step,10,1,train/step_loss,14.555815444273108
3
+ 1773782272.0493639,train_step,10,1,train/step_real_loss,14.770240783691406
4
+ 1773782272.0493639,train_step,10,1,train/lr,4.761904761904762e-05
5
+ 1773782272.0493639,train_step,10,1,train/step_canary_loss,11.125
6
+ 1773782272.0493639,train_step,10,1,perf/step_duration_sec,4.0931806759908795
7
+ 1773782272.0493639,train_step,10,1,perf/samples_per_sec,8.30649870879918
8
+ 1773782272.0493639,train_step,10,1,perf/tokens_per_sec,5843.6218416402235
9
+ 1773782272.0493639,train_step,10,1,perf/logical_batch_size,34.0
10
+ 1773782272.0493639,train_step,10,1,perf/logical_token_count,23919.0
11
+ 1773782272.0493639,train_step,10,1,perf/gradient_accumulation_steps,4.0
12
+ 1773782272.0493639,train_step,10,1,system/cuda_memory_allocated_gb,15.10503625869751
13
+ 1773782272.0493639,train_step,10,1,system/cuda_max_memory_allocated_gb,84.1168704032898
14
+ 1773782309.7872338,train_step,20,1,train/step_loss,12.852383613586426
15
+ 1773782309.7872338,train_step,20,1,train/step_real_loss,12.852383613586426
16
+ 1773782309.7872338,train_step,20,1,train/lr,9.523809523809524e-05
17
+ 1773782309.7872338,train_step,20,1,perf/step_duration_sec,3.573033411987126
18
+ 1773782309.7872338,train_step,20,1,perf/samples_per_sec,8.955975584399406
19
+ 1773782309.7872338,train_step,20,1,perf/tokens_per_sec,7755.035233378848
20
+ 1773782309.7872338,train_step,20,1,perf/logical_batch_size,32.0
21
+ 1773782309.7872338,train_step,20,1,perf/logical_token_count,27709.0
22
+ 1773782309.7872338,train_step,20,1,perf/gradient_accumulation_steps,4.0
23
+ 1773782309.7872338,train_step,20,1,system/cuda_memory_allocated_gb,15.10503625869751
24
+ 1773782309.7872338,train_step,20,1,system/cuda_max_memory_allocated_gb,84.1168704032898
25
+ 1773782347.6002963,train_step,30,1,train/step_loss,9.603991746902466
26
+ 1773782347.6002963,train_step,30,1,train/step_real_loss,9.603991746902466
27
+ 1773782347.6002963,train_step,30,1,train/lr,9.98706541985615e-05
28
+ 1773782347.6002963,train_step,30,1,perf/step_duration_sec,3.6900156908668578
29
+ 1773782347.6002963,train_step,30,1,perf/samples_per_sec,8.672049844992005
30
+ 1773782347.6002963,train_step,30,1,perf/tokens_per_sec,7146.58207694638
31
+ 1773782347.6002963,train_step,30,1,perf/logical_batch_size,32.0
32
+ 1773782347.6002963,train_step,30,1,perf/logical_token_count,26371.0
33
+ 1773782347.6002963,train_step,30,1,perf/gradient_accumulation_steps,4.0
34
+ 1773782347.6002963,train_step,30,1,system/cuda_memory_allocated_gb,15.10503625869751
35
+ 1773782347.6002963,train_step,30,1,system/cuda_max_memory_allocated_gb,84.1168704032898
36
+ 1773782386.0648503,train_step,40,1,train/step_loss,7.74934458732605
37
+ 1773782386.0648503,train_step,40,1,train/step_real_loss,7.74934458732605
38
+ 1773782386.0648503,train_step,40,1,train/lr,9.942439201095397e-05
39
+ 1773782386.0648503,train_step,40,1,perf/step_duration_sec,4.0733352322131395
40
+ 1773782386.0648503,train_step,40,1,perf/samples_per_sec,7.855970151176003
41
+ 1773782386.0648503,train_step,40,1,perf/tokens_per_sec,7048.523719075445
42
+ 1773782386.0648503,train_step,40,1,perf/logical_batch_size,32.0
43
+ 1773782386.0648503,train_step,40,1,perf/logical_token_count,28711.0
44
+ 1773782386.0648503,train_step,40,1,perf/gradient_accumulation_steps,4.0
45
+ 1773782386.0648503,train_step,40,1,system/cuda_memory_allocated_gb,15.10503625869751
46
+ 1773782386.0648503,train_step,40,1,system/cuda_max_memory_allocated_gb,84.1168704032898
47
+ 1773782423.7351854,train_step,50,1,train/step_loss,7.176188588142395
48
+ 1773782423.7351854,train_step,50,1,train/step_real_loss,7.176188588142395
49
+ 1773782423.7351854,train_step,50,1,train/lr,9.866246608261724e-05
50
+ 1773782423.7351854,train_step,50,1,perf/step_duration_sec,3.6839785198681056
51
+ 1773782423.7351854,train_step,50,1,perf/samples_per_sec,8.686261287198187
52
+ 1773782423.7351854,train_step,50,1,perf/tokens_per_sec,7128.163168807011
53
+ 1773782423.7351854,train_step,50,1,perf/logical_batch_size,32.0
54
+ 1773782423.7351854,train_step,50,1,perf/logical_token_count,26260.0
55
+ 1773782423.7351854,train_step,50,1,perf/gradient_accumulation_steps,4.0
56
+ 1773782423.7351854,train_step,50,1,system/cuda_memory_allocated_gb,15.10503625869751
57
+ 1773782423.7351854,train_step,50,1,system/cuda_max_memory_allocated_gb,84.1168704032898
58
+ 1773782438.1861553,eval_step,50,1,eval/loss,6.95498745685274
59
+ 1773782438.1861553,eval_step,50,1,eval/duration_sec,14.44864141382277
60
+ 1773782475.968484,train_step,60,1,train/step_loss,6.844001889228821
61
+ 1773782475.968484,train_step,60,1,train/step_real_loss,6.844001889228821
62
+ 1773782475.968484,train_step,60,1,train/lr,9.7589742682592e-05
63
+ 1773782475.968484,train_step,60,1,perf/step_duration_sec,3.950888440012932
64
+ 1773782475.968484,train_step,60,1,perf/samples_per_sec,8.099444083492081
65
+ 1773782475.968484,train_step,60,1,perf/tokens_per_sec,7022.471128015244
66
+ 1773782475.968484,train_step,60,1,perf/logical_batch_size,32.0
67
+ 1773782475.968484,train_step,60,1,perf/logical_token_count,27745.0
68
+ 1773782475.968484,train_step,60,1,perf/gradient_accumulation_steps,4.0
69
+ 1773782475.968484,train_step,60,1,system/cuda_memory_allocated_gb,15.10503625869751
70
+ 1773782475.968484,train_step,60,1,system/cuda_max_memory_allocated_gb,84.1198000907898
71
+ 1773782514.3752964,train_step,70,1,train/step_loss,6.707658648490906
72
+ 1773782514.3752964,train_step,70,1,train/step_real_loss,6.707658648490906
73
+ 1773782514.3752964,train_step,70,1,train/lr,9.621307308142384e-05
74
+ 1773782514.3752964,train_step,70,1,perf/step_duration_sec,4.038279882632196
75
+ 1773782514.3752964,train_step,70,1,perf/samples_per_sec,7.924165964232782
76
+ 1773782514.3752964,train_step,70,1,perf/tokens_per_sec,6047.624411827906
77
+ 1773782514.3752964,train_step,70,1,perf/logical_batch_size,32.0
78
+ 1773782514.3752964,train_step,70,1,perf/logical_token_count,24422.0
79
+ 1773782514.3752964,train_step,70,1,perf/gradient_accumulation_steps,4.0
80
+ 1773782514.3752964,train_step,70,1,system/cuda_memory_allocated_gb,15.10503625869751
81
+ 1773782514.3752964,train_step,70,1,system/cuda_max_memory_allocated_gb,84.1198000907898
82
+ 1773782552.1368628,train_step,80,1,train/step_loss,7.028713240342982
83
+ 1773782552.1368628,train_step,80,1,train/step_real_loss,6.692617297172546
84
+ 1773782552.1368628,train_step,80,1,train/lr,9.454124979346391e-05
85
+ 1773782552.1368628,train_step,80,1,train/step_canary_loss,12.40625
86
+ 1773782552.1368628,train_step,80,1,perf/step_duration_sec,3.830359708983451
87
+ 1773782552.1368628,train_step,80,1,perf/samples_per_sec,8.876450929728307
88
+ 1773782552.1368628,train_step,80,1,perf/tokens_per_sec,6924.153869360416
89
+ 1773782552.1368628,train_step,80,1,perf/logical_batch_size,34.0
90
+ 1773782552.1368628,train_step,80,1,perf/logical_token_count,26522.0
91
+ 1773782552.1368628,train_step,80,1,perf/gradient_accumulation_steps,4.0
92
+ 1773782552.1368628,train_step,80,1,system/cuda_memory_allocated_gb,15.167274475097656
93
+ 1773782552.1368628,train_step,80,1,system/cuda_max_memory_allocated_gb,84.1198000907898
94
+ 1773782590.0828424,train_step,90,1,train/step_loss,6.5469324588775635
95
+ 1773782590.0828424,train_step,90,1,train/step_real_loss,6.5469324588775635
96
+ 1773782590.0828424,train_step,90,1,train/lr,9.258495042083221e-05
97
+ 1773782590.0828424,train_step,90,1,perf/step_duration_sec,3.57225156808272
98
+ 1773782590.0828424,train_step,90,1,perf/samples_per_sec,8.957935741679826
99
+ 1773782590.0828424,train_step,90,1,perf/tokens_per_sec,7531.66441030924
100
+ 1773782590.0828424,train_step,90,1,perf/logical_batch_size,32.0
101
+ 1773782590.0828424,train_step,90,1,perf/logical_token_count,26905.0
102
+ 1773782590.0828424,train_step,90,1,perf/gradient_accumulation_steps,4.0
103
+ 1773782590.0828424,train_step,90,1,system/cuda_memory_allocated_gb,15.10503625869751
104
+ 1773782590.0828424,train_step,90,1,system/cuda_max_memory_allocated_gb,84.1198000907898
105
+ 1773782627.521805,train_step,100,1,train/step_loss,6.427393674850464
106
+ 1773782627.521805,train_step,100,1,train/step_real_loss,6.427393674850464
107
+ 1773782627.521805,train_step,100,1,train/lr,9.035666945770107e-05
108
+ 1773782627.521805,train_step,100,1,perf/step_duration_sec,3.5687404102645814
109
+ 1773782627.521805,train_step,100,1,perf/samples_per_sec,8.966749138704534
110
+ 1773782627.521805,train_step,100,1,perf/tokens_per_sec,8349.164291776258
111
+ 1773782627.521805,train_step,100,1,perf/logical_batch_size,32.0
112
+ 1773782627.521805,train_step,100,1,perf/logical_token_count,29796.0
113
+ 1773782627.521805,train_step,100,1,perf/gradient_accumulation_steps,4.0
114
+ 1773782627.521805,train_step,100,1,system/cuda_memory_allocated_gb,15.10503625869751
115
+ 1773782627.521805,train_step,100,1,system/cuda_max_memory_allocated_gb,84.1198000907898
116
+ 1773782642.0074606,eval_step,100,1,eval/loss,6.393013093959201
117
+ 1773782642.0074606,eval_step,100,1,eval/duration_sec,14.483735092915595
118
+ 1773782680.1220238,train_step,110,1,train/step_loss,6.492933461160371
119
+ 1773782680.1220238,train_step,110,1,train/step_real_loss,6.355993866920471
120
+ 1773782680.1220238,train_step,110,1,train/lr,8.787063849045118e-05
121
+ 1773782680.1220238,train_step,110,1,train/step_canary_loss,10.875
122
+ 1773782680.1220238,train_step,110,1,perf/step_duration_sec,3.834493800997734
123
+ 1773782680.1220238,train_step,110,1,perf/samples_per_sec,8.606090324468228
124
+ 1773782680.1220238,train_step,110,1,perf/tokens_per_sec,7034.044491865366
125
+ 1773782680.1220238,train_step,110,1,perf/logical_batch_size,33.0
126
+ 1773782680.1220238,train_step,110,1,perf/logical_token_count,26972.0
127
+ 1773782680.1220238,train_step,110,1,perf/gradient_accumulation_steps,4.0
128
+ 1773782680.1220238,train_step,110,1,system/cuda_memory_allocated_gb,15.10503625869751
129
+ 1773782680.1220238,train_step,110,1,system/cuda_max_memory_allocated_gb,84.1198000907898
130
+ 1773782717.354572,train_step,120,1,train/step_loss,6.159674048423767
131
+ 1773782717.354572,train_step,120,1,train/step_real_loss,6.159674048423767
132
+ 1773782717.354572,train_step,120,1,train/lr,8.5142735303366e-05
133
+ 1773782717.354572,train_step,120,1,perf/step_duration_sec,3.8241200419142842
134
+ 1773782717.354572,train_step,120,1,perf/samples_per_sec,8.367938152898931
135
+ 1773782717.354572,train_step,120,1,perf/tokens_per_sec,7346.00370597615
136
+ 1773782717.354572,train_step,120,1,perf/logical_batch_size,32.0
137
+ 1773782717.354572,train_step,120,1,perf/logical_token_count,28092.0
138
+ 1773782717.354572,train_step,120,1,perf/gradient_accumulation_steps,4.0
139
+ 1773782717.354572,train_step,120,1,system/cuda_memory_allocated_gb,15.10503625869751
140
+ 1773782717.354572,train_step,120,1,system/cuda_max_memory_allocated_gb,84.1198000907898
141
+ 1773782755.7000217,train_step,130,1,train/step_loss,6.002715587615967
142
+ 1773782755.7000217,train_step,130,1,train/step_real_loss,6.002715587615967
143
+ 1773782755.7000217,train_step,130,1,train/lr,8.219038247038819e-05
144
+ 1773782755.7000217,train_step,130,1,perf/step_duration_sec,3.574957752134651
145
+ 1773782755.7000217,train_step,130,1,perf/samples_per_sec,8.951154732078276
146
+ 1773782755.7000217,train_step,130,1,perf/tokens_per_sec,7391.13629642826
147
+ 1773782755.7000217,train_step,130,1,perf/logical_batch_size,32.0
148
+ 1773782755.7000217,train_step,130,1,perf/logical_token_count,26423.0
149
+ 1773782755.7000217,train_step,130,1,perf/gradient_accumulation_steps,4.0
150
+ 1773782755.7000217,train_step,130,1,system/cuda_memory_allocated_gb,15.10503625869751
151
+ 1773782755.7000217,train_step,130,1,system/cuda_max_memory_allocated_gb,90.77418279647827
152
+ 1773782794.0098693,train_step,140,1,train/step_loss,5.99733579158783
153
+ 1773782794.0098693,train_step,140,1,train/step_real_loss,5.99733579158783
154
+ 1773782794.0098693,train_step,140,1,train/lr,7.903243608061246e-05
155
+ 1773782794.0098693,train_step,140,1,perf/step_duration_sec,3.9556330870836973
156
+ 1773782794.0098693,train_step,140,1,perf/samples_per_sec,8.089729076361857
157
+ 1773782794.0098693,train_step,140,1,perf/tokens_per_sec,7194.044385188417
158
+ 1773782794.0098693,train_step,140,1,perf/logical_batch_size,32.0
159
+ 1773782794.0098693,train_step,140,1,perf/logical_token_count,28457.0
160
+ 1773782794.0098693,train_step,140,1,perf/gradient_accumulation_steps,4.0
161
+ 1773782794.0098693,train_step,140,1,system/cuda_memory_allocated_gb,15.10503625869751
162
+ 1773782794.0098693,train_step,140,1,system/cuda_max_memory_allocated_gb,90.77418279647827
163
+ 1773782832.684074,train_step,150,1,train/step_loss,6.004466891288757
164
+ 1773782832.684074,train_step,150,1,train/step_real_loss,6.004466891288757
165
+ 1773782832.684074,train_step,150,1,train/lr,7.568906530820282e-05
166
+ 1773782832.684074,train_step,150,1,perf/step_duration_sec,3.8291429793462157
167
+ 1773782832.684074,train_step,150,1,perf/samples_per_sec,8.356961380811027
168
+ 1773782832.684074,train_step,150,1,perf/tokens_per_sec,6940.456426763558
169
+ 1773782832.684074,train_step,150,1,perf/logical_batch_size,32.0
170
+ 1773782832.684074,train_step,150,1,perf/logical_token_count,26576.0
171
+ 1773782832.684074,train_step,150,1,perf/gradient_accumulation_steps,4.0
172
+ 1773782832.684074,train_step,150,1,system/cuda_memory_allocated_gb,15.10503625869751
173
+ 1773782832.684074,train_step,150,1,system/cuda_max_memory_allocated_gb,90.77418279647827
174
+ 1773782847.266101,eval_step,150,1,eval/loss,5.90363540568135
175
+ 1773782847.266101,eval_step,150,1,eval/duration_sec,14.580172989983112
176
+ 1773782884.7874682,train_step,160,1,train/step_loss,5.9455406665802
177
+ 1773782884.7874682,train_step,160,1,train/step_real_loss,5.9455406665802
178
+ 1773782884.7874682,train_step,160,1,train/lr,7.21816235958972e-05
179
+ 1773782884.7874682,train_step,160,1,perf/step_duration_sec,3.6964078596793115
180
+ 1773782884.7874682,train_step,160,1,perf/samples_per_sec,8.65705333793339
181
+ 1773782884.7874682,train_step,160,1,perf/tokens_per_sec,6998.416024968714
182
+ 1773782884.7874682,train_step,160,1,perf/logical_batch_size,32.0
183
+ 1773782884.7874682,train_step,160,1,perf/logical_token_count,25869.0
184
+ 1773782884.7874682,train_step,160,1,perf/gradient_accumulation_steps,4.0
185
+ 1773782884.7874682,train_step,160,1,system/cuda_memory_allocated_gb,15.10503625869751
186
+ 1773782884.7874682,train_step,160,1,system/cuda_max_memory_allocated_gb,90.77418279647827
187
+ 1773782922.1683276,train_step,170,1,train/step_loss,5.978999730312463
188
+ 1773782922.1683276,train_step,170,1,train/step_real_loss,5.847484111785889
189
+ 1773782922.1683276,train_step,170,1,train/lr,6.853251227482479e-05
190
+ 1773782922.1683276,train_step,170,1,train/step_canary_loss,10.1875
191
+ 1773782922.1683276,train_step,170,1,perf/step_duration_sec,3.8291215640492737
192
+ 1773782922.1683276,train_step,170,1,perf/samples_per_sec,8.618164622880943
193
+ 1773782922.1683276,train_step,170,1,perf/tokens_per_sec,7364.352248503628
194
+ 1773782922.1683276,train_step,170,1,perf/logical_batch_size,33.0
195
+ 1773782922.1683276,train_step,170,1,perf/logical_token_count,28199.0
196
+ 1773782922.1683276,train_step,170,1,perf/gradient_accumulation_steps,4.0
197
+ 1773782922.1683276,train_step,170,1,system/cuda_memory_allocated_gb,15.167274475097656
198
+ 1773782922.1683276,train_step,170,1,system/cuda_max_memory_allocated_gb,90.77418279647827
199
+ 1773782960.5932062,train_step,180,1,train/step_loss,5.780862244692716
200
+ 1773782960.5932062,train_step,180,1,train/step_real_loss,5.639248490333557
201
+ 1773782960.5932062,train_step,180,1,train/lr,6.476503749166904e-05
202
+ 1773782960.5932062,train_step,180,1,train/step_canary_loss,10.3125
203
+ 1773782960.5932062,train_step,180,1,perf/step_duration_sec,3.831100419629365
204
+ 1773782960.5932062,train_step,180,1,perf/samples_per_sec,8.61371313341678
205
+ 1773782960.5932062,train_step,180,1,perf/tokens_per_sec,7198.453963435392
206
+ 1773782960.5932062,train_step,180,1,perf/logical_batch_size,33.0
207
+ 1773782960.5932062,train_step,180,1,perf/logical_token_count,27578.0
208
+ 1773782960.5932062,train_step,180,1,perf/gradient_accumulation_steps,4.0
209
+ 1773782960.5932062,train_step,180,1,system/cuda_memory_allocated_gb,15.10503625869751
210
+ 1773782960.5932062,train_step,180,1,system/cuda_max_memory_allocated_gb,90.77418279647827
211
+ 1773782999.2272496,train_step,190,1,train/step_loss,5.710304021835327
212
+ 1773782999.2272496,train_step,190,1,train/step_real_loss,5.710304021835327
213
+ 1773782999.2272496,train_step,190,1,train/lr,6.090326135695403e-05
214
+ 1773782999.2272496,train_step,190,1,perf/step_duration_sec,3.831862016580999
215
+ 1773782999.2272496,train_step,190,1,perf/samples_per_sec,8.351031394536536
216
+ 1773782999.2272496,train_step,190,1,perf/tokens_per_sec,6408.111746651394
217
+ 1773782999.2272496,train_step,190,1,perf/logical_batch_size,32.0
218
+ 1773782999.2272496,train_step,190,1,perf/logical_token_count,24555.0
219
+ 1773782999.2272496,train_step,190,1,perf/gradient_accumulation_steps,4.0
220
+ 1773782999.2272496,train_step,190,1,system/cuda_memory_allocated_gb,15.10503625869751
221
+ 1773782999.2272496,train_step,190,1,system/cuda_max_memory_allocated_gb,90.77418279647827
222
+ 1773783037.1072268,train_step,200,1,train/step_loss,5.810285813880689
223
+ 1773783037.1072268,train_step,200,1,train/step_real_loss,5.673497915267944
224
+ 1773783037.1072268,train_step,200,1,train/lr,5.697184826514057e-05
225
+ 1773783037.1072268,train_step,200,1,train/step_canary_loss,10.1875
226
+ 1773783037.1072268,train_step,200,1,perf/step_duration_sec,3.822298335842788
227
+ 1773783037.1072268,train_step,200,1,perf/samples_per_sec,8.633549006509913
228
+ 1773783037.1072268,train_step,200,1,perf/tokens_per_sec,5939.881716478821
229
+ 1773783037.1072268,train_step,200,1,perf/logical_batch_size,33.0
230
+ 1773783037.1072268,train_step,200,1,perf/logical_token_count,22704.0
231
+ 1773783037.1072268,train_step,200,1,perf/gradient_accumulation_steps,4.0
232
+ 1773783037.1072268,train_step,200,1,system/cuda_memory_allocated_gb,15.10503625869751
233
+ 1773783037.1072268,train_step,200,1,system/cuda_max_memory_allocated_gb,90.77418279647827
234
+ 1773783051.6295593,eval_step,200,1,eval/loss,5.488488009030169
235
+ 1773783051.6295593,eval_step,200,1,eval/duration_sec,14.520337663125247
236
+ 1773783092.753959,train_epoch,207,1,train/epoch_loss,7.408701919082488
237
+ 1773783092.753959,train_epoch,207,1,train/epoch_real_loss,7.374264005589601
238
+ 1773783092.753959,train_epoch,207,1,train/epoch_canary_loss,11.38703087321013
239
+ 1773783092.753959,train_epoch,207,1,perf/epoch_duration_sec,846.9085897100158
240
+ 1773783092.753959,train_epoch,207,1,perf/epoch_samples_per_sec,63.12959940376397
241
+ 1773783092.753959,train_epoch,207,1,perf/epoch_tokens_per_sec,51777.628108618774
242
+ 1773783092.753959,train_epoch,207,1,perf/epoch_samples,53465.0
243
+ 1773783092.753959,train_epoch,207,1,perf/epoch_tokens,43850918.0
244
+ 1773783092.753959,train_epoch,207,1,system/cuda_epoch_peak_memory_gb,90.77418279647827
245
+ 1773783092.753959,train_epoch,207,1,eval/loss,5.447803701866757
246
+ 1773783092.753959,train_epoch,207,1,eval/duration_sec,14.591437232214957
247
+ 1773783101.5745099,audit_epoch,207,1,audit/delta,1e-05
248
+ 1773783101.5745099,audit_epoch,207,1,audit/num_canaries,500.0
249
+ 1773783101.5745099,audit_epoch,207,1,audit/num_members,250.0
250
+ 1773783101.5745099,audit_epoch,207,1,audit/paper_guess_fraction,0.2
251
+ 1773783101.5745099,audit_epoch,207,1,audit/paper_guess_steps,20.0
252
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/auc,0.923752
253
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon/0.05,3.4791953936219215
254
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon/0.01,3.023197554051876
255
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
256
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.05/num_guesses,100.0
257
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.05/correct_guesses,100.0
258
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.01/epsilon,3.023197554051876
259
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.01/num_guesses,100.0
260
+ 1773783101.5745099,audit_epoch,207,1,audit/loss/empirical_epsilon_details/0.01/correct_guesses,100.0
261
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/auc,0.916456
262
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon/0.05,3.4791953936219215
263
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon/0.01,3.023197554051876
264
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
265
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.05/num_guesses,100.0
266
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,100.0
267
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.01/epsilon,3.023197554051876
268
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.01/num_guesses,100.0
269
+ 1773783101.5745099,audit_epoch,207,1,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,100.0
270
+ 1773783101.5745099,audit_epoch,207,1,perf/audit_duration_sec,6.246651589404792
271
+ 1773783113.496209,train_step,210,2,train/step_loss,5.665921092033386
272
+ 1773783113.496209,train_step,210,2,train/step_real_loss,5.665921092033386
273
+ 1773783113.496209,train_step,210,2,train/lr,5.29959073680547e-05
274
+ 1773783113.496209,train_step,210,2,perf/step_duration_sec,3.699097161181271
275
+ 1773783113.496209,train_step,210,2,perf/samples_per_sec,8.650759524732544
276
+ 1773783113.496209,train_step,210,2,perf/tokens_per_sec,6618.371708890693
277
+ 1773783113.496209,train_step,210,2,perf/logical_batch_size,32.0
278
+ 1773783113.496209,train_step,210,2,perf/logical_token_count,24482.0
279
+ 1773783113.496209,train_step,210,2,perf/gradient_accumulation_steps,4.0
280
+ 1773783113.496209,train_step,210,2,system/cuda_memory_allocated_gb,15.10503625869751
281
+ 1773783113.496209,train_step,210,2,system/cuda_max_memory_allocated_gb,77.32090139389038
282
+ 1773783152.2505124,train_step,220,2,train/step_loss,5.316019058227539
283
+ 1773783152.2505124,train_step,220,2,train/step_real_loss,5.316019058227539
284
+ 1773783152.2505124,train_step,220,2,train/lr,4.9000832207739676e-05
285
+ 1773783152.2505124,train_step,220,2,perf/step_duration_sec,3.963131622876972
286
+ 1773783152.2505124,train_step,220,2,perf/samples_per_sec,8.074422715430812
287
+ 1773783152.2505124,train_step,220,2,perf/tokens_per_sec,6834.746502902325
288
+ 1773783152.2505124,train_step,220,2,perf/logical_batch_size,32.0
289
+ 1773783152.2505124,train_step,220,2,perf/logical_token_count,27087.0
290
+ 1773783152.2505124,train_step,220,2,perf/gradient_accumulation_steps,4.0
291
+ 1773783152.2505124,train_step,220,2,system/cuda_memory_allocated_gb,15.10503625869751
292
+ 1773783152.2505124,train_step,220,2,system/cuda_max_memory_allocated_gb,77.32090139389038
293
+ 1773783190.084273,train_step,230,2,train/step_loss,5.287669539451599
294
+ 1773783190.084273,train_step,230,2,train/step_real_loss,5.287669539451599
295
+ 1773783190.084273,train_step,230,2,train/lr,4.501213853296425e-05
296
+ 1773783190.084273,train_step,230,2,perf/step_duration_sec,3.7102178959175944
297
+ 1773783190.084273,train_step,230,2,perf/samples_per_sec,8.624830373226882
298
+ 1773783190.084273,train_step,230,2,perf/tokens_per_sec,7261.298596409544
299
+ 1773783190.084273,train_step,230,2,perf/logical_batch_size,32.0
300
+ 1773783190.084273,train_step,230,2,perf/logical_token_count,26941.0
301
+ 1773783190.084273,train_step,230,2,perf/gradient_accumulation_steps,4.0
302
+ 1773783190.084273,train_step,230,2,system/cuda_memory_allocated_gb,15.10503625869751
303
+ 1773783190.084273,train_step,230,2,system/cuda_max_memory_allocated_gb,77.32090139389038
304
+ 1773783228.4977129,train_step,240,2,train/step_loss,5.252316236495972
305
+ 1773783228.4977129,train_step,240,2,train/step_real_loss,5.252316236495972
306
+ 1773783228.4977129,train_step,240,2,train/lr,4.1055301335220955e-05
307
+ 1773783228.4977129,train_step,240,2,perf/step_duration_sec,3.832394160795957
308
+ 1773783228.4977129,train_step,240,2,perf/samples_per_sec,8.349871818339755
309
+ 1773783228.4977129,train_step,240,2,perf/tokens_per_sec,6669.981981887527
310
+ 1773783228.4977129,train_step,240,2,perf/logical_batch_size,32.0
311
+ 1773783228.4977129,train_step,240,2,perf/logical_token_count,25562.0
312
+ 1773783228.4977129,train_step,240,2,perf/gradient_accumulation_steps,4.0
313
+ 1773783228.4977129,train_step,240,2,system/cuda_memory_allocated_gb,15.10503625869751
314
+ 1773783228.4977129,train_step,240,2,system/cuda_max_memory_allocated_gb,84.09655332565308
315
+ 1773783266.2993257,train_step,250,2,train/step_loss,5.295487284660339
316
+ 1773783266.2993257,train_step,250,2,train/step_real_loss,5.295487284660339
317
+ 1773783266.2993257,train_step,250,2,train/lr,3.715559214503298e-05
318
+ 1773783266.2993257,train_step,250,2,perf/step_duration_sec,3.704376870766282
319
+ 1773783266.2993257,train_step,250,2,perf/samples_per_sec,8.63842992124625
320
+ 1773783266.2993257,train_step,250,2,perf/tokens_per_sec,6620.006779960055
321
+ 1773783266.2993257,train_step,250,2,perf/logical_batch_size,32.0
322
+ 1773783266.2993257,train_step,250,2,perf/logical_token_count,24523.0
323
+ 1773783266.2993257,train_step,250,2,perf/gradient_accumulation_steps,4.0
324
+ 1773783266.2993257,train_step,250,2,system/cuda_memory_allocated_gb,15.10503625869751
325
+ 1773783266.2993257,train_step,250,2,system/cuda_max_memory_allocated_gb,84.09655332565308
326
+ 1773783280.9439437,eval_step,250,2,eval/loss,5.170890462669458
327
+ 1773783280.9439437,eval_step,250,2,eval/duration_sec,14.642520225141197
328
+ 1773783318.7159903,train_step,260,2,train/step_loss,5.102391600608826
329
+ 1773783318.7159903,train_step,260,2,train/step_real_loss,5.102391600608826
330
+ 1773783318.7159903,train_step,260,2,train/lr,3.33379176277258e-05
331
+ 1773783318.7159903,train_step,260,2,perf/step_duration_sec,3.9554404942318797
332
+ 1773783318.7159903,train_step,260,2,perf/samples_per_sec,8.090122970289858
333
+ 1773783318.7159903,train_step,260,2,perf/tokens_per_sec,6473.109641603173
334
+ 1773783318.7159903,train_step,260,2,perf/logical_batch_size,32.0
335
+ 1773783318.7159903,train_step,260,2,perf/logical_token_count,25604.0
336
+ 1773783318.7159903,train_step,260,2,perf/gradient_accumulation_steps,4.0
337
+ 1773783318.7159903,train_step,260,2,system/cuda_memory_allocated_gb,15.10503625869751
338
+ 1773783318.7159903,train_step,260,2,system/cuda_max_memory_allocated_gb,84.09655332565308
339
+ 1773783356.4853346,train_step,270,2,train/step_loss,5.203338623046875
340
+ 1773783356.4853346,train_step,270,2,train/step_real_loss,5.203338623046875
341
+ 1773783356.4853346,train_step,270,2,train/lr,2.962666050951997e-05
342
+ 1773783356.4853346,train_step,270,2,perf/step_duration_sec,3.9644229151308537
343
+ 1773783356.4853346,train_step,270,2,perf/samples_per_sec,8.071792713604516
344
+ 1773783356.4853346,train_step,270,2,perf/tokens_per_sec,7155.896484132703
345
+ 1773783356.4853346,train_step,270,2,perf/logical_batch_size,32.0
346
+ 1773783356.4853346,train_step,270,2,perf/logical_token_count,28369.0
347
+ 1773783356.4853346,train_step,270,2,perf/gradient_accumulation_steps,4.0
348
+ 1773783356.4853346,train_step,270,2,system/cuda_memory_allocated_gb,15.10503625869751
349
+ 1773783356.4853346,train_step,270,2,system/cuda_max_memory_allocated_gb,84.09655332565308
350
+ 1773783394.5975869,train_step,280,2,train/step_loss,4.966692328453064
351
+ 1773783394.5975869,train_step,280,2,train/step_real_loss,4.966692328453064
352
+ 1773783394.5975869,train_step,280,2,train/lr,2.604552384991855e-05
353
+ 1773783394.5975869,train_step,280,2,perf/step_duration_sec,3.573012210894376
354
+ 1773783394.5975869,train_step,280,2,perf/samples_per_sec,8.956028726246625
355
+ 1773783394.5975869,train_step,280,2,perf/tokens_per_sec,7867.31148421227
356
+ 1773783394.5975869,train_step,280,2,perf/logical_batch_size,32.0
357
+ 1773783394.5975869,train_step,280,2,perf/logical_token_count,28110.0
358
+ 1773783394.5975869,train_step,280,2,perf/gradient_accumulation_steps,4.0
359
+ 1773783394.5975869,train_step,280,2,system/cuda_memory_allocated_gb,15.10503625869751
360
+ 1773783394.5975869,train_step,280,2,system/cuda_max_memory_allocated_gb,84.09655332565308
361
+ 1773783432.839679,train_step,290,2,train/step_loss,5.069774425390995
362
+ 1773783432.839679,train_step,290,2,train/step_real_loss,4.921564221382141
363
+ 1773783432.839679,train_step,290,2,train/lr,2.2617379654990623e-05
364
+ 1773783432.839679,train_step,290,2,train/step_canary_loss,9.8125
365
+ 1773783432.839679,train_step,290,2,perf/step_duration_sec,3.8366584139876068
366
+ 1773783432.839679,train_step,290,2,perf/samples_per_sec,8.601234834899378
367
+ 1773783432.839679,train_step,290,2,perf/tokens_per_sec,7269.607296369047
368
+ 1773783432.839679,train_step,290,2,perf/logical_batch_size,33.0
369
+ 1773783432.839679,train_step,290,2,perf/logical_token_count,27891.0
370
+ 1773783432.839679,train_step,290,2,perf/gradient_accumulation_steps,4.0
371
+ 1773783432.839679,train_step,290,2,system/cuda_memory_allocated_gb,15.10503625869751
372
+ 1773783432.839679,train_step,290,2,system/cuda_max_memory_allocated_gb,84.09655332565308
373
+ 1773783471.2908254,train_step,300,2,train/step_loss,5.246372468543775
374
+ 1773783471.2908254,train_step,300,2,train/step_real_loss,5.078290581703186
375
+ 1773783471.2908254,train_step,300,2,train/lr,1.936412279842705e-05
376
+ 1773783471.2908254,train_step,300,2,train/step_canary_loss,10.625
377
+ 1773783471.2908254,train_step,300,2,perf/step_duration_sec,4.068281149957329
378
+ 1773783471.2908254,train_step,300,2,perf/samples_per_sec,8.111533786288621
379
+ 1773783471.2908254,train_step,300,2,perf/tokens_per_sec,6243.177170846809
380
+ 1773783471.2908254,train_step,300,2,perf/logical_batch_size,33.0
381
+ 1773783471.2908254,train_step,300,2,perf/logical_token_count,25399.0
382
+ 1773783471.2908254,train_step,300,2,perf/gradient_accumulation_steps,4.0
383
+ 1773783471.2908254,train_step,300,2,system/cuda_memory_allocated_gb,15.10503625869751
384
+ 1773783471.2908254,train_step,300,2,system/cuda_max_memory_allocated_gb,84.09655332565308
385
+ 1773783485.8945208,eval_step,300,2,eval/loss,4.965634505857121
386
+ 1773783485.8945208,eval_step,300,2,eval/duration_sec,14.601576885208488
387
+ 1773783524.0127125,train_step,310,2,train/step_loss,5.0469924211502075
388
+ 1773783524.0127125,train_step,310,2,train/step_real_loss,5.0469924211502075
389
+ 1773783524.0127125,train_step,310,2,train/lr,1.6306531183346385e-05
390
+ 1773783524.0127125,train_step,310,2,perf/step_duration_sec,3.7034485950134695
391
+ 1773783524.0127125,train_step,310,2,perf/samples_per_sec,8.64059515854671
392
+ 1773783524.0127125,train_step,310,2,perf/tokens_per_sec,6898.975196902139
393
+ 1773783524.0127125,train_step,310,2,perf/logical_batch_size,32.0
394
+ 1773783524.0127125,train_step,310,2,perf/logical_token_count,25550.0
395
+ 1773783524.0127125,train_step,310,2,perf/gradient_accumulation_steps,4.0
396
+ 1773783524.0127125,train_step,310,2,system/cuda_memory_allocated_gb,15.10503625869751
397
+ 1773783524.0127125,train_step,310,2,system/cuda_max_memory_allocated_gb,84.09655332565308
398
+ 1773783561.666821,train_step,320,2,train/step_loss,5.018897533416748
399
+ 1773783561.666821,train_step,320,2,train/step_real_loss,5.018897533416748
400
+ 1773783561.666821,train_step,320,2,train/lr,1.3464133037968912e-05
401
+ 1773783561.666821,train_step,320,2,perf/step_duration_sec,3.5697252051904798
402
+ 1773783561.666821,train_step,320,2,perf/samples_per_sec,8.964275444359446
403
+ 1773783561.666821,train_step,320,2,perf/tokens_per_sec,7121.5565733283065
404
+ 1773783561.666821,train_step,320,2,perf/logical_batch_size,32.0
405
+ 1773783561.666821,train_step,320,2,perf/logical_token_count,25422.0
406
+ 1773783561.666821,train_step,320,2,perf/gradient_accumulation_steps,4.0
407
+ 1773783561.666821,train_step,320,2,system/cuda_memory_allocated_gb,15.10503625869751
408
+ 1773783561.666821,train_step,320,2,system/cuda_max_memory_allocated_gb,84.09655332565308
409
+ 1773783600.0391417,train_step,330,2,train/step_loss,5.0458667278289795
410
+ 1773783600.0391417,train_step,330,2,train/step_real_loss,5.0458667278289795
411
+ 1773783600.0391417,train_step,330,2,train/lr,1.0855082192715294e-05
412
+ 1773783600.0391417,train_step,330,2,perf/step_duration_sec,3.95492945285514
413
+ 1773783600.0391417,train_step,330,2,perf/samples_per_sec,8.091168346099975
414
+ 1773783600.0391417,train_step,330,2,perf/tokens_per_sec,6913.6505027316125
415
+ 1773783600.0391417,train_step,330,2,perf/logical_batch_size,32.0
416
+ 1773783600.0391417,train_step,330,2,perf/logical_token_count,27343.0
417
+ 1773783600.0391417,train_step,330,2,perf/gradient_accumulation_steps,4.0
418
+ 1773783600.0391417,train_step,330,2,system/cuda_memory_allocated_gb,15.10503625869751
419
+ 1773783600.0391417,train_step,330,2,system/cuda_max_memory_allocated_gb,84.09655332565308
420
+ 1773783637.8100953,train_step,340,2,train/step_loss,5.095871806144714
421
+ 1773783637.8100953,train_step,340,2,train/step_real_loss,5.095871806144714
422
+ 1773783637.8100953,train_step,340,2,train/lr,8.49604213531004e-06
423
+ 1773783637.8100953,train_step,340,2,perf/step_duration_sec,3.57716670492664
424
+ 1773783637.8100953,train_step,340,2,perf/samples_per_sec,8.945627263031415
425
+ 1773783637.8100953,train_step,340,2,perf/tokens_per_sec,7446.396043917744
426
+ 1773783637.8100953,train_step,340,2,perf/logical_batch_size,32.0
427
+ 1773783637.8100953,train_step,340,2,perf/logical_token_count,26637.0
428
+ 1773783637.8100953,train_step,340,2,perf/gradient_accumulation_steps,4.0
429
+ 1773783637.8100953,train_step,340,2,system/cuda_memory_allocated_gb,15.10503625869751
430
+ 1773783637.8100953,train_step,340,2,system/cuda_max_memory_allocated_gb,84.09655332565308
431
+ 1773783676.205935,train_step,350,2,train/step_loss,5.241442084312439
432
+ 1773783676.205935,train_step,350,2,train/step_real_loss,5.241442084312439
433
+ 1773783676.205935,train_step,350,2,train/lr,6.402079584406673e-06
434
+ 1773783676.205935,train_step,350,2,perf/step_duration_sec,3.8292608251795173
435
+ 1773783676.205935,train_step,350,2,perf/samples_per_sec,8.356704194601273
436
+ 1773783676.205935,train_step,350,2,perf/tokens_per_sec,5819.13873650938
437
+ 1773783676.205935,train_step,350,2,perf/logical_batch_size,32.0
438
+ 1773783676.205935,train_step,350,2,perf/logical_token_count,22283.0
439
+ 1773783676.205935,train_step,350,2,perf/gradient_accumulation_steps,4.0
440
+ 1773783676.205935,train_step,350,2,system/cuda_memory_allocated_gb,15.10503625869751
441
+ 1773783676.205935,train_step,350,2,system/cuda_max_memory_allocated_gb,84.09655332565308
442
+ 1773783690.815634,eval_step,350,2,eval/loss,4.8690267882563845
443
+ 1773783690.815634,eval_step,350,2,eval/duration_sec,14.607622059062123
444
+ 1773783728.3614442,train_step,360,2,train/step_loss,4.913779258728027
445
+ 1773783728.3614442,train_step,360,2,train/step_real_loss,4.913779258728027
446
+ 1773783728.3614442,train_step,360,2,train/lr,4.586568261458729e-06
447
+ 1773783728.3614442,train_step,360,2,perf/step_duration_sec,3.8354132031090558
448
+ 1773783728.3614442,train_step,360,2,perf/samples_per_sec,8.343299223682136
449
+ 1773783728.3614442,train_step,360,2,perf/tokens_per_sec,7633.075877266194
450
+ 1773783728.3614442,train_step,360,2,perf/logical_batch_size,32.0
451
+ 1773783728.3614442,train_step,360,2,perf/logical_token_count,29276.0
452
+ 1773783728.3614442,train_step,360,2,perf/gradient_accumulation_steps,4.0
453
+ 1773783728.3614442,train_step,360,2,system/cuda_memory_allocated_gb,15.10503625869751
454
+ 1773783728.3614442,train_step,360,2,system/cuda_max_memory_allocated_gb,84.09655332565308
455
+ 1773783766.5169628,train_step,370,2,train/step_loss,5.0288437604904175
456
+ 1773783766.5169628,train_step,370,2,train/step_real_loss,5.0288437604904175
457
+ 1773783766.5169628,train_step,370,2,train/lr,3.06110347542643e-06
458
+ 1773783766.5169628,train_step,370,2,perf/step_duration_sec,3.820353894960135
459
+ 1773783766.5169628,train_step,370,2,perf/samples_per_sec,8.376187358510125
460
+ 1773783766.5169628,train_step,370,2,perf/tokens_per_sec,7220.011747180773
461
+ 1773783766.5169628,train_step,370,2,perf/logical_batch_size,32.0
462
+ 1773783766.5169628,train_step,370,2,perf/logical_token_count,27583.0
463
+ 1773783766.5169628,train_step,370,2,perf/gradient_accumulation_steps,4.0
464
+ 1773783766.5169628,train_step,370,2,system/cuda_memory_allocated_gb,15.10503625869751
465
+ 1773783766.5169628,train_step,370,2,system/cuda_max_memory_allocated_gb,84.09655332565308
466
+ 1773783804.5456383,train_step,380,2,train/step_loss,4.826860070228577
467
+ 1773783804.5456383,train_step,380,2,train/step_real_loss,4.826860070228577
468
+ 1773783804.5456383,train_step,380,2,train/lr,1.8354280658494649e-06
469
+ 1773783804.5456383,train_step,380,2,perf/step_duration_sec,3.825985827948898
470
+ 1773783804.5456383,train_step,380,2,perf/samples_per_sec,8.363857431525073
471
+ 1773783804.5456383,train_step,380,2,perf/tokens_per_sec,7291.454086476718
472
+ 1773783804.5456383,train_step,380,2,perf/logical_batch_size,32.0
473
+ 1773783804.5456383,train_step,380,2,perf/logical_token_count,27897.0
474
+ 1773783804.5456383,train_step,380,2,perf/gradient_accumulation_steps,4.0
475
+ 1773783804.5456383,train_step,380,2,system/cuda_memory_allocated_gb,15.10503625869751
476
+ 1773783804.5456383,train_step,380,2,system/cuda_max_memory_allocated_gb,84.09655332565308
477
+ 1773783843.007424,train_step,390,2,train/step_loss,4.915419220924377
478
+ 1773783843.007424,train_step,390,2,train/step_real_loss,4.915419220924377
479
+ 1773783843.007424,train_step,390,2,train/lr,9.17370177272775e-07
480
+ 1773783843.007424,train_step,390,2,perf/step_duration_sec,3.7134576980024576
481
+ 1773783843.007424,train_step,390,2,perf/samples_per_sec,8.617305649452646
482
+ 1773783843.007424,train_step,390,2,perf/tokens_per_sec,7082.61737144544
483
+ 1773783843.007424,train_step,390,2,perf/logical_batch_size,32.0
484
+ 1773783843.007424,train_step,390,2,perf/logical_token_count,26301.0
485
+ 1773783843.007424,train_step,390,2,perf/gradient_accumulation_steps,4.0
486
+ 1773783843.007424,train_step,390,2,system/cuda_memory_allocated_gb,15.10503625869751
487
+ 1773783843.007424,train_step,390,2,system/cuda_max_memory_allocated_gb,84.09655332565308
488
+ 1773783881.5532339,train_step,400,2,train/step_loss,4.916181445121765
489
+ 1773783881.5532339,train_step,400,2,train/step_real_loss,4.916181445121765
490
+ 1773783881.5532339,train_step,400,2,train/lr,3.127932624475638e-07
491
+ 1773783881.5532339,train_step,400,2,perf/step_duration_sec,3.831934977322817
492
+ 1773783881.5532339,train_step,400,2,perf/samples_per_sec,8.350872389373583
493
+ 1773783881.5532339,train_step,400,2,perf/tokens_per_sec,7147.302906255116
494
+ 1773783881.5532339,train_step,400,2,perf/logical_batch_size,32.0
495
+ 1773783881.5532339,train_step,400,2,perf/logical_token_count,27388.0
496
+ 1773783881.5532339,train_step,400,2,perf/gradient_accumulation_steps,4.0
497
+ 1773783881.5532339,train_step,400,2,system/cuda_memory_allocated_gb,15.10503625869751
498
+ 1773783881.5532339,train_step,400,2,system/cuda_max_memory_allocated_gb,84.09655332565308
499
+ 1773783896.2133112,eval_step,400,2,eval/loss,4.840054991570385
500
+ 1773783896.2133112,eval_step,400,2,eval/duration_sec,14.658054957631975
501
+ 1773783934.3512156,train_step,410,2,train/step_loss,4.948451399803162
502
+ 1773783934.3512156,train_step,410,2,train/step_real_loss,4.948451399803162
503
+ 1773783934.3512156,train_step,410,2,train/lr,2.5558633627303928e-08
504
+ 1773783934.3512156,train_step,410,2,perf/step_duration_sec,3.8317640791647136
505
+ 1773783934.3512156,train_step,410,2,perf/samples_per_sec,8.35124484150801
506
+ 1773783934.3512156,train_step,410,2,perf/tokens_per_sec,6492.570911469883
507
+ 1773783934.3512156,train_step,410,2,perf/logical_batch_size,32.0
508
+ 1773783934.3512156,train_step,410,2,perf/logical_token_count,24878.0
509
+ 1773783934.3512156,train_step,410,2,perf/gradient_accumulation_steps,4.0
510
+ 1773783934.3512156,train_step,410,2,system/cuda_memory_allocated_gb,15.10503625869751
511
+ 1773783934.3512156,train_step,410,2,system/cuda_max_memory_allocated_gb,84.09655332565308
512
+ 1773783964.370691,train_epoch,414,2,train/epoch_loss,5.128793139947108
513
+ 1773783964.370691,train_epoch,414,2,train/epoch_real_loss,5.082210323228928
514
+ 1773783964.370691,train_epoch,414,2,train/epoch_canary_loss,10.210301459293396
515
+ 1773783964.370691,train_epoch,414,2,perf/epoch_duration_sec,848.1400936129503
516
+ 1773783964.370691,train_epoch,414,2,perf/epoch_samples_per_sec,63.06976925490229
517
+ 1773783964.370691,train_epoch,414,2,perf/epoch_tokens_per_sec,51703.841535419684
518
+ 1773783964.370691,train_epoch,414,2,perf/epoch_samples,53492.0
519
+ 1773783964.370691,train_epoch,414,2,perf/epoch_tokens,43852101.0
520
+ 1773783964.370691,train_epoch,414,2,system/cuda_epoch_peak_memory_gb,84.09655332565308
521
+ 1773783964.370691,train_epoch,414,2,eval/loss,4.839725652878935
522
+ 1773783964.370691,train_epoch,414,2,eval/duration_sec,14.596055098809302
523
+ 1773783972.8350441,audit_epoch,414,2,audit/delta,1e-05
524
+ 1773783972.8350441,audit_epoch,414,2,audit/num_canaries,500.0
525
+ 1773783972.8350441,audit_epoch,414,2,audit/num_members,250.0
526
+ 1773783972.8350441,audit_epoch,414,2,audit/paper_guess_fraction,0.2
527
+ 1773783972.8350441,audit_epoch,414,2,audit/paper_guess_steps,20.0
528
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/auc,0.957184
529
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon/0.05,3.4791953936219215
530
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon/0.01,3.023197554051876
531
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
532
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,100.0
533
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,100.0
534
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.01/epsilon,3.023197554051876
535
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,100.0
536
+ 1773783972.8350441,audit_epoch,414,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,100.0
537
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/auc,0.968208
538
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon/0.05,3.4791953936219215
539
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon/0.01,3.023197554051876
540
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
541
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,100.0
542
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,100.0
543
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,3.023197554051876
544
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,100.0
545
+ 1773783972.8350441,audit_epoch,414,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,100.0
546
+ 1773783972.8350441,audit_epoch,414,2,perf/audit_duration_sec,5.8983861412853
547
+ 1773783981.1943111,audit_final,414,2,audit/delta,1e-05
548
+ 1773783981.1943111,audit_final,414,2,audit/num_canaries,500.0
549
+ 1773783981.1943111,audit_final,414,2,audit/num_members,250.0
550
+ 1773783981.1943111,audit_final,414,2,audit/paper_guess_fraction,0.2
551
+ 1773783981.1943111,audit_final,414,2,audit/paper_guess_steps,20.0
552
+ 1773783981.1943111,audit_final,414,2,audit/loss/auc,0.957184
553
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon/0.05,3.4791953936219215
554
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon/0.01,3.023197554051876
555
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
556
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,100.0
557
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,100.0
558
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.01/epsilon,3.023197554051876
559
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,100.0
560
+ 1773783981.1943111,audit_final,414,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,100.0
561
+ 1773783981.1943111,audit_final,414,2,audit/embedding/auc,0.968208
562
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon/0.05,3.4791953936219215
563
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon/0.01,3.023197554051876
564
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,3.4791953936219215
565
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,100.0
566
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,100.0
567
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,3.023197554051876
568
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,100.0
569
+ 1773783981.1943111,audit_final,414,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,100.0
570
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/duration,1810.5507336058654
571
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/emissions,0.09709380205154217
572
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/emissions_rate,5.362666742741399e-05
573
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/cpu_power,72.0230906963752
574
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/gpu_power,4629.388481318127
575
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/ram_power,54.0
576
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/cpu_energy,0.03488049743955748
577
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/gpu_energy,2.3248590518302024
578
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/ram_energy,0.026150659639004155
579
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/energy_consumed,2.3858902089087644
580
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/water_consumed,0.0
581
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/cpu_count,256.0
582
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/gpu_count,8.0
583
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/longitude,16.1885
584
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/latitude,58.594
585
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/ram_total_size,1511.49019241333
586
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/cpu_utilization_percent,3.485983379501395
587
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/gpu_utilization_percent,91.87222991689751
588
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/ram_utilization_percent,5.226869806094248
589
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/ram_used_gb,78.96254361435317
590
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/pue,1.0
591
+ 1773783981.7373376,energy_final,414,,energy/codecarbon/wue,0.0
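
Because `scalars.csv` is in long format (`timestamp,event,step,epoch,key,value`), it is straightforward to filter or pivot; a hedged sketch using pandas, with the path assumed from the repo layout:

```python
# Sketch: extract the eval-loss curve from the long-format scalars.csv.
import pandas as pd

df = pd.read_csv("deepseek-coder-6.7b/base/scalars.csv")
eval_loss = df[df["key"] == "eval/loss"][["step", "value"]]
print(eval_loss)  # eval loss at steps 50, 100, ..., plus the epoch-end evaluations
```
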
deepseek-coder-6.7b/base/summary.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "audit/delta": 1e-05,
3
+ "audit/embedding/auc": 0.968208,
4
+ "audit/embedding/empirical_epsilon/0.01": 3.023197554051876,
5
+ "audit/embedding/empirical_epsilon/0.05": 3.4791953936219215,
6
+ "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 100.0,
7
+ "audit/embedding/empirical_epsilon_details/0.01/epsilon": 3.023197554051876,
8
+ "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 100.0,
9
+ "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 100.0,
10
+ "audit/embedding/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215,
11
+ "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 100.0,
12
+ "audit/loss/auc": 0.957184,
13
+ "audit/loss/empirical_epsilon/0.01": 3.023197554051876,
14
+ "audit/loss/empirical_epsilon/0.05": 3.4791953936219215,
15
+ "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 100.0,
16
+ "audit/loss/empirical_epsilon_details/0.01/epsilon": 3.023197554051876,
17
+ "audit/loss/empirical_epsilon_details/0.01/num_guesses": 100.0,
18
+ "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 100.0,
19
+ "audit/loss/empirical_epsilon_details/0.05/epsilon": 3.4791953936219215,
20
+ "audit/loss/empirical_epsilon_details/0.05/num_guesses": 100.0,
21
+ "audit/num_canaries": 500.0,
22
+ "audit/num_members": 250.0,
23
+ "audit/paper_guess_fraction": 0.2,
24
+ "audit/paper_guess_steps": 20.0,
25
+ "energy/codecarbon/cpu_count": 256.0,
26
+ "energy/codecarbon/cpu_energy": 0.03488049743955748,
27
+ "energy/codecarbon/cpu_power": 72.0230906963752,
28
+ "energy/codecarbon/cpu_utilization_percent": 3.485983379501395,
29
+ "energy/codecarbon/duration": 1810.5507336058654,
30
+ "energy/codecarbon/emissions": 0.09709380205154217,
31
+ "energy/codecarbon/emissions_rate": 5.362666742741399e-05,
32
+ "energy/codecarbon/energy_consumed": 2.3858902089087644,
33
+ "energy/codecarbon/gpu_count": 8.0,
34
+ "energy/codecarbon/gpu_energy": 2.3248590518302024,
35
+ "energy/codecarbon/gpu_power": 4629.388481318127,
36
+ "energy/codecarbon/gpu_utilization_percent": 91.87222991689751,
37
+ "energy/codecarbon/latitude": 58.594,
38
+ "energy/codecarbon/longitude": 16.1885,
39
+ "energy/codecarbon/pue": 1.0,
40
+ "energy/codecarbon/ram_energy": 0.026150659639004155,
41
+ "energy/codecarbon/ram_power": 54.0,
42
+ "energy/codecarbon/ram_total_size": 1511.49019241333,
43
+ "energy/codecarbon/ram_used_gb": 78.96254361435317,
44
+ "energy/codecarbon/ram_utilization_percent": 5.226869806094248,
45
+ "energy/codecarbon/water_consumed": 0.0,
46
+ "energy/codecarbon/wue": 0.0,
47
+ "eval/duration_sec": 14.596055098809302,
48
+ "eval/loss": 4.839725652878935,
49
+ "perf/audit_duration_sec": 5.8983861412853,
50
+ "perf/epoch_duration_sec": 848.1400936129503,
51
+ "perf/epoch_samples": 53492.0,
52
+ "perf/epoch_samples_per_sec": 63.06976925490229,
53
+ "perf/epoch_tokens": 43852101.0,
54
+ "perf/epoch_tokens_per_sec": 51703.841535419684,
55
+ "perf/gradient_accumulation_steps": 4.0,
56
+ "perf/logical_batch_size": 32.0,
57
+ "perf/logical_token_count": 24878.0,
58
+ "perf/samples_per_sec": 8.35124484150801,
59
+ "perf/step_duration_sec": 3.8317640791647136,
60
+ "perf/tokens_per_sec": 6492.570911469883,
61
+ "system/cuda_epoch_peak_memory_gb": 84.09655332565308,
62
+ "system/cuda_max_memory_allocated_gb": 84.09655332565308,
63
+ "system/cuda_memory_allocated_gb": 15.10503625869751,
64
+ "train/epoch_canary_loss": 10.210301459293396,
65
+ "train/epoch_loss": 5.128793139947108,
66
+ "train/epoch_real_loss": 5.082210323228928,
67
+ "train/lr": 2.5558633627303928e-08,
68
+ "train/step_canary_loss": 10.625,
69
+ "train/step_loss": 4.948451399803162,
70
+ "train/step_real_loss": 4.948451399803162
71
+ }
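summary.json collapses the run's final metrics into a single flat JSON object whose keys use `namespace/metric` paths. A minimal sketch (not part of the upload) of grouping those keys for inspection; the local path is an assumption.

```python
# Sketch only: group the flat "namespace/metric" keys of summary.json by namespace.
# The file path assumes a local checkout of this repo.
import json
from collections import defaultdict

with open("deepseek-coder-6.7b/base/summary.json") as f:
    summary = json.load(f)

grouped = defaultdict(dict)
for key, value in summary.items():
    namespace, _, metric = key.partition("/")
    grouped[namespace][metric] = value

print(sorted(grouped))               # e.g. ['audit', 'energy', 'eval', 'perf', ...]
print(grouped["eval"]["loss"])       # final eval loss
print(grouped["audit"]["loss/auc"])  # membership-inference AUC on the loss signal
```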
deepseek-coder-6.7b/base/tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,26 @@
1
+ {% if not add_generation_prompt is defined %}
2
+ {% set add_generation_prompt = false %}
3
+ {% endif %}
4
+ {%- set ns = namespace(found=false) -%}
5
+ {%- for message in messages -%}
6
+ {%- if message['role'] == 'system' -%}
7
+ {%- set ns.found = true -%}
8
+ {%- endif -%}
9
+ {%- endfor -%}
10
+ {{bos_token}}{%- if not ns.found -%}
11
+ {{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}
12
+ {%- endif %}
13
+ {%- for message in messages %}
14
+ {%- if message['role'] == 'system' %}
15
+ {{ message['content'] }}
16
+ {%- else %}
17
+ {%- if message['role'] == 'user' %}
18
+ {{'### Instruction:\n' + message['content'] + '\n'}}
19
+ {%- else %}
20
+ {{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}
21
+ {%- endif %}
22
+ {%- endif %}
23
+ {%- endfor %}
24
+ {% if add_generation_prompt %}
25
+ {{'### Response:'}}
26
+ {% endif %}
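chat_template.jinja is the DeepSeek-Coder instruct prompt format: an optional system preamble followed by `### Instruction:` / `### Response:` turns terminated with `<|EOT|>`. A minimal rendering sketch, assuming a recent transformers release that picks up chat_template.jinja from a locally saved tokenizer directory:

```python
# Sketch only: render the chat template above with transformers.
# Assumes the local tokenizer directory saved alongside this run.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepseek-coder-6.7b/base/tokenizer")

messages = [{"role": "user", "content": "Write a function that reverses a string."}]

# tokenize=False returns the rendered prompt string; add_generation_prompt appends
# the trailing "### Response:" so the model starts answering immediately.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```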
deepseek-coder-6.7b/base/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
deepseek-coder-6.7b/base/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,516 @@
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|begin▁of▁sentence|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|EOT|>",
7
+ "extra_special_tokens": [
8
+ "865331112869",
9
+ "569765693871",
10
+ "485177821815",
11
+ "135441121756",
12
+ "367459894796",
13
+ "877482678543",
14
+ "457919547633",
15
+ "765474393376",
16
+ "114848338811",
17
+ "746285987371",
18
+ "649291669397",
19
+ "927914615679",
20
+ "445925149649",
21
+ "691587454538",
22
+ "143777992227",
23
+ "997981281989",
24
+ "425949483533",
25
+ "982993456429",
26
+ "718726519731",
27
+ "172599315861",
28
+ "643489267333",
29
+ "282322838685",
30
+ "781653545886",
31
+ "796415361892",
32
+ "841991688488",
33
+ "211411365397",
34
+ "698218415444",
35
+ "355977139358",
36
+ "682564697312",
37
+ "383837596997",
38
+ "689362171782",
39
+ "749966767285",
40
+ "753159165157",
41
+ "795693824762",
42
+ "669689115557",
43
+ "327491773134",
44
+ "983569279932",
45
+ "612128769512",
46
+ "374327157578",
47
+ "311632789559",
48
+ "523918658846",
49
+ "765981581453",
50
+ "794825141891",
51
+ "873898736873",
52
+ "447445629421",
53
+ "473822473819",
54
+ "181439694557",
55
+ "592538279337",
56
+ "668134915514",
57
+ "643692393748",
58
+ "696651276628",
59
+ "853859348234",
60
+ "778466723723",
61
+ "929826356991",
62
+ "272362973463",
63
+ "694235616268",
64
+ "281673864127",
65
+ "479676316326",
66
+ "646979124677",
67
+ "922327493433",
68
+ "883685933161",
69
+ "264259917554",
70
+ "836746273134",
71
+ "658481324922",
72
+ "481884157827",
73
+ "587787496812",
74
+ "579184949249",
75
+ "912193598348",
76
+ "529679678956",
77
+ "795838284624",
78
+ "159337222655",
79
+ "173781362446",
80
+ "773687856563",
81
+ "535787224917",
82
+ "351885857332",
83
+ "578827344666",
84
+ "198462689911",
85
+ "722618266242",
86
+ "952872416512",
87
+ "517778845323",
88
+ "749665846687",
89
+ "661436365453",
90
+ "259666844669",
91
+ "242851284913",
92
+ "514532995959",
93
+ "161588262349",
94
+ "742765629356",
95
+ "225164373623",
96
+ "676539973863",
97
+ "826214551218",
98
+ "182345464792",
99
+ "232776999554",
100
+ "337326533813",
101
+ "676676697292",
102
+ "929185622831",
103
+ "545512344383",
104
+ "499444466686",
105
+ "314697386682",
106
+ "517379856925",
107
+ "379557332953",
108
+ "614797267726",
109
+ "429781429464",
110
+ "922466849763",
111
+ "721737645236",
112
+ "479227349997",
113
+ "136931728327",
114
+ "259533577263",
115
+ "488538864842",
116
+ "937495658852",
117
+ "489991411364",
118
+ "499148455254",
119
+ "441373944925",
120
+ "899151413682",
121
+ "467893531755",
122
+ "527117488925",
123
+ "928335588653",
124
+ "374439448821",
125
+ "879425227932",
126
+ "867678158885",
127
+ "399749397872",
128
+ "129693547287",
129
+ "689285841825",
130
+ "771619544974",
131
+ "724883568652",
132
+ "516968424863",
133
+ "733737988257",
134
+ "852347289392",
135
+ "296953381169",
136
+ "377273562477",
137
+ "262296912232",
138
+ "547149832394",
139
+ "298464134954",
140
+ "216667245274",
141
+ "843998562287",
142
+ "572154333646",
143
+ "124589118494",
144
+ "841824384614",
145
+ "232896526252",
146
+ "295448593321",
147
+ "123741461297",
148
+ "653573457168",
149
+ "196735786156",
150
+ "377338713663",
151
+ "964342468552",
152
+ "586855179568",
153
+ "484773717614",
154
+ "894885246797",
155
+ "677896358599",
156
+ "848845611563",
157
+ "851852651677",
158
+ "398549545767",
159
+ "454244839926",
160
+ "799364566435",
161
+ "967114116556",
162
+ "817378986438",
163
+ "233795848681",
164
+ "824387273757",
165
+ "916198946615",
166
+ "563117729724",
167
+ "951794811935",
168
+ "374598961236",
169
+ "922867396683",
170
+ "765737843639",
171
+ "175469284871",
172
+ "231853711778",
173
+ "662426712668",
174
+ "711412347158",
175
+ "753466987363",
176
+ "513361312532",
177
+ "712992815957",
178
+ "971621888444",
179
+ "829235161526",
180
+ "585544633356",
181
+ "582471228164",
182
+ "678666359123",
183
+ "557533689478",
184
+ "632962475133",
185
+ "484489193824",
186
+ "489562189822",
187
+ "589547936288",
188
+ "363214487524",
189
+ "244885399387",
190
+ "431751228368",
191
+ "433581868192",
192
+ "486391569221",
193
+ "185438575221",
194
+ "126574388585",
195
+ "741757479784",
196
+ "529854679937",
197
+ "996116119839",
198
+ "616248973917",
199
+ "763531783491",
200
+ "955456118295",
201
+ "364196983365",
202
+ "195792996468",
203
+ "151859598873",
204
+ "399223169721",
205
+ "938488813964",
206
+ "961981959227",
207
+ "183368827562",
208
+ "533417736566",
209
+ "786391632558",
210
+ "665661658354",
211
+ "693281533643",
212
+ "475794684356",
213
+ "652154162978",
214
+ "753233719644",
215
+ "668514843129",
216
+ "819162623892",
217
+ "941169431859",
218
+ "877385381798",
219
+ "752644929761",
220
+ "881136466196",
221
+ "275597777299",
222
+ "731681792655",
223
+ "961133895172",
224
+ "864718285734",
225
+ "963852916563",
226
+ "319584985416",
227
+ "563365646341",
228
+ "811371928234",
229
+ "837131396371",
230
+ "267514771964",
231
+ "944513428457",
232
+ "117298239631",
233
+ "158142752582",
234
+ "252867443568",
235
+ "839269684865",
236
+ "612788593128",
237
+ "145669731981",
238
+ "121557291859",
239
+ "245416776926",
240
+ "799417897197",
241
+ "997958836435",
242
+ "892336777248",
243
+ "158929292238",
244
+ "581976444672",
245
+ "897784492783",
246
+ "492373714791",
247
+ "512659818733",
248
+ "881112998642",
249
+ "619454958782",
250
+ "431149748713",
251
+ "624221476921",
252
+ "125866399464",
253
+ "339882449689",
254
+ "186198784585",
255
+ "943193294691",
256
+ "955668961269",
257
+ "232787996724",
258
+ "215671314196",
259
+ "286173241916",
260
+ "745977673725",
261
+ "556976448182",
262
+ "599961512792",
263
+ "766294538337",
264
+ "934912591213",
265
+ "295118729589",
266
+ "529455466433",
267
+ "196119929397",
268
+ "379571934299",
269
+ "251789649997",
270
+ "564544131355",
271
+ "244371196654",
272
+ "384598329253",
273
+ "887753195844",
274
+ "364947325679",
275
+ "655517954651",
276
+ "673948786567",
277
+ "857231548835",
278
+ "816115936673",
279
+ "644234165531",
280
+ "182782912224",
281
+ "234316622259",
282
+ "421369185549",
283
+ "434632855397",
284
+ "921889371893",
285
+ "415956914763",
286
+ "598916996413",
287
+ "773671349113",
288
+ "952465217972",
289
+ "117657531962",
290
+ "729825168745",
291
+ "691315125346",
292
+ "768461952319",
293
+ "664847713559",
294
+ "953267689786",
295
+ "886464195129",
296
+ "824488329416",
297
+ "837873762491",
298
+ "532833541879",
299
+ "669183782449",
300
+ "941976537588",
301
+ "739394546916",
302
+ "267954879268",
303
+ "637551427887",
304
+ "217756494954",
305
+ "524444658383",
306
+ "117783274348",
307
+ "138218735276",
308
+ "814611949491",
309
+ "711641973413",
310
+ "499156317423",
311
+ "515856611931",
312
+ "454164859837",
313
+ "345271433112",
314
+ "462294118988",
315
+ "511785788222",
316
+ "497294727353",
317
+ "866519986723",
318
+ "334513529294",
319
+ "549946382131",
320
+ "284445431422",
321
+ "396521188476",
322
+ "421435255895",
323
+ "133373659361",
324
+ "322683334381",
325
+ "228358422847",
326
+ "291762694874",
327
+ "143182978129",
328
+ "511923256573",
329
+ "327158398268",
330
+ "879764613759",
331
+ "564395222747",
332
+ "451161679736",
333
+ "538631466654",
334
+ "221762325616",
335
+ "218391991184",
336
+ "322589379462",
337
+ "876537814263",
338
+ "152676556624",
339
+ "332522971941",
340
+ "884354318946",
341
+ "513349618943",
342
+ "116639746413",
343
+ "635185846287",
344
+ "993832498489",
345
+ "813981174797",
346
+ "438745114173",
347
+ "983493951323",
348
+ "724492262421",
349
+ "622553389126",
350
+ "889965243135",
351
+ "364492359246",
352
+ "154962668224",
353
+ "179564995814",
354
+ "418412875665",
355
+ "718951851413",
356
+ "699446724178",
357
+ "624266421831",
358
+ "815458725125",
359
+ "455423278865",
360
+ "393741199486",
361
+ "328552864359",
362
+ "211662639865",
363
+ "218784516525",
364
+ "762486672996",
365
+ "142799718159",
366
+ "858146415154",
367
+ "767858144912",
368
+ "571317457151",
369
+ "635127952696",
370
+ "116427191984",
371
+ "268921994538",
372
+ "523937669294",
373
+ "165429152138",
374
+ "739246183345",
375
+ "591464355756",
376
+ "212985874612",
377
+ "191887635211",
378
+ "967214577653",
379
+ "119342152414",
380
+ "946444632795",
381
+ "618423867817",
382
+ "228565148417",
383
+ "729116422489",
384
+ "527874729936",
385
+ "739784153482",
386
+ "387763951128",
387
+ "331369926711",
388
+ "562716493614",
389
+ "739667844957",
390
+ "562389434565",
391
+ "256497188281",
392
+ "859927364588",
393
+ "417668946583",
394
+ "357621613582",
395
+ "438435178228",
396
+ "485692541169",
397
+ "825815739116",
398
+ "342221452223",
399
+ "697747991249",
400
+ "716763689965",
401
+ "141499982867",
402
+ "818479319499",
403
+ "336813343298",
404
+ "594688742928",
405
+ "472129283475",
406
+ "514354144759",
407
+ "349249721685",
408
+ "546276298359",
409
+ "353755529131",
410
+ "315534574435",
411
+ "523723475786",
412
+ "215826764872",
413
+ "367968398551",
414
+ "569853653352",
415
+ "389715484387",
416
+ "293847485454",
417
+ "714738141818",
418
+ "178478368922",
419
+ "581493616981",
420
+ "589439538674",
421
+ "846657726193",
422
+ "722339992679",
423
+ "138154781148",
424
+ "757785319772",
425
+ "492516914298",
426
+ "919181521716",
427
+ "985781138935",
428
+ "476969195485",
429
+ "313145133463",
430
+ "758963111966",
431
+ "147541537162",
432
+ "557163366873",
433
+ "144373897488",
434
+ "522515164754",
435
+ "724964923582",
436
+ "284776712475",
437
+ "375429755114",
438
+ "181233596124",
439
+ "948585673431",
440
+ "243165586174",
441
+ "396847976144",
442
+ "997724962668",
443
+ "558837194455",
444
+ "163165456396",
445
+ "378749551722",
446
+ "161238482259",
447
+ "754978243758",
448
+ "195388849133",
449
+ "229775525672",
450
+ "262437452884",
451
+ "441377892146",
452
+ "451885565366",
453
+ "981277526855",
454
+ "762495822823",
455
+ "368763327262",
456
+ "757422791351",
457
+ "636324136426",
458
+ "214193645583",
459
+ "412843856172",
460
+ "179386156569",
461
+ "756916173536",
462
+ "892697125149",
463
+ "625334487352",
464
+ "941861857715",
465
+ "887417525236",
466
+ "649516938598",
467
+ "717628619782",
468
+ "438124184139",
469
+ "547563892268",
470
+ "856317483891",
471
+ "313313831273",
472
+ "371496153876",
473
+ "587541149322",
474
+ "265847332563",
475
+ "449549215429",
476
+ "163497196769",
477
+ "861342291298",
478
+ "268433315926",
479
+ "774679513717",
480
+ "851254219729",
481
+ "583527834464",
482
+ "488496781997",
483
+ "556814553861",
484
+ "482829231639",
485
+ "618878266619",
486
+ "147444452794",
487
+ "949235426629",
488
+ "357299947518",
489
+ "175528632226",
490
+ "645527857972",
491
+ "186872457894",
492
+ "552738847828",
493
+ "626748382482",
494
+ "921894985642",
495
+ "943878645871",
496
+ "859289776479",
497
+ "614583493135",
498
+ "933775286797",
499
+ "332234613346",
500
+ "325196781219",
501
+ "142526557681",
502
+ "356722692178",
503
+ "449318681694",
504
+ "687284547244",
505
+ "947262995132",
506
+ "893974619684",
507
+ "797238311233"
508
+ ],
509
+ "is_local": false,
510
+ "model_max_length": 16384,
511
+ "pad_token": "<|end▁of▁sentence|>",
512
+ "sp_model_kwargs": {},
513
+ "tokenizer_class": "LlamaTokenizer",
514
+ "unk_token": null,
515
+ "use_default_system_prompt": false
516
+ }
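The `extra_special_tokens` list above appears to hold the 12-digit canary secrets used by the privacy audit (compare `audit.num_digits: 12` in resolved_config.yaml). A minimal sanity-check sketch, not part of the upload; the interpretation and the local path are assumptions.

```python
# Sketch only: inspect the extra_special_tokens list, which appears to contain the
# 12-digit audit canary strings. The file path assumes a local checkout of this repo.
import json

with open("deepseek-coder-6.7b/base/tokenizer/tokenizer_config.json") as f:
    cfg = json.load(f)

canaries = cfg["extra_special_tokens"]
print(len(canaries))                                         # number of canary strings
print(all(len(c) == 12 and c.isdigit() for c in canaries))   # all 12-digit numbers
```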
deepseek-coder-6.7b/base/train.log ADDED
@@ -0,0 +1,49 @@
1
+ 2026-03-17 21:17:52,048 [INFO] new_opacus_codex.train_steps: epoch=1 step=10 loss=14.8217
2
+ 2026-03-17 21:18:29,786 [INFO] new_opacus_codex.train_steps: epoch=1 step=20 loss=13.9264
3
+ 2026-03-17 21:19:07,599 [INFO] new_opacus_codex.train_steps: epoch=1 step=30 loss=11.1213
4
+ 2026-03-17 21:19:46,064 [INFO] new_opacus_codex.train_steps: epoch=1 step=40 loss=8.4720
5
+ 2026-03-17 21:20:23,734 [INFO] new_opacus_codex.train_steps: epoch=1 step=50 loss=7.3598
6
+ 2026-03-17 21:20:38,185 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=50 eval_loss=6.9550 duration_sec=14.45
7
+ 2026-03-17 21:21:15,967 [INFO] new_opacus_codex.train_steps: epoch=1 step=60 loss=6.9297
8
+ 2026-03-17 21:21:54,374 [INFO] new_opacus_codex.train_steps: epoch=1 step=70 loss=6.7893
9
+ 2026-03-17 21:22:32,136 [INFO] new_opacus_codex.train_steps: epoch=1 step=80 loss=6.7460
10
+ 2026-03-17 21:23:10,082 [INFO] new_opacus_codex.train_steps: epoch=1 step=90 loss=6.6407
11
+ 2026-03-17 21:23:47,521 [INFO] new_opacus_codex.train_steps: epoch=1 step=100 loss=6.5150
12
+ 2026-03-17 21:24:02,007 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=100 eval_loss=6.3930 duration_sec=14.48
13
+ 2026-03-17 21:24:40,121 [INFO] new_opacus_codex.train_steps: epoch=1 step=110 loss=6.4560
14
+ 2026-03-17 21:25:17,354 [INFO] new_opacus_codex.train_steps: epoch=1 step=120 loss=6.2952
15
+ 2026-03-17 21:25:55,699 [INFO] new_opacus_codex.train_steps: epoch=1 step=130 loss=6.2335
16
+ 2026-03-17 21:26:34,009 [INFO] new_opacus_codex.train_steps: epoch=1 step=140 loss=6.1285
17
+ 2026-03-17 21:27:12,683 [INFO] new_opacus_codex.train_steps: epoch=1 step=150 loss=6.0463
18
+ 2026-03-17 21:27:27,265 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=150 eval_loss=5.9036 duration_sec=14.58
19
+ 2026-03-17 21:28:04,787 [INFO] new_opacus_codex.train_steps: epoch=1 step=160 loss=5.9513
20
+ 2026-03-17 21:28:42,167 [INFO] new_opacus_codex.train_steps: epoch=1 step=170 loss=5.8611
21
+ 2026-03-17 21:29:20,592 [INFO] new_opacus_codex.train_steps: epoch=1 step=180 loss=5.8208
22
+ 2026-03-17 21:29:59,226 [INFO] new_opacus_codex.train_steps: epoch=1 step=190 loss=5.8057
23
+ 2026-03-17 21:30:37,106 [INFO] new_opacus_codex.train_steps: epoch=1 step=200 loss=5.6750
24
+ 2026-03-17 21:30:51,629 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=200 eval_loss=5.4885 duration_sec=14.52
25
+ 2026-03-17 21:31:53,495 [INFO] new_opacus_codex.train_steps: epoch=2 step=210 loss=5.6623
26
+ 2026-03-17 21:32:32,250 [INFO] new_opacus_codex.train_steps: epoch=2 step=220 loss=5.4695
27
+ 2026-03-17 21:33:10,083 [INFO] new_opacus_codex.train_steps: epoch=2 step=230 loss=5.4676
28
+ 2026-03-17 21:33:48,497 [INFO] new_opacus_codex.train_steps: epoch=2 step=240 loss=5.3727
29
+ 2026-03-17 21:34:26,298 [INFO] new_opacus_codex.train_steps: epoch=2 step=250 loss=5.3441
30
+ 2026-03-17 21:34:40,943 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=250 eval_loss=5.1709 duration_sec=14.64
31
+ 2026-03-17 21:35:18,715 [INFO] new_opacus_codex.train_steps: epoch=2 step=260 loss=5.2466
32
+ 2026-03-17 21:35:56,484 [INFO] new_opacus_codex.train_steps: epoch=2 step=270 loss=5.1862
33
+ 2026-03-17 21:36:34,597 [INFO] new_opacus_codex.train_steps: epoch=2 step=280 loss=5.1289
34
+ 2026-03-17 21:37:12,839 [INFO] new_opacus_codex.train_steps: epoch=2 step=290 loss=5.1346
35
+ 2026-03-17 21:37:51,290 [INFO] new_opacus_codex.train_steps: epoch=2 step=300 loss=5.1520
36
+ 2026-03-17 21:38:05,894 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=300 eval_loss=4.9656 duration_sec=14.60
37
+ 2026-03-17 21:38:44,012 [INFO] new_opacus_codex.train_steps: epoch=2 step=310 loss=5.0725
38
+ 2026-03-17 21:39:21,666 [INFO] new_opacus_codex.train_steps: epoch=2 step=320 loss=5.0865
39
+ 2026-03-17 21:40:00,038 [INFO] new_opacus_codex.train_steps: epoch=2 step=330 loss=5.0690
40
+ 2026-03-17 21:40:37,809 [INFO] new_opacus_codex.train_steps: epoch=2 step=340 loss=5.0108
41
+ 2026-03-17 21:41:16,205 [INFO] new_opacus_codex.train_steps: epoch=2 step=350 loss=5.0537
42
+ 2026-03-17 21:41:30,815 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=350 eval_loss=4.8690 duration_sec=14.61
43
+ 2026-03-17 21:42:08,360 [INFO] new_opacus_codex.train_steps: epoch=2 step=360 loss=4.9427
44
+ 2026-03-17 21:42:46,516 [INFO] new_opacus_codex.train_steps: epoch=2 step=370 loss=4.9867
45
+ 2026-03-17 21:43:24,545 [INFO] new_opacus_codex.train_steps: epoch=2 step=380 loss=4.9544
46
+ 2026-03-17 21:44:03,007 [INFO] new_opacus_codex.train_steps: epoch=2 step=390 loss=5.0119
47
+ 2026-03-17 21:44:41,552 [INFO] new_opacus_codex.train_steps: epoch=2 step=400 loss=4.9797
48
+ 2026-03-17 21:44:56,213 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=400 eval_loss=4.8401 duration_sec=14.66
49
+ 2026-03-17 21:45:34,350 [INFO] new_opacus_codex.train_steps: epoch=2 step=410 loss=4.9406
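train.log is plain text with one line per logged training step or eval. A minimal sketch (not part of the upload) of extracting `(epoch, step, loss)` triples with a regular expression; the local path is an assumption.

```python
# Sketch only: pull (epoch, step, loss) out of the train.log lines shown above.
# The pattern deliberately skips eval lines, whose metric is named eval_loss.
# The file path assumes a local checkout of this repo.
import re

step_re = re.compile(r"epoch=(\d+) step=(\d+) loss=([\d.]+)")

records = []
with open("deepseek-coder-6.7b/base/train.log") as f:
    for line in f:
        match = step_re.search(line)
        if match:
            epoch, step, loss = match.groups()
            records.append((int(epoch), int(step), float(loss)))

print(records[0])    # (1, 10, 14.8217)
print(records[-1])   # last logged training step
```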
deepseek-coder-6.7b/dp3/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": true,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [
25
+ "lm_head",
26
+ "embed_tokens"
27
+ ],
28
+ "peft_type": "LORA",
29
+ "peft_version": "0.18.1",
30
+ "qalora_group_size": 16,
31
+ "r": 16,
32
+ "rank_pattern": {},
33
+ "revision": null,
34
+ "target_modules": [
35
+ "k_proj",
36
+ "v_proj",
37
+ "q_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
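adapter_config.json describes a rank-16 LoRA over the attention projections (`q_proj`, `k_proj`, `v_proj`, `o_proj`) with `lm_head` and `embed_tokens` saved in full. A minimal loading sketch, assuming the adapter weights sit next to this config and that peft/transformers are installed:

```python
# Sketch only: attach the LoRA adapter described above to its base model with peft.
# Assumes the adapter weights live in deepseek-coder-6.7b/dp3/adapter locally.
import torch
from transformers import AutoModelForCausalLM
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-coder-6.7b-instruct",  # base_model_name_or_path from the config
    torch_dtype=torch.bfloat16,
)
model = PeftModel.from_pretrained(base, "deepseek-coder-6.7b/dp3/adapter")

# Optionally fold the rank-16 q/k/v/o updates into the base weights (and swap in the
# saved lm_head / embed_tokens) for plain inference without the peft wrapper.
merged = model.merge_and_unload()
```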
deepseek-coder-6.7b/dp3/audit_results.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "delta": 1e-05,
3
+ "num_canaries": 500,
4
+ "num_members": 250,
5
+ "paper_guess_fraction": 0.2,
6
+ "paper_guess_steps": 20,
7
+ "loss": {
8
+ "auc": 0.52164,
9
+ "empirical_epsilon": {
10
+ "0.05": 0.019017613492906094,
11
+ "0.01": 0.0
12
+ },
13
+ "empirical_epsilon_details": {
14
+ "0.05": {
15
+ "epsilon": 0.019017613492906094,
16
+ "num_guesses": 85,
17
+ "correct_guesses": 51,
18
+ "candidate_num_guesses": [
19
+ 5,
20
+ 10,
21
+ 15,
22
+ 20,
23
+ 25,
24
+ 30,
25
+ 35,
26
+ 40,
27
+ 45,
28
+ 50,
29
+ 55,
30
+ 60,
31
+ 65,
32
+ 70,
33
+ 75,
34
+ 80,
35
+ 85,
36
+ 90,
37
+ 95,
38
+ 100
39
+ ],
40
+ "direction": "lower"
41
+ },
42
+ "0.01": {
43
+ "epsilon": 0.0,
44
+ "num_guesses": 0,
45
+ "correct_guesses": 0,
46
+ "candidate_num_guesses": [
47
+ 5,
48
+ 10,
49
+ 15,
50
+ 20,
51
+ 25,
52
+ 30,
53
+ 35,
54
+ 40,
55
+ 45,
56
+ 50,
57
+ 55,
58
+ 60,
59
+ 65,
60
+ 70,
61
+ 75,
62
+ 80,
63
+ 85,
64
+ 90,
65
+ 95,
66
+ 100
67
+ ],
68
+ "direction": "lower"
69
+ }
70
+ }
71
+ },
72
+ "embedding": {
73
+ "auc": 0.543272,
74
+ "empirical_epsilon": {
75
+ "0.05": 0.0,
76
+ "0.01": 0.0
77
+ },
78
+ "empirical_epsilon_details": {
79
+ "0.05": {
80
+ "epsilon": 0.0,
81
+ "num_guesses": 0,
82
+ "correct_guesses": 0,
83
+ "candidate_num_guesses": [
84
+ 5,
85
+ 10,
86
+ 15,
87
+ 20,
88
+ 25,
89
+ 30,
90
+ 35,
91
+ 40,
92
+ 45,
93
+ 50,
94
+ 55,
95
+ 60,
96
+ 65,
97
+ 70,
98
+ 75,
99
+ 80,
100
+ 85,
101
+ 90,
102
+ 95,
103
+ 100
104
+ ],
105
+ "direction": "lower"
106
+ },
107
+ "0.01": {
108
+ "epsilon": 0.0,
109
+ "num_guesses": 0,
110
+ "correct_guesses": 0,
111
+ "candidate_num_guesses": [
112
+ 5,
113
+ 10,
114
+ 15,
115
+ 20,
116
+ 25,
117
+ 30,
118
+ 35,
119
+ 40,
120
+ 45,
121
+ 50,
122
+ 55,
123
+ 60,
124
+ 65,
125
+ 70,
126
+ 75,
127
+ 80,
128
+ 85,
129
+ 90,
130
+ 95,
131
+ 100
132
+ ],
133
+ "direction": "lower"
134
+ }
135
+ }
136
+ }
137
+ }
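audit_results.json reports, per attack signal (`loss`, `embedding`) and per p-value, a membership-inference AUC and an empirical-epsilon estimate together with the underlying guess counts. A minimal sketch (not part of the upload) that tabulates those numbers; the local path is an assumption.

```python
# Sketch only: summarise the per-signal, per-p-value audit results shown above.
# The file path assumes a local checkout of this repo.
import json

with open("deepseek-coder-6.7b/dp3/audit_results.json") as f:
    audit = json.load(f)

print(f"target delta: {audit['delta']}, canaries: {audit['num_canaries']}")
for signal in ("loss", "embedding"):
    result = audit[signal]
    for p_value, eps in sorted(result["empirical_epsilon"].items()):
        detail = result["empirical_epsilon_details"][p_value]
        print(f"{signal:9s} p={p_value}  auc={result['auc']:.3f}  eps={eps:.3f}  "
              f"guesses={detail['correct_guesses']}/{detail['num_guesses']}")
```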
deepseek-coder-6.7b/dp3/canary_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
deepseek-coder-6.7b/dp3/codecarbon.csv ADDED
@@ -0,0 +1,2 @@
1
+ timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,cpu_utilization_percent,gpu_utilization_percent,ram_utilization_percent,ram_used_gb,on_cloud,pue,wue
2
+ 2026-03-17T23:04:51,codedp-deepseek-coder-6.7b-cpt-dp3,0be3f13a-f9c0-4f72-8f3f-769479cfe611,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,2107.2360566542484,0.10251235144047784,4.864777779250855e-05,72.02572178945746,4183.395055207508,54.0,0.040608159184490934,2.447988524778083,0.030443774090695673,2.5190404580532704,0.0,Sweden,SWE,östergötland county,,,Linux-6.8.0-94-generic-x86_64-with-glibc2.39,3.11.0,3.2.3,256,AMD EPYC 9554 64-Core Processor,8,8 x NVIDIA H200,16.1885,58.594,1511.49019241333,machine,3.5188995215311123,80.96321770334929,5.3144497607653856,80.42555782920437,N,1.0,0.0
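codecarbon.csv is a single-measurement CodeCarbon report for the whole run (energies in kWh, emissions in kg CO2eq per the CodeCarbon documentation). A minimal reading sketch, not part of the upload; the local path is an assumption.

```python
# Sketch only: read the one-row CodeCarbon report shown above.
# The file path assumes a local checkout of this repo.
import csv

with open("deepseek-coder-6.7b/dp3/codecarbon.csv") as f:
    row = next(csv.DictReader(f))

print(float(row["energy_consumed"]), "kWh total")
print(float(row["gpu_energy"]), "kWh on GPU")
print(float(row["emissions"]), "kg CO2eq")
```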
deepseek-coder-6.7b/dp3/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
1
+ {"timestamp": 1773786740.0321612, "event": "train_step", "step": 10, "epoch": 1, "metrics": {"train/step_loss": 14.807524247602982, "train/step_real_loss": 14.926509380340576, "train/lr": 0.00018181818181818183, "train/step_canary_loss": 11.0, "perf/step_duration_sec": 9.13194552809, "perf/samples_per_sec": 7.227375568216326, "perf/tokens_per_sec": 5865.562802059686, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 53564.0, "perf/physical_batches": 9.0, "privacy/epsilon": 0.7131647471248268, "system/cuda_memory_allocated_gb": 14.177838802337646, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
2
+ {"timestamp": 1773786833.0391247, "event": "train_step", "step": 20, "epoch": 1, "metrics": {"train/step_loss": 14.44931131250718, "train/step_real_loss": 14.67465889453888, "train/lr": 0.00019897180218885507, "train/step_canary_loss": 10.84375, "perf/step_duration_sec": 9.05874504102394, "perf/samples_per_sec": 7.50655854558787, "perf/tokens_per_sec": 6064.305789733377, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 54935.0, "perf/physical_batches": 9.0, "privacy/epsilon": 0.9542480237478311, "system/cuda_memory_allocated_gb": 14.301010608673096, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
3
+ {"timestamp": 1773786926.3488383, "event": "train_step", "step": 30, "epoch": 1, "metrics": {"train/step_loss": 13.955331860166607, "train/step_real_loss": 14.391435980796814, "train/lr": 0.00019544467510209388, "train/step_canary_loss": 0.0, "perf/step_duration_sec": 9.485276577994227, "perf/samples_per_sec": 6.74729929841788, "perf/tokens_per_sec": 5592.9839856307335, "perf/logical_batch_size": 64.0, "perf/logical_token_count": 53051.0, "perf/physical_batches": 10.0, "privacy/epsilon": 1.145336977893831, "system/cuda_memory_allocated_gb": 14.547617435455322, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
4
+ {"timestamp": 1773787018.6373072, "event": "train_step", "step": 40, "epoch": 1, "metrics": {"train/step_loss": 14.15506328235973, "train/step_real_loss": 14.120846509933472, "train/lr": 0.00018949541262593762, "train/step_canary_loss": 15.25, "perf/step_duration_sec": 8.954908549785614, "perf/samples_per_sec": 7.370259521140512, "perf/tokens_per_sec": 5682.023408404114, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 50882.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.311193108872294, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
5
+ {"timestamp": 1773787111.6987886, "event": "train_step", "step": 50, "epoch": 1, "metrics": {"train/step_loss": 13.791429372934195, "train/step_real_loss": 13.741295456886292, "train/lr": 0.00018127499143005268, "train/step_canary_loss": 17.0, "perf/step_duration_sec": 9.314833164215088, "perf/samples_per_sec": 6.97811746642026, "perf/tokens_per_sec": 5992.48521320173, "perf/logical_batch_size": 65.0, "perf/logical_token_count": 55819.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.4583641061524852, "system/cuda_memory_allocated_gb": 14.1148681640625, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
6
+ {"timestamp": 1773787126.3622315, "event": "eval_step", "step": 50, "epoch": 1, "metrics": {"eval/loss": 13.72979099913077, "eval/duration_sec": 14.660556460730731}}
7
+ {"timestamp": 1773787219.2685702, "event": "train_step", "step": 60, "epoch": 1, "metrics": {"train/step_loss": 13.225338772365026, "train/step_real_loss": 13.441776752471924, "train/lr": 0.0001709920242324663, "train/step_canary_loss": 10.916666984558105, "perf/step_duration_sec": 9.050328846089542, "perf/samples_per_sec": 7.734525583591975, "perf/tokens_per_sec": 6221.431392996139, "perf/logical_batch_size": 70.0, "perf/logical_token_count": 56306.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.5929146459722179, "system/cuda_memory_allocated_gb": 14.425142288208008, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
8
+ {"timestamp": 1773787312.4062448, "event": "train_step", "step": 70, "epoch": 1, "metrics": {"train/step_loss": 12.945215898401598, "train/step_real_loss": 13.073627829551697, "train/lr": 0.00015890746575622231, "train/step_canary_loss": 10.890625, "perf/step_duration_sec": 9.778116607107222, "perf/samples_per_sec": 6.9543044670355245, "perf/tokens_per_sec": 5356.041669817419, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 52372.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.7182200620091768, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
9
+ {"timestamp": 1773787406.2219386, "event": "train_step", "step": 80, "epoch": 1, "metrics": {"train/step_loss": 12.60925786635455, "train/step_real_loss": 12.712766170501709, "train/lr": 0.00014532799038330385, "train/step_canary_loss": 10.953125, "perf/step_duration_sec": 9.040645513217896, "perf/samples_per_sec": 7.521586804900208, "perf/tokens_per_sec": 5688.089409635122, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 51424.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.8360751937320303, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
10
+ {"timestamp": 1773787500.4468713, "event": "train_step", "step": 90, "epoch": 1, "metrics": {"train/step_loss": 12.172602767374977, "train/step_real_loss": 12.225615382194519, "train/lr": 0.00013059820956358998, "train/step_canary_loss": 11.041666984558105, "perf/step_duration_sec": 9.056960263755172, "perf/samples_per_sec": 7.397625477957065, "perf/tokens_per_sec": 6038.781048744861, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 54693.0, "perf/physical_batches": 9.0, "privacy/epsilon": 1.9476934830264792, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
11
+ {"timestamp": 1773787593.4491894, "event": "train_step", "step": 100, "epoch": 1, "metrics": {"train/step_loss": 11.868438344606211, "train/step_real_loss": 11.924361228942871, "train/lr": 0.00011509192648058249, "train/step_canary_loss": 11.35714340209961, "perf/step_duration_sec": 9.823987863957882, "perf/samples_per_sec": 7.227207625172652, "perf/tokens_per_sec": 4852.001107908164, "perf/logical_batch_size": 71.0, "perf/logical_token_count": 47666.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.054057754637441, "system/cuda_memory_allocated_gb": 14.487197399139404, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
12
+ {"timestamp": 1773787607.9792094, "event": "eval_step", "step": 100, "epoch": 1, "metrics": {"eval/loss": 11.81135509501804, "eval/duration_sec": 14.526865308173}}
13
+ {"timestamp": 1773787658.5497303, "event": "train_epoch", "step": 104, "epoch": 1, "metrics": {"train/epoch_loss": 13.449225129287315, "train/epoch_real_loss": 13.57216826545721, "train/epoch_canary_loss": 10.867904580308076, "perf/epoch_duration_sec": 997.745806640014, "perf/epoch_samples_per_sec": 55.996226321558325, "perf/epoch_tokens_per_sec": 44201.770337193724, "perf/epoch_samples": 55870.0, "perf/epoch_tokens": 44102131.0, "system/cuda_epoch_peak_memory_gb": 73.2897834777832, "eval/loss": 11.659296675161881, "eval/duration_sec": 14.518914998974651, "privacy/epsilon": 2.0952814257505974}}
14
+ {"timestamp": 1773787666.4096277, "event": "audit_epoch", "step": 104, "epoch": 1, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.536216, "audit/loss/empirical_epsilon/0.05": 0.0, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.542752, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0, "perf/audit_duration_sec": 5.343048084992915}}
15
+ {"timestamp": 1773787722.7301056, "event": "train_step", "step": 110, "epoch": 2, "metrics": {"train/step_loss": 11.533822839910334, "train/step_real_loss": 11.543668866157532, "train/lr": 9.920264990753837e-05, "train/step_canary_loss": 11.21875, "perf/step_duration_sec": 9.264618248213083, "perf/samples_per_sec": 7.123876908012889, "perf/tokens_per_sec": 5974.126350071172, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 55348.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.155895236118489, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.28975915908813}}
16
+ {"timestamp": 1773787815.270247, "event": "train_step", "step": 120, "epoch": 2, "metrics": {"train/step_loss": 11.204551952988353, "train/step_real_loss": 11.20437467098236, "train/lr": 8.333360798744496e-05, "train/step_canary_loss": 11.208333969116211, "perf/step_duration_sec": 9.1695022219792, "perf/samples_per_sec": 7.306830662999536, "perf/tokens_per_sec": 5355.143475760137, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 49104.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.2538737991591966, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.28975915908813}}
17
+ {"timestamp": 1773787907.84406, "event": "train_step", "step": 130, "epoch": 2, "metrics": {"train/step_loss": 10.907811731532, "train/step_real_loss": 10.883031368255615, "train/lr": 6.788751536089739e-05, "train/step_canary_loss": 11.225000381469727, "perf/step_duration_sec": 9.083537110127509, "perf/samples_per_sec": 7.59615986189673, "perf/tokens_per_sec": 6119.20217048794, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 55584.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.34829265801524, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
18
+ {"timestamp": 1773787999.754929, "event": "train_step", "step": 140, "epoch": 2, "metrics": {"train/step_loss": 10.733073462301226, "train/step_real_loss": 10.715678453445435, "train/lr": 5.325635332531864e-05, "train/step_canary_loss": 11.104166984558105, "perf/step_duration_sec": 9.54831570899114, "perf/samples_per_sec": 7.016944353537626, "perf/tokens_per_sec": 5555.115856721534, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 53042.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.4397183333948855, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
19
+ {"timestamp": 1773788092.0733988, "event": "train_step", "step": 150, "epoch": 2, "metrics": {"train/step_loss": 10.644248768903207, "train/step_real_loss": 10.618408799171448, "train/lr": 3.981142237826332e-05, "train/step_canary_loss": 10.975000381469727, "perf/step_duration_sec": 9.050548555329442, "perf/samples_per_sec": 7.623847281540647, "perf/tokens_per_sec": 6154.654567009547, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 55703.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.5283910517887054, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
20
+ {"timestamp": 1773788106.5788815, "event": "eval_step", "step": 150, "epoch": 2, "metrics": {"eval/loss": 10.528999336741187, "eval/duration_sec": 14.50253726914525}}
21
+ {"timestamp": 1773788198.8056324, "event": "train_step", "step": 160, "epoch": 2, "metrics": {"train/step_loss": 10.509951504794033, "train/step_real_loss": 10.495614051818848, "train/lr": 2.789391958515183e-05, "train/step_canary_loss": 10.96875, "perf/step_duration_sec": 8.9364311741665, "perf/samples_per_sec": 7.38549860830275, "perf/tokens_per_sec": 6128.061519492174, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 54763.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.6145698431381854, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
22
+ {"timestamp": 1773788292.7653294, "event": "train_step", "step": 170, "epoch": 2, "metrics": {"train/step_loss": 10.498227048276076, "train/step_real_loss": 10.467870473861694, "train/lr": 1.7806279893114875e-05, "train/step_canary_loss": 11.145833969116211, "perf/step_duration_sec": 9.595276300795376, "perf/samples_per_sec": 6.982602470180687, "perf/tokens_per_sec": 5185.6766225559795, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 49758.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.6985357679843074, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
23
+ {"timestamp": 1773788385.7951636, "event": "train_step", "step": 180, "epoch": 2, "metrics": {"train/step_loss": 10.444608576157513, "train/step_real_loss": 10.407943487167358, "train/lr": 9.804501125681243e-06, "train/step_canary_loss": 11.03125, "perf/step_duration_sec": 9.077591832727194, "perf/samples_per_sec": 7.490973515116802, "perf/tokens_per_sec": 5571.85219736901, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 50579.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.780402267783889, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
24
+ {"timestamp": 1773788479.450152, "event": "train_step", "step": 190, "epoch": 2, "metrics": {"train/step_loss": 10.40689816682235, "train/step_real_loss": 10.370327711105347, "train/lr": 4.091647429802869e-06, "train/step_canary_loss": 10.875, "perf/step_duration_sec": 9.14594066562131, "perf/samples_per_sec": 7.5443305967820455, "perf/tokens_per_sec": 5864.897003063607, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 53640.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.860377969759561, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.29015684127808}}
25
+ {"timestamp": 1773788572.2790332, "event": "train_step", "step": 200, "epoch": 2, "metrics": {"train/step_loss": 10.382469764122597, "train/step_real_loss": 10.374773979187012, "train/lr": 8.126960406835249e-07, "train/step_canary_loss": 10.875, "perf/step_duration_sec": 9.668661074247211, "perf/samples_per_sec": 6.722750906341063, "perf/tokens_per_sec": 5157.073933722741, "perf/logical_batch_size": 65.0, "perf/logical_token_count": 49862.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.938565800812133, "system/cuda_memory_allocated_gb": 14.1148681640625, "system/cuda_max_memory_allocated_gb": 73.29015684127808}}
26
+ {"timestamp": 1773788586.7941608, "event": "eval_step", "step": 200, "epoch": 2, "metrics": {"eval/loss": 10.326958019625057, "eval/duration_sec": 14.51232642121613}}
27
+ {"timestamp": 1773788674.2453506, "event": "train_epoch", "step": 208, "epoch": 2, "metrics": {"train/epoch_loss": 10.700755941458253, "train/epoch_real_loss": 10.694620649826392, "train/epoch_canary_loss": 10.469284867902852, "perf/epoch_duration_sec": 993.2564477077685, "perf/epoch_samples_per_sec": 56.02984015702426, "perf/epoch_tokens_per_sec": 44390.36172556743, "perf/epoch_samples": 55652.0, "perf/epoch_tokens": 44091013.0, "system/cuda_epoch_peak_memory_gb": 73.29015684127808, "eval/loss": 10.326239856806668, "eval/duration_sec": 14.522059130016714, "privacy/epsilon": 2.9999680995370417}}
28
+ {"timestamp": 1773788682.610862, "event": "audit_epoch", "step": 208, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.52164, "audit/loss/empirical_epsilon/0.05": 0.019017613492906094, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.019017613492906094, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 85.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 51.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.543272, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0, "perf/audit_duration_sec": 5.763615260832012}}
29
+ {"timestamp": 1773788690.7880108, "event": "audit_final", "step": 208, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.52164, "audit/loss/empirical_epsilon/0.05": 0.019017613492906094, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.019017613492906094, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 85.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 51.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.543272, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0}}
30
+ {"timestamp": 1773788691.3375037, "event": "energy_final", "step": 208, "epoch": null, "metrics": {"energy/codecarbon/duration": 2107.2360566542484, "energy/codecarbon/emissions": 0.10251235144047784, "energy/codecarbon/emissions_rate": 4.864777779250855e-05, "energy/codecarbon/cpu_power": 72.02572178945746, "energy/codecarbon/gpu_power": 4183.395055207508, "energy/codecarbon/ram_power": 54.0, "energy/codecarbon/cpu_energy": 0.040608159184490934, "energy/codecarbon/gpu_energy": 2.447988524778083, "energy/codecarbon/ram_energy": 0.030443774090695673, "energy/codecarbon/energy_consumed": 2.5190404580532704, "energy/codecarbon/water_consumed": 0.0, "energy/codecarbon/cpu_count": 256.0, "energy/codecarbon/gpu_count": 8.0, "energy/codecarbon/longitude": 16.1885, "energy/codecarbon/latitude": 58.594, "energy/codecarbon/ram_total_size": 1511.49019241333, "energy/codecarbon/cpu_utilization_percent": 3.5188995215311123, "energy/codecarbon/gpu_utilization_percent": 80.96321770334929, "energy/codecarbon/ram_utilization_percent": 5.3144497607653856, "energy/codecarbon/ram_used_gb": 80.42555782920437, "energy/codecarbon/pue": 1.0, "energy/codecarbon/wue": 0.0}}
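metrics.jsonl holds one JSON object per logged event (`train_step`, `eval_step`, `train_epoch`, `audit_*`, `energy_final`). A minimal sketch (not part of the upload) that streams the file and tracks the accountant's `privacy/epsilon` across steps; the local path is an assumption.

```python
# Sketch only: stream metrics.jsonl and collect the DP accountant's epsilon per step.
# The file path assumes a local checkout of this repo.
import json

epsilons = []
with open("deepseek-coder-6.7b/dp3/metrics.jsonl") as f:
    for line in f:
        event = json.loads(line)
        if event["event"] == "train_step":
            epsilons.append((event["step"], event["metrics"]["privacy/epsilon"]))

print(epsilons[0])    # (10, 0.713...)
print(epsilons[-1])   # epsilon spent by the last logged training step
```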
deepseek-coder-6.7b/dp3/resolved_config.yaml ADDED
@@ -0,0 +1,101 @@
1
+ model:
2
+ name: deepseek-ai/deepseek-coder-6.7b-instruct
3
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
4
+ max_length: 1024
5
+ dtype: bfloat16
6
+ trust_remote_code: true
7
+ use_fast_tokenizer: true
8
+ cache_dir: null
9
+ local_files_only: false
10
+ low_cpu_mem_usage: true
11
+ tie_word_embeddings: true
12
+ gradient_checkpointing: false
13
+ use_chat_template: false
14
+ dataset:
15
+ name: melihcatal/codedp-cpt
16
+ split: train
17
+ mode: cpt
18
+ text_column: text
19
+ validation_ratio: 0.05
20
+ max_samples: -1
21
+ lora:
22
+ enabled: true
23
+ r: 16
24
+ alpha: 32
25
+ dropout: 0.05
26
+ target_modules:
27
+ - q_proj
28
+ - k_proj
29
+ - v_proj
30
+ - o_proj
31
+ modules_to_save:
32
+ - lm_head
33
+ bias: none
34
+ training:
35
+ seed: 42
36
+ epochs: 2
37
+ warmup_steps: null
38
+ warmup_ratio: 0.05
39
+ mixed_precision: false
40
+ mixed_precision_dtype: bfloat16
41
+ batch_size: 8
42
+ eval_batch_size: 8
43
+ eval_every_steps: 50
44
+ eval_every_epochs: 1
45
+ learning_rate: 0.0002
46
+ optimizer: adamw
47
+ lr_scheduler: cosine
48
+ adam_beta1: 0.9
49
+ adam_beta2: 0.999
50
+ adam_epsilon: 1.0e-08
51
+ sgd_momentum: 0.9
52
+ weight_decay: 0.01
53
+ max_grad_norm: 1.0
54
+ log_every: 10
55
+ gradient_accumulation_steps: 8
56
+ num_workers: 4
57
+ output_dir: runs/cpt/deepseek-coder-6.7b/dp3
58
+ distributed:
59
+ strategy: dpddp
60
+ backend: nccl
61
+ devices: null
62
+ dp:
63
+ module_validator: auto
64
+ target_delta: 1.0e-05
65
+ noise_multiplier: null
66
+ max_grad_norm: 1.0
67
+ grad_sample_mode: hooks
68
+ clipping: flat
69
+ secure_mode: false
70
+ enabled: true
71
+ target_epsilon: 3.0
72
+ audit:
73
+ enabled: true
74
+ run_every_epoch: true
75
+ epoch_device: cuda
76
+ q_canary: auto
77
+ num_canaries: 500
78
+ prefix_length: 49
79
+ num_digits: 12
80
+ batch_size: 32
81
+ delta: 1.0e-05
82
+ p_values:
83
+ - 0.05
84
+ - 0.01
85
+ paper_guess_fraction: 0.2
86
+ paper_guess_steps: 20
87
+ enable_holdout_empirical_epsilon: false
88
+ holdout_seed: 42
89
+ tie_seed: 42
90
+ tracking:
91
+ enabled: true
92
+ tensorboard: true
93
+ wandb: false
94
+ wandb_project: codedp-finetune-h200-audit
95
+ wandb_run_name: deepseek-coder-6.7b-cpt-dp3
96
+ wandb_mode: online
97
+ codecarbon: true
98
+ codecarbon_output_file: codecarbon.csv
99
+ codecarbon_measure_power_secs: 15
100
+ codecarbon_country_iso_code: null
101
+ codecarbon_project_name: codedp-deepseek-coder-6.7b-cpt-dp3
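resolved_config.yaml is the fully materialised run configuration. A minimal sketch (not part of the upload) of loading it and reading back the DP-SGD budget and LoRA settings for the dp3 run; assumes PyYAML is installed and that the path matches a local checkout.

```python
# Sketch only: load the resolved run config and print the DP and LoRA settings.
# The file path assumes a local checkout of this repo.
import yaml

with open("deepseek-coder-6.7b/dp3/resolved_config.yaml") as f:
    cfg = yaml.safe_load(f)

dp = cfg["dp"]
print(dp["enabled"], dp["target_epsilon"], dp["target_delta"])      # True 3.0 1e-05
print(cfg["lora"]["r"], cfg["lora"]["target_modules"])               # 16, q/k/v/o projections
print(cfg["training"]["epochs"], cfg["training"]["learning_rate"])   # 2, 0.0002
```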
deepseek-coder-6.7b/dp3/scalars.csv ADDED
@@ -0,0 +1,386 @@
1
+ timestamp,event,step,epoch,key,value
2
+ 1773786740.0321612,train_step,10,1,train/step_loss,14.807524247602982
3
+ 1773786740.0321612,train_step,10,1,train/step_real_loss,14.926509380340576
4
+ 1773786740.0321612,train_step,10,1,train/lr,0.00018181818181818183
5
+ 1773786740.0321612,train_step,10,1,train/step_canary_loss,11.0
6
+ 1773786740.0321612,train_step,10,1,perf/step_duration_sec,9.13194552809
7
+ 1773786740.0321612,train_step,10,1,perf/samples_per_sec,7.227375568216326
8
+ 1773786740.0321612,train_step,10,1,perf/tokens_per_sec,5865.562802059686
9
+ 1773786740.0321612,train_step,10,1,perf/logical_batch_size,66.0
10
+ 1773786740.0321612,train_step,10,1,perf/logical_token_count,53564.0
11
+ 1773786740.0321612,train_step,10,1,perf/physical_batches,9.0
12
+ 1773786740.0321612,train_step,10,1,privacy/epsilon,0.7131647471248268
13
+ 1773786740.0321612,train_step,10,1,system/cuda_memory_allocated_gb,14.177838802337646
14
+ 1773786740.0321612,train_step,10,1,system/cuda_max_memory_allocated_gb,73.28973627090454
15
+ 1773786833.0391247,train_step,20,1,train/step_loss,14.44931131250718
16
+ 1773786833.0391247,train_step,20,1,train/step_real_loss,14.67465889453888
17
+ 1773786833.0391247,train_step,20,1,train/lr,0.00019897180218885507
18
+ 1773786833.0391247,train_step,20,1,train/step_canary_loss,10.84375
19
+ 1773786833.0391247,train_step,20,1,perf/step_duration_sec,9.05874504102394
20
+ 1773786833.0391247,train_step,20,1,perf/samples_per_sec,7.50655854558787
21
+ 1773786833.0391247,train_step,20,1,perf/tokens_per_sec,6064.305789733377
22
+ 1773786833.0391247,train_step,20,1,perf/logical_batch_size,68.0
23
+ 1773786833.0391247,train_step,20,1,perf/logical_token_count,54935.0
24
+ 1773786833.0391247,train_step,20,1,perf/physical_batches,9.0
25
+ 1773786833.0391247,train_step,20,1,privacy/epsilon,0.9542480237478311
26
+ 1773786833.0391247,train_step,20,1,system/cuda_memory_allocated_gb,14.301010608673096
27
+ 1773786833.0391247,train_step,20,1,system/cuda_max_memory_allocated_gb,73.28973627090454
28
+ 1773786926.3488383,train_step,30,1,train/step_loss,13.955331860166607
29
+ 1773786926.3488383,train_step,30,1,train/step_real_loss,14.391435980796814
30
+ 1773786926.3488383,train_step,30,1,train/lr,0.00019544467510209388
31
+ 1773786926.3488383,train_step,30,1,train/step_canary_loss,0.0
32
+ 1773786926.3488383,train_step,30,1,perf/step_duration_sec,9.485276577994227
33
+ 1773786926.3488383,train_step,30,1,perf/samples_per_sec,6.74729929841788
34
+ 1773786926.3488383,train_step,30,1,perf/tokens_per_sec,5592.9839856307335
35
+ 1773786926.3488383,train_step,30,1,perf/logical_batch_size,64.0
36
+ 1773786926.3488383,train_step,30,1,perf/logical_token_count,53051.0
37
+ 1773786926.3488383,train_step,30,1,perf/physical_batches,10.0
38
+ 1773786926.3488383,train_step,30,1,privacy/epsilon,1.145336977893831
39
+ 1773786926.3488383,train_step,30,1,system/cuda_memory_allocated_gb,14.547617435455322
40
+ 1773786926.3488383,train_step,30,1,system/cuda_max_memory_allocated_gb,73.28973627090454
41
+ 1773787018.6373072,train_step,40,1,train/step_loss,14.15506328235973
42
+ 1773787018.6373072,train_step,40,1,train/step_real_loss,14.120846509933472
43
+ 1773787018.6373072,train_step,40,1,train/lr,0.00018949541262593762
44
+ 1773787018.6373072,train_step,40,1,train/step_canary_loss,15.25
45
+ 1773787018.6373072,train_step,40,1,perf/step_duration_sec,8.954908549785614
46
+ 1773787018.6373072,train_step,40,1,perf/samples_per_sec,7.370259521140512
47
+ 1773787018.6373072,train_step,40,1,perf/tokens_per_sec,5682.023408404114
48
+ 1773787018.6373072,train_step,40,1,perf/logical_batch_size,66.0
49
+ 1773787018.6373072,train_step,40,1,perf/logical_token_count,50882.0
50
+ 1773787018.6373072,train_step,40,1,perf/physical_batches,9.0
51
+ 1773787018.6373072,train_step,40,1,privacy/epsilon,1.311193108872294
52
+ 1773787018.6373072,train_step,40,1,system/cuda_memory_allocated_gb,14.17786169052124
53
+ 1773787018.6373072,train_step,40,1,system/cuda_max_memory_allocated_gb,73.2897834777832
54
+ 1773787111.6987886,train_step,50,1,train/step_loss,13.791429372934195
55
+ 1773787111.6987886,train_step,50,1,train/step_real_loss,13.741295456886292
56
+ 1773787111.6987886,train_step,50,1,train/lr,0.00018127499143005268
57
+ 1773787111.6987886,train_step,50,1,train/step_canary_loss,17.0
58
+ 1773787111.6987886,train_step,50,1,perf/step_duration_sec,9.314833164215088
59
+ 1773787111.6987886,train_step,50,1,perf/samples_per_sec,6.97811746642026
60
+ 1773787111.6987886,train_step,50,1,perf/tokens_per_sec,5992.48521320173
61
+ 1773787111.6987886,train_step,50,1,perf/logical_batch_size,65.0
62
+ 1773787111.6987886,train_step,50,1,perf/logical_token_count,55819.0
63
+ 1773787111.6987886,train_step,50,1,perf/physical_batches,9.0
64
+ 1773787111.6987886,train_step,50,1,privacy/epsilon,1.4583641061524852
65
+ 1773787111.6987886,train_step,50,1,system/cuda_memory_allocated_gb,14.1148681640625
66
+ 1773787111.6987886,train_step,50,1,system/cuda_max_memory_allocated_gb,73.2897834777832
67
+ 1773787126.3622315,eval_step,50,1,eval/loss,13.72979099913077
68
+ 1773787126.3622315,eval_step,50,1,eval/duration_sec,14.660556460730731
69
+ 1773787219.2685702,train_step,60,1,train/step_loss,13.225338772365026
70
+ 1773787219.2685702,train_step,60,1,train/step_real_loss,13.441776752471924
71
+ 1773787219.2685702,train_step,60,1,train/lr,0.0001709920242324663
72
+ 1773787219.2685702,train_step,60,1,train/step_canary_loss,10.916666984558105
73
+ 1773787219.2685702,train_step,60,1,perf/step_duration_sec,9.050328846089542
74
+ 1773787219.2685702,train_step,60,1,perf/samples_per_sec,7.734525583591975
75
+ 1773787219.2685702,train_step,60,1,perf/tokens_per_sec,6221.431392996139
76
+ 1773787219.2685702,train_step,60,1,perf/logical_batch_size,70.0
77
+ 1773787219.2685702,train_step,60,1,perf/logical_token_count,56306.0
78
+ 1773787219.2685702,train_step,60,1,perf/physical_batches,9.0
79
+ 1773787219.2685702,train_step,60,1,privacy/epsilon,1.5929146459722179
80
+ 1773787219.2685702,train_step,60,1,system/cuda_memory_allocated_gb,14.425142288208008
81
+ 1773787219.2685702,train_step,60,1,system/cuda_max_memory_allocated_gb,73.2897834777832
82
+ 1773787312.4062448,train_step,70,1,train/step_loss,12.945215898401598
83
+ 1773787312.4062448,train_step,70,1,train/step_real_loss,13.073627829551697
84
+ 1773787312.4062448,train_step,70,1,train/lr,0.00015890746575622231
85
+ 1773787312.4062448,train_step,70,1,train/step_canary_loss,10.890625
86
+ 1773787312.4062448,train_step,70,1,perf/step_duration_sec,9.778116607107222
87
+ 1773787312.4062448,train_step,70,1,perf/samples_per_sec,6.9543044670355245
88
+ 1773787312.4062448,train_step,70,1,perf/tokens_per_sec,5356.041669817419
89
+ 1773787312.4062448,train_step,70,1,perf/logical_batch_size,68.0
90
+ 1773787312.4062448,train_step,70,1,perf/logical_token_count,52372.0
91
+ 1773787312.4062448,train_step,70,1,perf/physical_batches,9.0
92
+ 1773787312.4062448,train_step,70,1,privacy/epsilon,1.7182200620091768
93
+ 1773787312.4062448,train_step,70,1,system/cuda_memory_allocated_gb,14.30103349685669
94
+ 1773787312.4062448,train_step,70,1,system/cuda_max_memory_allocated_gb,73.2897834777832
95
+ 1773787406.2219386,train_step,80,1,train/step_loss,12.60925786635455
96
+ 1773787406.2219386,train_step,80,1,train/step_real_loss,12.712766170501709
97
+ 1773787406.2219386,train_step,80,1,train/lr,0.00014532799038330385
98
+ 1773787406.2219386,train_step,80,1,train/step_canary_loss,10.953125
99
+ 1773787406.2219386,train_step,80,1,perf/step_duration_sec,9.040645513217896
100
+ 1773787406.2219386,train_step,80,1,perf/samples_per_sec,7.521586804900208
101
+ 1773787406.2219386,train_step,80,1,perf/tokens_per_sec,5688.089409635122
102
+ 1773787406.2219386,train_step,80,1,perf/logical_batch_size,68.0
103
+ 1773787406.2219386,train_step,80,1,perf/logical_token_count,51424.0
104
+ 1773787406.2219386,train_step,80,1,perf/physical_batches,9.0
105
+ 1773787406.2219386,train_step,80,1,privacy/epsilon,1.8360751937320303
106
+ 1773787406.2219386,train_step,80,1,system/cuda_memory_allocated_gb,14.30103349685669
107
+ 1773787406.2219386,train_step,80,1,system/cuda_max_memory_allocated_gb,73.2897834777832
108
+ 1773787500.4468713,train_step,90,1,train/step_loss,12.172602767374977
109
+ 1773787500.4468713,train_step,90,1,train/step_real_loss,12.225615382194519
110
+ 1773787500.4468713,train_step,90,1,train/lr,0.00013059820956358998
111
+ 1773787500.4468713,train_step,90,1,train/step_canary_loss,11.041666984558105
112
+ 1773787500.4468713,train_step,90,1,perf/step_duration_sec,9.056960263755172
113
+ 1773787500.4468713,train_step,90,1,perf/samples_per_sec,7.397625477957065
114
+ 1773787500.4468713,train_step,90,1,perf/tokens_per_sec,6038.781048744861
115
+ 1773787500.4468713,train_step,90,1,perf/logical_batch_size,67.0
116
+ 1773787500.4468713,train_step,90,1,perf/logical_token_count,54693.0
117
+ 1773787500.4468713,train_step,90,1,perf/physical_batches,9.0
118
+ 1773787500.4468713,train_step,90,1,privacy/epsilon,1.9476934830264792
119
+ 1773787500.4468713,train_step,90,1,system/cuda_memory_allocated_gb,14.238978385925293
120
+ 1773787500.4468713,train_step,90,1,system/cuda_max_memory_allocated_gb,73.2897834777832
121
+ 1773787593.4491894,train_step,100,1,train/step_loss,11.868438344606211
122
+ 1773787593.4491894,train_step,100,1,train/step_real_loss,11.924361228942871
123
+ 1773787593.4491894,train_step,100,1,train/lr,0.00011509192648058249
124
+ 1773787593.4491894,train_step,100,1,train/step_canary_loss,11.35714340209961
125
+ 1773787593.4491894,train_step,100,1,perf/step_duration_sec,9.823987863957882
126
+ 1773787593.4491894,train_step,100,1,perf/samples_per_sec,7.227207625172652
127
+ 1773787593.4491894,train_step,100,1,perf/tokens_per_sec,4852.001107908164
128
+ 1773787593.4491894,train_step,100,1,perf/logical_batch_size,71.0
129
+ 1773787593.4491894,train_step,100,1,perf/logical_token_count,47666.0
130
+ 1773787593.4491894,train_step,100,1,perf/physical_batches,9.0
131
+ 1773787593.4491894,train_step,100,1,privacy/epsilon,2.054057754637441
132
+ 1773787593.4491894,train_step,100,1,system/cuda_memory_allocated_gb,14.487197399139404
133
+ 1773787593.4491894,train_step,100,1,system/cuda_max_memory_allocated_gb,73.2897834777832
134
+ 1773787607.9792094,eval_step,100,1,eval/loss,11.81135509501804
135
+ 1773787607.9792094,eval_step,100,1,eval/duration_sec,14.526865308173
136
+ 1773787658.5497303,train_epoch,104,1,train/epoch_loss,13.449225129287315
137
+ 1773787658.5497303,train_epoch,104,1,train/epoch_real_loss,13.57216826545721
138
+ 1773787658.5497303,train_epoch,104,1,train/epoch_canary_loss,10.867904580308076
139
+ 1773787658.5497303,train_epoch,104,1,perf/epoch_duration_sec,997.745806640014
140
+ 1773787658.5497303,train_epoch,104,1,perf/epoch_samples_per_sec,55.996226321558325
141
+ 1773787658.5497303,train_epoch,104,1,perf/epoch_tokens_per_sec,44201.770337193724
142
+ 1773787658.5497303,train_epoch,104,1,perf/epoch_samples,55870.0
143
+ 1773787658.5497303,train_epoch,104,1,perf/epoch_tokens,44102131.0
144
+ 1773787658.5497303,train_epoch,104,1,system/cuda_epoch_peak_memory_gb,73.2897834777832
145
+ 1773787658.5497303,train_epoch,104,1,eval/loss,11.659296675161881
146
+ 1773787658.5497303,train_epoch,104,1,eval/duration_sec,14.518914998974651
147
+ 1773787658.5497303,train_epoch,104,1,privacy/epsilon,2.0952814257505974
148
+ 1773787666.4096277,audit_epoch,104,1,audit/delta,1e-05
149
+ 1773787666.4096277,audit_epoch,104,1,audit/num_canaries,500.0
150
+ 1773787666.4096277,audit_epoch,104,1,audit/num_members,250.0
151
+ 1773787666.4096277,audit_epoch,104,1,audit/paper_guess_fraction,0.2
152
+ 1773787666.4096277,audit_epoch,104,1,audit/paper_guess_steps,20.0
153
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/auc,0.536216
154
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon/0.05,0.0
155
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon/0.01,0.0
156
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/epsilon,0.0
157
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/num_guesses,0.0
158
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/correct_guesses,0.0
159
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
160
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
161
+ 1773787666.4096277,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
162
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/auc,0.542752
163
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon/0.05,0.0
164
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon/0.01,0.0
165
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
166
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
167
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
168
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
169
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
170
+ 1773787666.4096277,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
171
+ 1773787666.4096277,audit_epoch,104,1,perf/audit_duration_sec,5.343048084992915
172
+ 1773787722.7301056,train_step,110,2,train/step_loss,11.533822839910334
173
+ 1773787722.7301056,train_step,110,2,train/step_real_loss,11.543668866157532
174
+ 1773787722.7301056,train_step,110,2,train/lr,9.920264990753837e-05
175
+ 1773787722.7301056,train_step,110,2,train/step_canary_loss,11.21875
176
+ 1773787722.7301056,train_step,110,2,perf/step_duration_sec,9.264618248213083
177
+ 1773787722.7301056,train_step,110,2,perf/samples_per_sec,7.123876908012889
178
+ 1773787722.7301056,train_step,110,2,perf/tokens_per_sec,5974.126350071172
179
+ 1773787722.7301056,train_step,110,2,perf/logical_batch_size,66.0
180
+ 1773787722.7301056,train_step,110,2,perf/logical_token_count,55348.0
181
+ 1773787722.7301056,train_step,110,2,perf/physical_batches,9.0
182
+ 1773787722.7301056,train_step,110,2,privacy/epsilon,2.155895236118489
183
+ 1773787722.7301056,train_step,110,2,system/cuda_memory_allocated_gb,14.17786169052124
184
+ 1773787722.7301056,train_step,110,2,system/cuda_max_memory_allocated_gb,73.28975915908813
185
+ 1773787815.270247,train_step,120,2,train/step_loss,11.204551952988353
186
+ 1773787815.270247,train_step,120,2,train/step_real_loss,11.20437467098236
187
+ 1773787815.270247,train_step,120,2,train/lr,8.333360798744496e-05
188
+ 1773787815.270247,train_step,120,2,train/step_canary_loss,11.208333969116211
189
+ 1773787815.270247,train_step,120,2,perf/step_duration_sec,9.1695022219792
190
+ 1773787815.270247,train_step,120,2,perf/samples_per_sec,7.306830662999536
191
+ 1773787815.270247,train_step,120,2,perf/tokens_per_sec,5355.143475760137
192
+ 1773787815.270247,train_step,120,2,perf/logical_batch_size,67.0
193
+ 1773787815.270247,train_step,120,2,perf/logical_token_count,49104.0
194
+ 1773787815.270247,train_step,120,2,perf/physical_batches,9.0
195
+ 1773787815.270247,train_step,120,2,privacy/epsilon,2.2538737991591966
196
+ 1773787815.270247,train_step,120,2,system/cuda_memory_allocated_gb,14.238978385925293
197
+ 1773787815.270247,train_step,120,2,system/cuda_max_memory_allocated_gb,73.28975915908813
198
+ 1773787907.84406,train_step,130,2,train/step_loss,10.907811731532
199
+ 1773787907.84406,train_step,130,2,train/step_real_loss,10.883031368255615
200
+ 1773787907.84406,train_step,130,2,train/lr,6.788751536089739e-05
201
+ 1773787907.84406,train_step,130,2,train/step_canary_loss,11.225000381469727
202
+ 1773787907.84406,train_step,130,2,perf/step_duration_sec,9.083537110127509
203
+ 1773787907.84406,train_step,130,2,perf/samples_per_sec,7.59615986189673
204
+ 1773787907.84406,train_step,130,2,perf/tokens_per_sec,6119.20217048794
205
+ 1773787907.84406,train_step,130,2,perf/logical_batch_size,69.0
206
+ 1773787907.84406,train_step,130,2,perf/logical_token_count,55584.0
207
+ 1773787907.84406,train_step,130,2,perf/physical_batches,9.0
208
+ 1773787907.84406,train_step,130,2,privacy/epsilon,2.34829265801524
209
+ 1773787907.84406,train_step,130,2,system/cuda_memory_allocated_gb,14.363087177276611
210
+ 1773787907.84406,train_step,130,2,system/cuda_max_memory_allocated_gb,73.2897834777832
211
+ 1773787999.754929,train_step,140,2,train/step_loss,10.733073462301226
212
+ 1773787999.754929,train_step,140,2,train/step_real_loss,10.715678453445435
213
+ 1773787999.754929,train_step,140,2,train/lr,5.325635332531864e-05
214
+ 1773787999.754929,train_step,140,2,train/step_canary_loss,11.104166984558105
215
+ 1773787999.754929,train_step,140,2,perf/step_duration_sec,9.54831570899114
216
+ 1773787999.754929,train_step,140,2,perf/samples_per_sec,7.016944353537626
217
+ 1773787999.754929,train_step,140,2,perf/tokens_per_sec,5555.115856721534
218
+ 1773787999.754929,train_step,140,2,perf/logical_batch_size,67.0
219
+ 1773787999.754929,train_step,140,2,perf/logical_token_count,53042.0
220
+ 1773787999.754929,train_step,140,2,perf/physical_batches,9.0
221
+ 1773787999.754929,train_step,140,2,privacy/epsilon,2.4397183333948855
222
+ 1773787999.754929,train_step,140,2,system/cuda_memory_allocated_gb,14.238978385925293
223
+ 1773787999.754929,train_step,140,2,system/cuda_max_memory_allocated_gb,73.2897834777832
224
+ 1773788092.0733988,train_step,150,2,train/step_loss,10.644248768903207
225
+ 1773788092.0733988,train_step,150,2,train/step_real_loss,10.618408799171448
226
+ 1773788092.0733988,train_step,150,2,train/lr,3.981142237826332e-05
227
+ 1773788092.0733988,train_step,150,2,train/step_canary_loss,10.975000381469727
228
+ 1773788092.0733988,train_step,150,2,perf/step_duration_sec,9.050548555329442
229
+ 1773788092.0733988,train_step,150,2,perf/samples_per_sec,7.623847281540647
230
+ 1773788092.0733988,train_step,150,2,perf/tokens_per_sec,6154.654567009547
231
+ 1773788092.0733988,train_step,150,2,perf/logical_batch_size,69.0
232
+ 1773788092.0733988,train_step,150,2,perf/logical_token_count,55703.0
233
+ 1773788092.0733988,train_step,150,2,perf/physical_batches,9.0
234
+ 1773788092.0733988,train_step,150,2,privacy/epsilon,2.5283910517887054
235
+ 1773788092.0733988,train_step,150,2,system/cuda_memory_allocated_gb,14.363087177276611
236
+ 1773788092.0733988,train_step,150,2,system/cuda_max_memory_allocated_gb,73.2897834777832
237
+ 1773788106.5788815,eval_step,150,2,eval/loss,10.528999336741187
238
+ 1773788106.5788815,eval_step,150,2,eval/duration_sec,14.50253726914525
239
+ 1773788198.8056324,train_step,160,2,train/step_loss,10.509951504794033
240
+ 1773788198.8056324,train_step,160,2,train/step_real_loss,10.495614051818848
241
+ 1773788198.8056324,train_step,160,2,train/lr,2.789391958515183e-05
242
+ 1773788198.8056324,train_step,160,2,train/step_canary_loss,10.96875
243
+ 1773788198.8056324,train_step,160,2,perf/step_duration_sec,8.9364311741665
244
+ 1773788198.8056324,train_step,160,2,perf/samples_per_sec,7.38549860830275
245
+ 1773788198.8056324,train_step,160,2,perf/tokens_per_sec,6128.061519492174
246
+ 1773788198.8056324,train_step,160,2,perf/logical_batch_size,66.0
247
+ 1773788198.8056324,train_step,160,2,perf/logical_token_count,54763.0
248
+ 1773788198.8056324,train_step,160,2,perf/physical_batches,9.0
249
+ 1773788198.8056324,train_step,160,2,privacy/epsilon,2.6145698431381854
250
+ 1773788198.8056324,train_step,160,2,system/cuda_memory_allocated_gb,14.17786169052124
251
+ 1773788198.8056324,train_step,160,2,system/cuda_max_memory_allocated_gb,73.2897834777832
252
+ 1773788292.7653294,train_step,170,2,train/step_loss,10.498227048276076
253
+ 1773788292.7653294,train_step,170,2,train/step_real_loss,10.467870473861694
254
+ 1773788292.7653294,train_step,170,2,train/lr,1.7806279893114875e-05
255
+ 1773788292.7653294,train_step,170,2,train/step_canary_loss,11.145833969116211
256
+ 1773788292.7653294,train_step,170,2,perf/step_duration_sec,9.595276300795376
257
+ 1773788292.7653294,train_step,170,2,perf/samples_per_sec,6.982602470180687
258
+ 1773788292.7653294,train_step,170,2,perf/tokens_per_sec,5185.6766225559795
259
+ 1773788292.7653294,train_step,170,2,perf/logical_batch_size,67.0
260
+ 1773788292.7653294,train_step,170,2,perf/logical_token_count,49758.0
261
+ 1773788292.7653294,train_step,170,2,perf/physical_batches,9.0
262
+ 1773788292.7653294,train_step,170,2,privacy/epsilon,2.6985357679843074
263
+ 1773788292.7653294,train_step,170,2,system/cuda_memory_allocated_gb,14.238978385925293
264
+ 1773788292.7653294,train_step,170,2,system/cuda_max_memory_allocated_gb,73.2897834777832
265
+ 1773788385.7951636,train_step,180,2,train/step_loss,10.444608576157513
266
+ 1773788385.7951636,train_step,180,2,train/step_real_loss,10.407943487167358
267
+ 1773788385.7951636,train_step,180,2,train/lr,9.804501125681243e-06
268
+ 1773788385.7951636,train_step,180,2,train/step_canary_loss,11.03125
269
+ 1773788385.7951636,train_step,180,2,perf/step_duration_sec,9.077591832727194
270
+ 1773788385.7951636,train_step,180,2,perf/samples_per_sec,7.490973515116802
271
+ 1773788385.7951636,train_step,180,2,perf/tokens_per_sec,5571.85219736901
272
+ 1773788385.7951636,train_step,180,2,perf/logical_batch_size,68.0
273
+ 1773788385.7951636,train_step,180,2,perf/logical_token_count,50579.0
274
+ 1773788385.7951636,train_step,180,2,perf/physical_batches,9.0
275
+ 1773788385.7951636,train_step,180,2,privacy/epsilon,2.780402267783889
276
+ 1773788385.7951636,train_step,180,2,system/cuda_memory_allocated_gb,14.30103349685669
277
+ 1773788385.7951636,train_step,180,2,system/cuda_max_memory_allocated_gb,73.2897834777832
278
+ 1773788479.450152,train_step,190,2,train/step_loss,10.40689816682235
279
+ 1773788479.450152,train_step,190,2,train/step_real_loss,10.370327711105347
280
+ 1773788479.450152,train_step,190,2,train/lr,4.091647429802869e-06
281
+ 1773788479.450152,train_step,190,2,train/step_canary_loss,10.875
282
+ 1773788479.450152,train_step,190,2,perf/step_duration_sec,9.14594066562131
283
+ 1773788479.450152,train_step,190,2,perf/samples_per_sec,7.5443305967820455
284
+ 1773788479.450152,train_step,190,2,perf/tokens_per_sec,5864.897003063607
285
+ 1773788479.450152,train_step,190,2,perf/logical_batch_size,69.0
286
+ 1773788479.450152,train_step,190,2,perf/logical_token_count,53640.0
287
+ 1773788479.450152,train_step,190,2,perf/physical_batches,9.0
288
+ 1773788479.450152,train_step,190,2,privacy/epsilon,2.860377969759561
289
+ 1773788479.450152,train_step,190,2,system/cuda_memory_allocated_gb,14.363087177276611
290
+ 1773788479.450152,train_step,190,2,system/cuda_max_memory_allocated_gb,73.29015684127808
291
+ 1773788572.2790332,train_step,200,2,train/step_loss,10.382469764122597
292
+ 1773788572.2790332,train_step,200,2,train/step_real_loss,10.374773979187012
293
+ 1773788572.2790332,train_step,200,2,train/lr,8.126960406835249e-07
294
+ 1773788572.2790332,train_step,200,2,train/step_canary_loss,10.875
295
+ 1773788572.2790332,train_step,200,2,perf/step_duration_sec,9.668661074247211
296
+ 1773788572.2790332,train_step,200,2,perf/samples_per_sec,6.722750906341063
297
+ 1773788572.2790332,train_step,200,2,perf/tokens_per_sec,5157.073933722741
298
+ 1773788572.2790332,train_step,200,2,perf/logical_batch_size,65.0
299
+ 1773788572.2790332,train_step,200,2,perf/logical_token_count,49862.0
300
+ 1773788572.2790332,train_step,200,2,perf/physical_batches,9.0
301
+ 1773788572.2790332,train_step,200,2,privacy/epsilon,2.938565800812133
302
+ 1773788572.2790332,train_step,200,2,system/cuda_memory_allocated_gb,14.1148681640625
303
+ 1773788572.2790332,train_step,200,2,system/cuda_max_memory_allocated_gb,73.29015684127808
304
+ 1773788586.7941608,eval_step,200,2,eval/loss,10.326958019625057
305
+ 1773788586.7941608,eval_step,200,2,eval/duration_sec,14.51232642121613
306
+ 1773788674.2453506,train_epoch,208,2,train/epoch_loss,10.700755941458253
307
+ 1773788674.2453506,train_epoch,208,2,train/epoch_real_loss,10.694620649826392
308
+ 1773788674.2453506,train_epoch,208,2,train/epoch_canary_loss,10.469284867902852
309
+ 1773788674.2453506,train_epoch,208,2,perf/epoch_duration_sec,993.2564477077685
310
+ 1773788674.2453506,train_epoch,208,2,perf/epoch_samples_per_sec,56.02984015702426
311
+ 1773788674.2453506,train_epoch,208,2,perf/epoch_tokens_per_sec,44390.36172556743
312
+ 1773788674.2453506,train_epoch,208,2,perf/epoch_samples,55652.0
313
+ 1773788674.2453506,train_epoch,208,2,perf/epoch_tokens,44091013.0
314
+ 1773788674.2453506,train_epoch,208,2,system/cuda_epoch_peak_memory_gb,73.29015684127808
315
+ 1773788674.2453506,train_epoch,208,2,eval/loss,10.326239856806668
316
+ 1773788674.2453506,train_epoch,208,2,eval/duration_sec,14.522059130016714
317
+ 1773788674.2453506,train_epoch,208,2,privacy/epsilon,2.9999680995370417
318
+ 1773788682.610862,audit_epoch,208,2,audit/delta,1e-05
319
+ 1773788682.610862,audit_epoch,208,2,audit/num_canaries,500.0
320
+ 1773788682.610862,audit_epoch,208,2,audit/num_members,250.0
321
+ 1773788682.610862,audit_epoch,208,2,audit/paper_guess_fraction,0.2
322
+ 1773788682.610862,audit_epoch,208,2,audit/paper_guess_steps,20.0
323
+ 1773788682.610862,audit_epoch,208,2,audit/loss/auc,0.52164
324
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon/0.05,0.019017613492906094
325
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon/0.01,0.0
326
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/epsilon,0.019017613492906094
327
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,85.0
328
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,51.0
329
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
330
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
331
+ 1773788682.610862,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
332
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/auc,0.543272
333
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon/0.05,0.0
334
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon/0.01,0.0
335
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
336
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
337
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
338
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
339
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
340
+ 1773788682.610862,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
341
+ 1773788682.610862,audit_epoch,208,2,perf/audit_duration_sec,5.763615260832012
342
+ 1773788690.7880108,audit_final,208,2,audit/delta,1e-05
343
+ 1773788690.7880108,audit_final,208,2,audit/num_canaries,500.0
344
+ 1773788690.7880108,audit_final,208,2,audit/num_members,250.0
345
+ 1773788690.7880108,audit_final,208,2,audit/paper_guess_fraction,0.2
346
+ 1773788690.7880108,audit_final,208,2,audit/paper_guess_steps,20.0
347
+ 1773788690.7880108,audit_final,208,2,audit/loss/auc,0.52164
348
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon/0.05,0.019017613492906094
349
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon/0.01,0.0
350
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/epsilon,0.019017613492906094
351
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,85.0
352
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,51.0
353
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
354
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
355
+ 1773788690.7880108,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
356
+ 1773788690.7880108,audit_final,208,2,audit/embedding/auc,0.543272
357
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon/0.05,0.0
358
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon/0.01,0.0
359
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
360
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
361
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
362
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
363
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
364
+ 1773788690.7880108,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
365
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/duration,2107.2360566542484
366
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/emissions,0.10251235144047784
367
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/emissions_rate,4.864777779250855e-05
368
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/cpu_power,72.02572178945746
369
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/gpu_power,4183.395055207508
370
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/ram_power,54.0
371
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/cpu_energy,0.040608159184490934
372
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/gpu_energy,2.447988524778083
373
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/ram_energy,0.030443774090695673
374
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/energy_consumed,2.5190404580532704
375
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/water_consumed,0.0
376
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/cpu_count,256.0
377
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/gpu_count,8.0
378
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/longitude,16.1885
379
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/latitude,58.594
380
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/ram_total_size,1511.49019241333
381
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/cpu_utilization_percent,3.5188995215311123
382
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/gpu_utilization_percent,80.96321770334929
383
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/ram_utilization_percent,5.3144497607653856
384
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/ram_used_gb,80.42555782920437
385
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/pue,1.0
386
+ 1773788691.3375037,energy_final,208,,energy/codecarbon/wue,0.0
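The scalars file above is a long-format metrics log: each row pairs a timestamp, an event type (`train_step`, `eval_step`, `train_epoch`, `audit_epoch`, `energy_final`), a step, an epoch, a metric name, and a value. A minimal sketch for slicing it back into per-step series, assuming that column order (the file's actual header, if any, may name the columns differently):

```python
import csv

# Minimal sketch for reading scalars.csv; the column order
# (timestamp, event, step, epoch, metric, value) is an assumption
# based on the rows shown above.
def metric_series(path, event="train_step", metric="train/step_loss"):
    series = {}
    with open(path, newline="") as fh:
        for row in csv.reader(fh):
            # Skip any header row and rows for other events/metrics.
            if len(row) != 6 or row[1] != event or row[4] != metric:
                continue
            series[int(row[2])] = float(row[5])
    return dict(sorted(series.items()))

losses = metric_series("deepseek-coder-6.7b/dp3/scalars.csv")
epsilons = metric_series("deepseek-coder-6.7b/dp3/scalars.csv",
                         metric="privacy/epsilon")
```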
deepseek-coder-6.7b/dp3/summary.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "audit/delta": 1e-05,
3
+ "audit/embedding/auc": 0.543272,
4
+ "audit/embedding/empirical_epsilon/0.01": 0.0,
5
+ "audit/embedding/empirical_epsilon/0.05": 0.0,
6
+ "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0,
7
+ "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0,
8
+ "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0,
9
+ "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0,
10
+ "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0,
11
+ "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0,
12
+ "audit/loss/auc": 0.52164,
13
+ "audit/loss/empirical_epsilon/0.01": 0.0,
14
+ "audit/loss/empirical_epsilon/0.05": 0.019017613492906094,
15
+ "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0,
16
+ "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0,
17
+ "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0,
18
+ "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 51.0,
19
+ "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.019017613492906094,
20
+ "audit/loss/empirical_epsilon_details/0.05/num_guesses": 85.0,
21
+ "audit/num_canaries": 500.0,
22
+ "audit/num_members": 250.0,
23
+ "audit/paper_guess_fraction": 0.2,
24
+ "audit/paper_guess_steps": 20.0,
25
+ "energy/codecarbon/cpu_count": 256.0,
26
+ "energy/codecarbon/cpu_energy": 0.040608159184490934,
27
+ "energy/codecarbon/cpu_power": 72.02572178945746,
28
+ "energy/codecarbon/cpu_utilization_percent": 3.5188995215311123,
29
+ "energy/codecarbon/duration": 2107.2360566542484,
30
+ "energy/codecarbon/emissions": 0.10251235144047784,
31
+ "energy/codecarbon/emissions_rate": 4.864777779250855e-05,
32
+ "energy/codecarbon/energy_consumed": 2.5190404580532704,
33
+ "energy/codecarbon/gpu_count": 8.0,
34
+ "energy/codecarbon/gpu_energy": 2.447988524778083,
35
+ "energy/codecarbon/gpu_power": 4183.395055207508,
36
+ "energy/codecarbon/gpu_utilization_percent": 80.96321770334929,
37
+ "energy/codecarbon/latitude": 58.594,
38
+ "energy/codecarbon/longitude": 16.1885,
39
+ "energy/codecarbon/pue": 1.0,
40
+ "energy/codecarbon/ram_energy": 0.030443774090695673,
41
+ "energy/codecarbon/ram_power": 54.0,
42
+ "energy/codecarbon/ram_total_size": 1511.49019241333,
43
+ "energy/codecarbon/ram_used_gb": 80.42555782920437,
44
+ "energy/codecarbon/ram_utilization_percent": 5.3144497607653856,
45
+ "energy/codecarbon/water_consumed": 0.0,
46
+ "energy/codecarbon/wue": 0.0,
47
+ "eval/duration_sec": 14.522059130016714,
48
+ "eval/loss": 10.326239856806668,
49
+ "perf/audit_duration_sec": 5.763615260832012,
50
+ "perf/epoch_duration_sec": 993.2564477077685,
51
+ "perf/epoch_samples": 55652.0,
52
+ "perf/epoch_samples_per_sec": 56.02984015702426,
53
+ "perf/epoch_tokens": 44091013.0,
54
+ "perf/epoch_tokens_per_sec": 44390.36172556743,
55
+ "perf/logical_batch_size": 65.0,
56
+ "perf/logical_token_count": 49862.0,
57
+ "perf/physical_batches": 9.0,
58
+ "perf/samples_per_sec": 6.722750906341063,
59
+ "perf/step_duration_sec": 9.668661074247211,
60
+ "perf/tokens_per_sec": 5157.073933722741,
61
+ "privacy/epsilon": 2.9999680995370417,
62
+ "system/cuda_epoch_peak_memory_gb": 73.29015684127808,
63
+ "system/cuda_max_memory_allocated_gb": 73.29015684127808,
64
+ "system/cuda_memory_allocated_gb": 14.1148681640625,
65
+ "train/epoch_canary_loss": 10.469284867902852,
66
+ "train/epoch_loss": 10.700755941458253,
67
+ "train/epoch_real_loss": 10.694620649826392,
68
+ "train/lr": 8.126960406835249e-07,
69
+ "train/step_canary_loss": 10.875,
70
+ "train/step_loss": 10.382469764122597,
71
+ "train/step_real_loss": 10.374773979187012
72
+ }
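summary.json flattens the final logged value of each metric into one object per run, which makes side-by-side comparison of runs straightforward. A small sketch, assuming the directory layout used in this upload:

```python
import glob
import json

# Sketch: compare final eval loss and privacy budget across runs.
# Keys read here (eval/loss, privacy/epsilon) appear in the summary files;
# .get() is used because non-DP runs may omit privacy/epsilon.
for path in sorted(glob.glob("deepseek-coder-6.7b/*/summary.json")):
    with open(path) as fh:
        summary = json.load(fh)
    print(path,
          "eval/loss=", summary.get("eval/loss"),
          "privacy/epsilon=", summary.get("privacy/epsilon"))
```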
deepseek-coder-6.7b/dp3/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,516 @@
1
+ {
2
+ "add_prefix_space": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|begin▁of▁sentence|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|EOT|>",
7
+ "extra_special_tokens": [
8
+ "865331112869",
9
+ "569765693871",
10
+ "485177821815",
11
+ "135441121756",
12
+ "367459894796",
13
+ "877482678543",
14
+ "457919547633",
15
+ "765474393376",
16
+ "114848338811",
17
+ "746285987371",
18
+ "649291669397",
19
+ "927914615679",
20
+ "445925149649",
21
+ "691587454538",
22
+ "143777992227",
23
+ "997981281989",
24
+ "425949483533",
25
+ "982993456429",
26
+ "718726519731",
27
+ "172599315861",
28
+ "643489267333",
29
+ "282322838685",
30
+ "781653545886",
31
+ "796415361892",
32
+ "841991688488",
33
+ "211411365397",
34
+ "698218415444",
35
+ "355977139358",
36
+ "682564697312",
37
+ "383837596997",
38
+ "689362171782",
39
+ "749966767285",
40
+ "753159165157",
41
+ "795693824762",
42
+ "669689115557",
43
+ "327491773134",
44
+ "983569279932",
45
+ "612128769512",
46
+ "374327157578",
47
+ "311632789559",
48
+ "523918658846",
49
+ "765981581453",
50
+ "794825141891",
51
+ "873898736873",
52
+ "447445629421",
53
+ "473822473819",
54
+ "181439694557",
55
+ "592538279337",
56
+ "668134915514",
57
+ "643692393748",
58
+ "696651276628",
59
+ "853859348234",
60
+ "778466723723",
61
+ "929826356991",
62
+ "272362973463",
63
+ "694235616268",
64
+ "281673864127",
65
+ "479676316326",
66
+ "646979124677",
67
+ "922327493433",
68
+ "883685933161",
69
+ "264259917554",
70
+ "836746273134",
71
+ "658481324922",
72
+ "481884157827",
73
+ "587787496812",
74
+ "579184949249",
75
+ "912193598348",
76
+ "529679678956",
77
+ "795838284624",
78
+ "159337222655",
79
+ "173781362446",
80
+ "773687856563",
81
+ "535787224917",
82
+ "351885857332",
83
+ "578827344666",
84
+ "198462689911",
85
+ "722618266242",
86
+ "952872416512",
87
+ "517778845323",
88
+ "749665846687",
89
+ "661436365453",
90
+ "259666844669",
91
+ "242851284913",
92
+ "514532995959",
93
+ "161588262349",
94
+ "742765629356",
95
+ "225164373623",
96
+ "676539973863",
97
+ "826214551218",
98
+ "182345464792",
99
+ "232776999554",
100
+ "337326533813",
101
+ "676676697292",
102
+ "929185622831",
103
+ "545512344383",
104
+ "499444466686",
105
+ "314697386682",
106
+ "517379856925",
107
+ "379557332953",
108
+ "614797267726",
109
+ "429781429464",
110
+ "922466849763",
111
+ "721737645236",
112
+ "479227349997",
113
+ "136931728327",
114
+ "259533577263",
115
+ "488538864842",
116
+ "937495658852",
117
+ "489991411364",
118
+ "499148455254",
119
+ "441373944925",
120
+ "899151413682",
121
+ "467893531755",
122
+ "527117488925",
123
+ "928335588653",
124
+ "374439448821",
125
+ "879425227932",
126
+ "867678158885",
127
+ "399749397872",
128
+ "129693547287",
129
+ "689285841825",
130
+ "771619544974",
131
+ "724883568652",
132
+ "516968424863",
133
+ "733737988257",
134
+ "852347289392",
135
+ "296953381169",
136
+ "377273562477",
137
+ "262296912232",
138
+ "547149832394",
139
+ "298464134954",
140
+ "216667245274",
141
+ "843998562287",
142
+ "572154333646",
143
+ "124589118494",
144
+ "841824384614",
145
+ "232896526252",
146
+ "295448593321",
147
+ "123741461297",
148
+ "653573457168",
149
+ "196735786156",
150
+ "377338713663",
151
+ "964342468552",
152
+ "586855179568",
153
+ "484773717614",
154
+ "894885246797",
155
+ "677896358599",
156
+ "848845611563",
157
+ "851852651677",
158
+ "398549545767",
159
+ "454244839926",
160
+ "799364566435",
161
+ "967114116556",
162
+ "817378986438",
163
+ "233795848681",
164
+ "824387273757",
165
+ "916198946615",
166
+ "563117729724",
167
+ "951794811935",
168
+ "374598961236",
169
+ "922867396683",
170
+ "765737843639",
171
+ "175469284871",
172
+ "231853711778",
173
+ "662426712668",
174
+ "711412347158",
175
+ "753466987363",
176
+ "513361312532",
177
+ "712992815957",
178
+ "971621888444",
179
+ "829235161526",
180
+ "585544633356",
181
+ "582471228164",
182
+ "678666359123",
183
+ "557533689478",
184
+ "632962475133",
185
+ "484489193824",
186
+ "489562189822",
187
+ "589547936288",
188
+ "363214487524",
189
+ "244885399387",
190
+ "431751228368",
191
+ "433581868192",
192
+ "486391569221",
193
+ "185438575221",
194
+ "126574388585",
195
+ "741757479784",
196
+ "529854679937",
197
+ "996116119839",
198
+ "616248973917",
199
+ "763531783491",
200
+ "955456118295",
201
+ "364196983365",
202
+ "195792996468",
203
+ "151859598873",
204
+ "399223169721",
205
+ "938488813964",
206
+ "961981959227",
207
+ "183368827562",
208
+ "533417736566",
209
+ "786391632558",
210
+ "665661658354",
211
+ "693281533643",
212
+ "475794684356",
213
+ "652154162978",
214
+ "753233719644",
215
+ "668514843129",
216
+ "819162623892",
217
+ "941169431859",
218
+ "877385381798",
219
+ "752644929761",
220
+ "881136466196",
221
+ "275597777299",
222
+ "731681792655",
223
+ "961133895172",
224
+ "864718285734",
225
+ "963852916563",
226
+ "319584985416",
227
+ "563365646341",
228
+ "811371928234",
229
+ "837131396371",
230
+ "267514771964",
231
+ "944513428457",
232
+ "117298239631",
233
+ "158142752582",
234
+ "252867443568",
235
+ "839269684865",
236
+ "612788593128",
237
+ "145669731981",
238
+ "121557291859",
239
+ "245416776926",
240
+ "799417897197",
241
+ "997958836435",
242
+ "892336777248",
243
+ "158929292238",
244
+ "581976444672",
245
+ "897784492783",
246
+ "492373714791",
247
+ "512659818733",
248
+ "881112998642",
249
+ "619454958782",
250
+ "431149748713",
251
+ "624221476921",
252
+ "125866399464",
253
+ "339882449689",
254
+ "186198784585",
255
+ "943193294691",
256
+ "955668961269",
257
+ "232787996724",
258
+ "215671314196",
259
+ "286173241916",
260
+ "745977673725",
261
+ "556976448182",
262
+ "599961512792",
263
+ "766294538337",
264
+ "934912591213",
265
+ "295118729589",
266
+ "529455466433",
267
+ "196119929397",
268
+ "379571934299",
269
+ "251789649997",
270
+ "564544131355",
271
+ "244371196654",
272
+ "384598329253",
273
+ "887753195844",
274
+ "364947325679",
275
+ "655517954651",
276
+ "673948786567",
277
+ "857231548835",
278
+ "816115936673",
279
+ "644234165531",
280
+ "182782912224",
281
+ "234316622259",
282
+ "421369185549",
283
+ "434632855397",
284
+ "921889371893",
285
+ "415956914763",
286
+ "598916996413",
287
+ "773671349113",
288
+ "952465217972",
289
+ "117657531962",
290
+ "729825168745",
291
+ "691315125346",
292
+ "768461952319",
293
+ "664847713559",
294
+ "953267689786",
295
+ "886464195129",
296
+ "824488329416",
297
+ "837873762491",
298
+ "532833541879",
299
+ "669183782449",
300
+ "941976537588",
301
+ "739394546916",
302
+ "267954879268",
303
+ "637551427887",
304
+ "217756494954",
305
+ "524444658383",
306
+ "117783274348",
307
+ "138218735276",
308
+ "814611949491",
309
+ "711641973413",
310
+ "499156317423",
311
+ "515856611931",
312
+ "454164859837",
313
+ "345271433112",
314
+ "462294118988",
315
+ "511785788222",
316
+ "497294727353",
317
+ "866519986723",
318
+ "334513529294",
319
+ "549946382131",
320
+ "284445431422",
321
+ "396521188476",
322
+ "421435255895",
323
+ "133373659361",
324
+ "322683334381",
325
+ "228358422847",
326
+ "291762694874",
327
+ "143182978129",
328
+ "511923256573",
329
+ "327158398268",
330
+ "879764613759",
331
+ "564395222747",
332
+ "451161679736",
333
+ "538631466654",
334
+ "221762325616",
335
+ "218391991184",
336
+ "322589379462",
337
+ "876537814263",
338
+ "152676556624",
339
+ "332522971941",
340
+ "884354318946",
341
+ "513349618943",
342
+ "116639746413",
343
+ "635185846287",
344
+ "993832498489",
345
+ "813981174797",
346
+ "438745114173",
347
+ "983493951323",
348
+ "724492262421",
349
+ "622553389126",
350
+ "889965243135",
351
+ "364492359246",
352
+ "154962668224",
353
+ "179564995814",
354
+ "418412875665",
355
+ "718951851413",
356
+ "699446724178",
357
+ "624266421831",
358
+ "815458725125",
359
+ "455423278865",
360
+ "393741199486",
361
+ "328552864359",
362
+ "211662639865",
363
+ "218784516525",
364
+ "762486672996",
365
+ "142799718159",
366
+ "858146415154",
367
+ "767858144912",
368
+ "571317457151",
369
+ "635127952696",
370
+ "116427191984",
371
+ "268921994538",
372
+ "523937669294",
373
+ "165429152138",
374
+ "739246183345",
375
+ "591464355756",
376
+ "212985874612",
377
+ "191887635211",
378
+ "967214577653",
379
+ "119342152414",
380
+ "946444632795",
381
+ "618423867817",
382
+ "228565148417",
383
+ "729116422489",
384
+ "527874729936",
385
+ "739784153482",
386
+ "387763951128",
387
+ "331369926711",
388
+ "562716493614",
389
+ "739667844957",
390
+ "562389434565",
391
+ "256497188281",
392
+ "859927364588",
393
+ "417668946583",
394
+ "357621613582",
395
+ "438435178228",
396
+ "485692541169",
397
+ "825815739116",
398
+ "342221452223",
399
+ "697747991249",
400
+ "716763689965",
401
+ "141499982867",
402
+ "818479319499",
403
+ "336813343298",
404
+ "594688742928",
405
+ "472129283475",
406
+ "514354144759",
407
+ "349249721685",
408
+ "546276298359",
409
+ "353755529131",
410
+ "315534574435",
411
+ "523723475786",
412
+ "215826764872",
413
+ "367968398551",
414
+ "569853653352",
415
+ "389715484387",
416
+ "293847485454",
417
+ "714738141818",
418
+ "178478368922",
419
+ "581493616981",
420
+ "589439538674",
421
+ "846657726193",
422
+ "722339992679",
423
+ "138154781148",
424
+ "757785319772",
425
+ "492516914298",
426
+ "919181521716",
427
+ "985781138935",
428
+ "476969195485",
429
+ "313145133463",
430
+ "758963111966",
431
+ "147541537162",
432
+ "557163366873",
433
+ "144373897488",
434
+ "522515164754",
435
+ "724964923582",
436
+ "284776712475",
437
+ "375429755114",
438
+ "181233596124",
439
+ "948585673431",
440
+ "243165586174",
441
+ "396847976144",
442
+ "997724962668",
443
+ "558837194455",
444
+ "163165456396",
445
+ "378749551722",
446
+ "161238482259",
447
+ "754978243758",
448
+ "195388849133",
449
+ "229775525672",
450
+ "262437452884",
451
+ "441377892146",
452
+ "451885565366",
453
+ "981277526855",
454
+ "762495822823",
455
+ "368763327262",
456
+ "757422791351",
457
+ "636324136426",
458
+ "214193645583",
459
+ "412843856172",
460
+ "179386156569",
461
+ "756916173536",
462
+ "892697125149",
463
+ "625334487352",
464
+ "941861857715",
465
+ "887417525236",
466
+ "649516938598",
467
+ "717628619782",
468
+ "438124184139",
469
+ "547563892268",
470
+ "856317483891",
471
+ "313313831273",
472
+ "371496153876",
473
+ "587541149322",
474
+ "265847332563",
475
+ "449549215429",
476
+ "163497196769",
477
+ "861342291298",
478
+ "268433315926",
479
+ "774679513717",
480
+ "851254219729",
481
+ "583527834464",
482
+ "488496781997",
483
+ "556814553861",
484
+ "482829231639",
485
+ "618878266619",
486
+ "147444452794",
487
+ "949235426629",
488
+ "357299947518",
489
+ "175528632226",
490
+ "645527857972",
491
+ "186872457894",
492
+ "552738847828",
493
+ "626748382482",
494
+ "921894985642",
495
+ "943878645871",
496
+ "859289776479",
497
+ "614583493135",
498
+ "933775286797",
499
+ "332234613346",
500
+ "325196781219",
501
+ "142526557681",
502
+ "356722692178",
503
+ "449318681694",
504
+ "687284547244",
505
+ "947262995132",
506
+ "893974619684",
507
+ "797238311233"
508
+ ],
509
+ "is_local": false,
510
+ "model_max_length": 16384,
511
+ "pad_token": "<|end▁of▁sentence|>",
512
+ "sp_model_kwargs": {},
513
+ "tokenizer_class": "LlamaTokenizer",
514
+ "unk_token": null,
515
+ "use_default_system_prompt": false
516
+ }
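The tokenizer config registers several hundred 12-digit strings under `extra_special_tokens`; these appear to correspond to the audit canary strings referenced by canary_meta.json and the audit results, though that mapping is an assumption. A minimal inspection sketch using only fields present in the file above:

```python
import json

# Sketch: inspect the extra special tokens registered for this run's tokenizer.
with open("deepseek-coder-6.7b/dp3/tokenizer/tokenizer_config.json") as fh:
    cfg = json.load(fh)

extra = cfg.get("extra_special_tokens", [])
print(f"{len(extra)} extra special tokens, e.g. {extra[:3]}")
print("model_max_length:", cfg["model_max_length"])
print("tokenizer_class:", cfg["tokenizer_class"])
```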
deepseek-coder-6.7b/dp3/train.log ADDED
@@ -0,0 +1,24 @@
1
+ 2026-03-17 22:32:20,031 [INFO] new_opacus_codex.train_steps: epoch=1 step=10 loss=14.7503
2
+ 2026-03-17 22:33:53,038 [INFO] new_opacus_codex.train_steps: epoch=1 step=20 loss=14.6211
3
+ 2026-03-17 22:35:26,347 [INFO] new_opacus_codex.train_steps: epoch=1 step=30 loss=14.3372
4
+ 2026-03-17 22:36:58,636 [INFO] new_opacus_codex.train_steps: epoch=1 step=40 loss=14.1539
5
+ 2026-03-17 22:38:31,697 [INFO] new_opacus_codex.train_steps: epoch=1 step=50 loss=13.8100
6
+ 2026-03-17 22:38:46,362 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=50 eval_loss=13.7298 duration_sec=14.66
7
+ 2026-03-17 22:40:19,267 [INFO] new_opacus_codex.train_steps: epoch=1 step=60 loss=13.4669
8
+ 2026-03-17 22:41:52,405 [INFO] new_opacus_codex.train_steps: epoch=1 step=70 loss=13.1042
9
+ 2026-03-17 22:43:26,221 [INFO] new_opacus_codex.train_steps: epoch=1 step=80 loss=12.7799
10
+ 2026-03-17 22:45:00,446 [INFO] new_opacus_codex.train_steps: epoch=1 step=90 loss=12.3359
11
+ 2026-03-17 22:46:33,448 [INFO] new_opacus_codex.train_steps: epoch=1 step=100 loss=12.0393
12
+ 2026-03-17 22:46:47,978 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=100 eval_loss=11.8114 duration_sec=14.53
13
+ 2026-03-17 22:48:42,729 [INFO] new_opacus_codex.train_steps: epoch=2 step=110 loss=11.6187
14
+ 2026-03-17 22:50:15,269 [INFO] new_opacus_codex.train_steps: epoch=2 step=120 loss=11.3041
15
+ 2026-03-17 22:51:47,843 [INFO] new_opacus_codex.train_steps: epoch=2 step=130 loss=10.9837
16
+ 2026-03-17 22:53:19,754 [INFO] new_opacus_codex.train_steps: epoch=2 step=140 loss=10.7969
17
+ 2026-03-17 22:54:52,072 [INFO] new_opacus_codex.train_steps: epoch=2 step=150 loss=10.6606
18
+ 2026-03-17 22:55:06,578 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=150 eval_loss=10.5290 duration_sec=14.50
19
+ 2026-03-17 22:56:38,804 [INFO] new_opacus_codex.train_steps: epoch=2 step=160 loss=10.5269
20
+ 2026-03-17 22:58:12,764 [INFO] new_opacus_codex.train_steps: epoch=2 step=170 loss=10.5162
21
+ 2026-03-17 22:59:45,794 [INFO] new_opacus_codex.train_steps: epoch=2 step=180 loss=10.4517
22
+ 2026-03-17 23:01:19,449 [INFO] new_opacus_codex.train_steps: epoch=2 step=190 loss=10.4031
23
+ 2026-03-17 23:02:52,278 [INFO] new_opacus_codex.train_steps: epoch=2 step=200 loss=10.3874
24
+ 2026-03-17 23:03:06,793 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=200 eval_loss=10.3270 duration_sec=14.51
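The log lines follow a fixed `epoch=... step=... loss=...` pattern, so step-level losses can be recovered with a short regex. This is a reading aid only, not part of the training code, and the pattern is matched to the format shown above (eval lines use `eval_loss=` and are therefore skipped):

```python
import re

# Sketch: extract (step, loss) pairs from train.log.
pattern = re.compile(r"epoch=(\d+) step=(\d+) loss=([\d.]+)")
points = []
with open("deepseek-coder-6.7b/dp3/train.log") as fh:
    for line in fh:
        match = pattern.search(line)
        if match:
            points.append((int(match.group(2)), float(match.group(3))))

print(points[-3:])  # e.g. [(180, 10.4517), (190, 10.4031), (200, 10.3874)]
```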
deepseek-coder-6.7b/dp8/audit_results.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "delta": 1e-05,
3
+ "num_canaries": 500,
4
+ "num_members": 250,
5
+ "paper_guess_fraction": 0.2,
6
+ "paper_guess_steps": 20,
7
+ "loss": {
8
+ "auc": 0.533352,
9
+ "empirical_epsilon": {
10
+ "0.05": 0.0,
11
+ "0.01": 0.0
12
+ },
13
+ "empirical_epsilon_details": {
14
+ "0.05": {
15
+ "epsilon": 0.0,
16
+ "num_guesses": 0,
17
+ "correct_guesses": 0,
18
+ "candidate_num_guesses": [
19
+ 5,
20
+ 10,
21
+ 15,
22
+ 20,
23
+ 25,
24
+ 30,
25
+ 35,
26
+ 40,
27
+ 45,
28
+ 50,
29
+ 55,
30
+ 60,
31
+ 65,
32
+ 70,
33
+ 75,
34
+ 80,
35
+ 85,
36
+ 90,
37
+ 95,
38
+ 100
39
+ ],
40
+ "direction": "lower"
41
+ },
42
+ "0.01": {
43
+ "epsilon": 0.0,
44
+ "num_guesses": 0,
45
+ "correct_guesses": 0,
46
+ "candidate_num_guesses": [
47
+ 5,
48
+ 10,
49
+ 15,
50
+ 20,
51
+ 25,
52
+ 30,
53
+ 35,
54
+ 40,
55
+ 45,
56
+ 50,
57
+ 55,
58
+ 60,
59
+ 65,
60
+ 70,
61
+ 75,
62
+ 80,
63
+ 85,
64
+ 90,
65
+ 95,
66
+ 100
67
+ ],
68
+ "direction": "lower"
69
+ }
70
+ }
71
+ },
72
+ "embedding": {
73
+ "auc": 0.544592,
74
+ "empirical_epsilon": {
75
+ "0.05": 0.0,
76
+ "0.01": 0.0
77
+ },
78
+ "empirical_epsilon_details": {
79
+ "0.05": {
80
+ "epsilon": 0.0,
81
+ "num_guesses": 0,
82
+ "correct_guesses": 0,
83
+ "candidate_num_guesses": [
84
+ 5,
85
+ 10,
86
+ 15,
87
+ 20,
88
+ 25,
89
+ 30,
90
+ 35,
91
+ 40,
92
+ 45,
93
+ 50,
94
+ 55,
95
+ 60,
96
+ 65,
97
+ 70,
98
+ 75,
99
+ 80,
100
+ 85,
101
+ 90,
102
+ 95,
103
+ 100
104
+ ],
105
+ "direction": "lower"
106
+ },
107
+ "0.01": {
108
+ "epsilon": 0.0,
109
+ "num_guesses": 0,
110
+ "correct_guesses": 0,
111
+ "candidate_num_guesses": [
112
+ 5,
113
+ 10,
114
+ 15,
115
+ 20,
116
+ 25,
117
+ 30,
118
+ 35,
119
+ 40,
120
+ 45,
121
+ 50,
122
+ 55,
123
+ 60,
124
+ 65,
125
+ 70,
126
+ 75,
127
+ 80,
128
+ 85,
129
+ 90,
130
+ 95,
131
+ 100
132
+ ],
133
+ "direction": "lower"
134
+ }
135
+ }
136
+ }
137
+ }
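audit_results.json stores, per attack signal ("loss" and "embedding"), the audit's AUC plus an empirical epsilon keyed by two thresholds, "0.05" and "0.01", along with the guess counts behind each estimate. A minimal sketch that reads those fields back (it does not rerun the audit; key names mirror the JSON above):

```python
import json

# Sketch: summarise the stored audit results for one run.
with open("deepseek-coder-6.7b/dp8/audit_results.json") as fh:
    audit = json.load(fh)

for signal in ("loss", "embedding"):
    block = audit[signal]
    eps = block["empirical_epsilon"]
    print(f"{signal}: auc={block['auc']}, "
          f"empirical_epsilon[0.05]={eps['0.05']}, "
          f"empirical_epsilon[0.01]={eps['0.01']}")
```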
deepseek-coder-6.7b/dp8/canary_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
deepseek-coder-6.7b/dp8/codecarbon.csv ADDED
@@ -0,0 +1,2 @@
1
+ timestamp,project_name,run_id,experiment_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,gpu_energy,ram_energy,energy_consumed,water_consumed,country_name,country_iso_code,region,cloud_provider,cloud_region,os,python_version,codecarbon_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,cpu_utilization_percent,gpu_utilization_percent,ram_utilization_percent,ram_used_gb,on_cloud,pue,wue
2
+ 2026-03-17T22:24:53,codedp-deepseek-coder-6.7b-cpt-dp8,468761a7-8e0a-40d1-b862-00771db2f603,5b0fa12a-3dd7-45bb-9766-cc326314d9f1,2107.286476707086,0.1023234868151428,4.85569892590194e-05,72.02878771140225,4175.089097615359,54.0,0.04057807438109415,2.443401743608831,0.030419661433661826,2.5143994794235853,0.0,Sweden,SWE,östergötland county,,,Linux-6.8.0-94-generic-x86_64-with-glibc2.39,3.11.0,3.2.3,256,AMD EPYC 9554 64-Core Processor,8,8 x NVIDIA H200,16.1885,58.594,1511.49019241333,machine,3.622827125119416,80.66583094555874,5.312368672397158,80.40963100407845,N,1.0,0.0
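codecarbon.csv holds a single measurement row; CodeCarbon reports energy in kWh and emissions in kg CO2-eq. A minimal sketch for pulling the headline numbers, using the column names from the header row above:

```python
import csv

# Sketch: read the single CodeCarbon measurement row for this run.
with open("deepseek-coder-6.7b/dp8/codecarbon.csv", newline="") as fh:
    row = next(csv.DictReader(fh))

print("duration (s):", row["duration"])
print("energy consumed (kWh):", row["energy_consumed"])
print("emissions (kg CO2-eq):", row["emissions"])
print("gpu_model:", row["gpu_model"])
```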
deepseek-coder-6.7b/dp8/epochs/epoch_001/adapter/README.md ADDED
@@ -0,0 +1,207 @@
1
+ ---
2
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
deepseek-coder-6.7b/dp8/epochs/epoch_001/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": true,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": [
25
+ "lm_head",
26
+ "embed_tokens"
27
+ ],
28
+ "peft_type": "LORA",
29
+ "peft_version": "0.18.1",
30
+ "qalora_group_size": 16,
31
+ "r": 16,
32
+ "rank_pattern": {},
33
+ "revision": null,
34
+ "target_modules": [
35
+ "o_proj",
36
+ "v_proj",
37
+ "k_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
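This adapter_config.json describes a LoRA adapter (r=16, alpha=32) over the attention projections (q/k/v/o), with lm_head and embed_tokens saved in full via modules_to_save. A generic PEFT loading sketch follows; it is not a documented recipe from this repo, and the use of the run's own tokenizer plus the embedding resize are assumptions based on modules_to_save and the extra special tokens registered in the tokenizer config:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# base_model_name_or_path comes from adapter_config.json above;
# adapter_dir / tokenizer_dir follow this upload's directory layout.
base_id = "deepseek-ai/deepseek-coder-6.7b-instruct"
adapter_dir = "deepseek-coder-6.7b/dp8/epochs/epoch_001/adapter"
tokenizer_dir = "deepseek-coder-6.7b/dp8/tokenizer"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.bfloat16, device_map="auto")
# Assumption: the vocab was extended with extra special tokens, so the base
# embeddings are resized before the adapter's saved embed_tokens/lm_head load.
base.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base, adapter_dir)
model.eval()
```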
deepseek-coder-6.7b/dp8/epochs/epoch_001/audit_results.json ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "delta": 1e-05,
3
+ "num_canaries": 500,
4
+ "num_members": 250,
5
+ "paper_guess_fraction": 0.2,
6
+ "paper_guess_steps": 20,
7
+ "loss": {
8
+ "auc": 0.5184,
9
+ "empirical_epsilon": {
10
+ "0.05": 0.0,
11
+ "0.01": 0.0
12
+ },
13
+ "empirical_epsilon_details": {
14
+ "0.05": {
15
+ "epsilon": 0.0,
16
+ "num_guesses": 0,
17
+ "correct_guesses": 0,
18
+ "candidate_num_guesses": [
19
+ 5,
20
+ 10,
21
+ 15,
22
+ 20,
23
+ 25,
24
+ 30,
25
+ 35,
26
+ 40,
27
+ 45,
28
+ 50,
29
+ 55,
30
+ 60,
31
+ 65,
32
+ 70,
33
+ 75,
34
+ 80,
35
+ 85,
36
+ 90,
37
+ 95,
38
+ 100
39
+ ],
40
+ "direction": "lower"
41
+ },
42
+ "0.01": {
43
+ "epsilon": 0.0,
44
+ "num_guesses": 0,
45
+ "correct_guesses": 0,
46
+ "candidate_num_guesses": [
47
+ 5,
48
+ 10,
49
+ 15,
50
+ 20,
51
+ 25,
52
+ 30,
53
+ 35,
54
+ 40,
55
+ 45,
56
+ 50,
57
+ 55,
58
+ 60,
59
+ 65,
60
+ 70,
61
+ 75,
62
+ 80,
63
+ 85,
64
+ 90,
65
+ 95,
66
+ 100
67
+ ],
68
+ "direction": "lower"
69
+ }
70
+ }
71
+ },
72
+ "embedding": {
73
+ "auc": 0.53976,
74
+ "empirical_epsilon": {
75
+ "0.05": 0.0,
76
+ "0.01": 0.0
77
+ },
78
+ "empirical_epsilon_details": {
79
+ "0.05": {
80
+ "epsilon": 0.0,
81
+ "num_guesses": 0,
82
+ "correct_guesses": 0,
83
+ "candidate_num_guesses": [
84
+ 5,
85
+ 10,
86
+ 15,
87
+ 20,
88
+ 25,
89
+ 30,
90
+ 35,
91
+ 40,
92
+ 45,
93
+ 50,
94
+ 55,
95
+ 60,
96
+ 65,
97
+ 70,
98
+ 75,
99
+ 80,
100
+ 85,
101
+ 90,
102
+ 95,
103
+ 100
104
+ ],
105
+ "direction": "lower"
106
+ },
107
+ "0.01": {
108
+ "epsilon": 0.0,
109
+ "num_guesses": 0,
110
+ "correct_guesses": 0,
111
+ "candidate_num_guesses": [
112
+ 5,
113
+ 10,
114
+ 15,
115
+ 20,
116
+ 25,
117
+ 30,
118
+ 35,
119
+ 40,
120
+ 45,
121
+ 50,
122
+ 55,
123
+ 60,
124
+ 65,
125
+ 70,
126
+ 75,
127
+ 80,
128
+ 85,
129
+ 90,
130
+ 95,
131
+ 100
132
+ ],
133
+ "direction": "lower"
134
+ }
135
+ }
136
+ }
137
+ }
deepseek-coder-6.7b/dp8/epochs/epoch_002/adapter/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: deepseek-ai/deepseek-coder-6.7b-instruct
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:deepseek-ai/deepseek-coder-6.7b-instruct
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
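+ A minimal sketch (assuming the adapter directory from this run is available locally; the path below is a placeholder, not a published repo id) for loading the LoRA adapter on top of `deepseek-ai/deepseek-coder-6.7b-instruct` with `transformers` and `peft`:
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+
+ base_id = "deepseek-ai/deepseek-coder-6.7b-instruct"
+ tokenizer = AutoTokenizer.from_pretrained(base_id, trust_remote_code=True)
+ base_model = AutoModelForCausalLM.from_pretrained(
+     base_id, torch_dtype=torch.bfloat16, trust_remote_code=True
+ )
+
+ # Attach the LoRA adapter saved in this directory (placeholder path).
+ model = PeftModel.from_pretrained(base_model, "path/to/adapter")
+ model.eval()
+
+ prompt = "def fibonacci(n):"
+ inputs = tokenizer(prompt, return_tensors="pt")
+ with torch.no_grad():
+     out = model.generate(**inputs, max_new_tokens=64)
+ print(tokenizer.decode(out[0], skip_special_tokens=True))
+ ```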
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.18.1
deepseek-coder-6.7b/dp8/epochs/epoch_002/adapter/adapter_config.json ADDED
@@ -0,0 +1,46 @@
+ {
+ "alora_invocation_tokens": null,
+ "alpha_pattern": {},
+ "arrow_config": null,
+ "auto_mapping": null,
+ "base_model_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
+ "bias": "none",
+ "corda_config": null,
+ "ensure_weight_tying": true,
+ "eva_config": null,
+ "exclude_modules": null,
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_bias": false,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": [
+ "lm_head",
+ "embed_tokens"
+ ],
+ "peft_type": "LORA",
+ "peft_version": "0.18.1",
+ "qalora_group_size": 16,
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "o_proj",
+ "v_proj",
+ "k_proj",
+ "q_proj"
+ ],
+ "target_parameters": null,
+ "task_type": "CAUSAL_LM",
+ "trainable_token_indices": null,
+ "use_dora": false,
+ "use_qalora": false,
+ "use_rslora": false
+ }
deepseek-coder-6.7b/dp8/epochs/epoch_002/audit_results.json ADDED
@@ -0,0 +1,137 @@
+ {
+ "delta": 1e-05,
+ "num_canaries": 500,
+ "num_members": 250,
+ "paper_guess_fraction": 0.2,
+ "paper_guess_steps": 20,
+ "loss": {
+ "auc": 0.533352,
+ "empirical_epsilon": {
+ "0.05": 0.0,
+ "0.01": 0.0
+ },
+ "empirical_epsilon_details": {
+ "0.05": {
+ "epsilon": 0.0,
+ "num_guesses": 0,
+ "correct_guesses": 0,
+ "candidate_num_guesses": [
+ 5,
+ 10,
+ 15,
+ 20,
+ 25,
+ 30,
+ 35,
+ 40,
+ 45,
+ 50,
+ 55,
+ 60,
+ 65,
+ 70,
+ 75,
+ 80,
+ 85,
+ 90,
+ 95,
+ 100
+ ],
+ "direction": "lower"
+ },
+ "0.01": {
+ "epsilon": 0.0,
+ "num_guesses": 0,
+ "correct_guesses": 0,
+ "candidate_num_guesses": [
+ 5,
+ 10,
+ 15,
+ 20,
+ 25,
+ 30,
+ 35,
+ 40,
+ 45,
+ 50,
+ 55,
+ 60,
+ 65,
+ 70,
+ 75,
+ 80,
+ 85,
+ 90,
+ 95,
+ 100
+ ],
+ "direction": "lower"
+ }
+ }
+ },
+ "embedding": {
+ "auc": 0.544592,
+ "empirical_epsilon": {
+ "0.05": 0.0,
+ "0.01": 0.0
+ },
+ "empirical_epsilon_details": {
+ "0.05": {
+ "epsilon": 0.0,
+ "num_guesses": 0,
+ "correct_guesses": 0,
+ "candidate_num_guesses": [
+ 5,
+ 10,
+ 15,
+ 20,
+ 25,
+ 30,
+ 35,
+ 40,
+ 45,
+ 50,
+ 55,
+ 60,
+ 65,
+ 70,
+ 75,
+ 80,
+ 85,
+ 90,
+ 95,
+ 100
+ ],
+ "direction": "lower"
+ },
+ "0.01": {
+ "epsilon": 0.0,
+ "num_guesses": 0,
+ "correct_guesses": 0,
+ "candidate_num_guesses": [
+ 5,
+ 10,
+ 15,
+ 20,
+ 25,
+ 30,
+ 35,
+ 40,
+ 45,
+ 50,
+ 55,
+ 60,
+ 65,
+ 70,
+ 75,
+ 80,
+ 85,
+ 90,
+ 95,
+ 100
+ ],
+ "direction": "lower"
+ }
+ }
+ }
+ }
deepseek-coder-6.7b/dp8/metrics.jsonl ADDED
@@ -0,0 +1,30 @@
1
+ {"timestamp": 1773784344.144435, "event": "train_step", "step": 10, "epoch": 1, "metrics": {"train/step_loss": 14.774163795239998, "train/step_real_loss": 14.892106413841248, "train/lr": 0.00018181818181818183, "train/step_canary_loss": 11.0, "perf/step_duration_sec": 9.184927014168352, "perf/samples_per_sec": 7.185685841399793, "perf/tokens_per_sec": 5831.728430435432, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 53564.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.347414329116491, "system/cuda_memory_allocated_gb": 14.177838802337646, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
2
+ {"timestamp": 1773784437.5361402, "event": "train_step", "step": 20, "epoch": 1, "metrics": {"train/step_loss": 14.265574847950655, "train/step_real_loss": 14.47943890094757, "train/lr": 0.00019897180218885507, "train/step_canary_loss": 10.84375, "perf/step_duration_sec": 9.109625170007348, "perf/samples_per_sec": 7.46463204917411, "perf/tokens_per_sec": 6030.434729726173, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 54935.0, "perf/physical_batches": 9.0, "privacy/epsilon": 2.9091472084164676, "system/cuda_memory_allocated_gb": 14.301010608673096, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
3
+ {"timestamp": 1773784530.4163961, "event": "train_step", "step": 30, "epoch": 1, "metrics": {"train/step_loss": 13.518484404592803, "train/step_real_loss": 13.940937042236328, "train/lr": 0.00019544467510209388, "train/step_canary_loss": 0.0, "perf/step_duration_sec": 9.479914616327733, "perf/samples_per_sec": 6.751115657705354, "perf/tokens_per_sec": 5596.14744932698, "perf/logical_batch_size": 64.0, "perf/logical_token_count": 53051.0, "perf/physical_batches": 10.0, "privacy/epsilon": 3.362310066272621, "system/cuda_memory_allocated_gb": 14.547617435455322, "system/cuda_max_memory_allocated_gb": 73.28973627090454}}
4
+ {"timestamp": 1773784622.9840994, "event": "train_step", "step": 40, "epoch": 1, "metrics": {"train/step_loss": 13.427360419071082, "train/step_real_loss": 13.384074807167053, "train/lr": 0.00018949541262593762, "train/step_canary_loss": 14.8125, "perf/step_duration_sec": 9.461411211173981, "perf/samples_per_sec": 6.975703573908047, "perf/tokens_per_sec": 5377.844685569534, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 50882.0, "perf/physical_batches": 9.0, "privacy/epsilon": 3.7564957912533217, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
5
+ {"timestamp": 1773784716.151862, "event": "train_step", "step": 50, "epoch": 1, "metrics": {"train/step_loss": 12.773963928222656, "train/step_real_loss": 12.735275864601135, "train/lr": 0.00018127499143005268, "train/step_canary_loss": 15.25, "perf/step_duration_sec": 9.381330342032015, "perf/samples_per_sec": 6.928654852795735, "perf/tokens_per_sec": 5950.009003510849, "perf/logical_batch_size": 65.0, "perf/logical_token_count": 55819.0, "perf/physical_batches": 9.0, "privacy/epsilon": 4.113499817895439, "system/cuda_memory_allocated_gb": 14.1148681640625, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
6
+ {"timestamp": 1773784730.6923964, "event": "eval_step", "step": 50, "epoch": 1, "metrics": {"eval/loss": 12.672339336438613, "eval/duration_sec": 14.538249942008406}}
7
+ {"timestamp": 1773784823.310222, "event": "train_step", "step": 60, "epoch": 1, "metrics": {"train/step_loss": 11.997090639386858, "train/step_real_loss": 12.100333452224731, "train/lr": 0.0001709920242324663, "train/step_canary_loss": 10.895833969116211, "perf/step_duration_sec": 9.064209748990834, "perf/samples_per_sec": 7.722680954927535, "perf/tokens_per_sec": 6211.903912116425, "perf/logical_batch_size": 70.0, "perf/logical_token_count": 56306.0, "perf/physical_batches": 9.0, "privacy/epsilon": 4.44246859480526, "system/cuda_memory_allocated_gb": 14.425142288208008, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
8
+ {"timestamp": 1773784915.821449, "event": "train_step", "step": 70, "epoch": 1, "metrics": {"train/step_loss": 11.382361131555895, "train/step_real_loss": 11.408211827278137, "train/lr": 0.00015890746575622231, "train/step_canary_loss": 10.96875, "perf/step_duration_sec": 9.208213192876428, "perf/samples_per_sec": 7.384711732413572, "perf/tokens_per_sec": 5687.531218381818, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 52372.0, "perf/physical_batches": 9.0, "privacy/epsilon": 4.751375515961613, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
9
+ {"timestamp": 1773785010.2598872, "event": "train_step", "step": 80, "epoch": 1, "metrics": {"train/step_loss": 10.664647719439339, "train/step_real_loss": 10.645641326904297, "train/lr": 0.00014532799038330385, "train/step_canary_loss": 10.96875, "perf/step_duration_sec": 9.077163262758404, "perf/samples_per_sec": 7.491327194586108, "perf/tokens_per_sec": 5665.206024329354, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 51424.0, "perf/physical_batches": 9.0, "privacy/epsilon": 5.041931555886533, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
10
+ {"timestamp": 1773785104.2668839, "event": "train_step", "step": 90, "epoch": 1, "metrics": {"train/step_loss": 10.064055400108224, "train/step_real_loss": 10.030925154685974, "train/lr": 0.00013059820956358998, "train/step_canary_loss": 10.770833969116211, "perf/step_duration_sec": 9.01999548682943, "perf/samples_per_sec": 7.427941632324565, "perf/tokens_per_sec": 6063.528532786976, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 54693.0, "perf/physical_batches": 9.0, "privacy/epsilon": 5.3191578546045415, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
11
+ {"timestamp": 1773785196.997746, "event": "train_step", "step": 100, "epoch": 1, "metrics": {"train/step_loss": 9.603301625856211, "train/step_real_loss": 9.47983455657959, "train/lr": 0.00011509192648058249, "train/step_canary_loss": 10.73214340209961, "perf/step_duration_sec": 9.839758691843599, "perf/samples_per_sec": 7.215624104568085, "perf/tokens_per_sec": 4844.224486878061, "perf/logical_batch_size": 71.0, "perf/logical_token_count": 47666.0, "perf/physical_batches": 9.0, "privacy/epsilon": 5.584672549637297, "system/cuda_memory_allocated_gb": 14.487197399139404, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
12
+ {"timestamp": 1773785211.3789763, "event": "eval_step", "step": 100, "epoch": 1, "metrics": {"eval/loss": 9.217194291678343, "eval/duration_sec": 14.378479943610728}}
13
+ {"timestamp": 1773785262.4304821, "event": "train_epoch", "step": 104, "epoch": 1, "metrics": {"train/epoch_loss": 12.341963201017878, "train/epoch_real_loss": 12.414478586751462, "train/epoch_canary_loss": 10.728888959316615, "perf/epoch_duration_sec": 998.3496765969321, "perf/epoch_samples_per_sec": 55.96235598577414, "perf/epoch_tokens_per_sec": 44175.03409259433, "perf/epoch_samples": 55870.0, "perf/epoch_tokens": 44102131.0, "system/cuda_epoch_peak_memory_gb": 73.2897834777832, "eval/loss": 8.922484595667232, "eval/duration_sec": 14.388765855692327, "privacy/epsilon": 5.688895252675942}}
14
+ {"timestamp": 1773785270.4813497, "event": "audit_epoch", "step": 104, "epoch": 1, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.5184, "audit/loss/empirical_epsilon/0.05": 0.0, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.53976, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0, "perf/audit_duration_sec": 5.523622438777238}}
15
+ {"timestamp": 1773785326.5610142, "event": "train_step", "step": 110, "epoch": 2, "metrics": {"train/step_loss": 8.857723698471531, "train/step_real_loss": 8.790777564048767, "train/lr": 9.920264990753837e-05, "train/step_canary_loss": 11.0, "perf/step_duration_sec": 9.301705885212868, "perf/samples_per_sec": 7.095472681513365, "perf/tokens_per_sec": 5950.3063935818445, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 55348.0, "perf/physical_batches": 9.0, "privacy/epsilon": 5.840075127464654, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.28975915908813}}
16
+ {"timestamp": 1773785418.9144747, "event": "train_step", "step": 120, "epoch": 2, "metrics": {"train/step_loss": 8.510492253659377, "train/step_real_loss": 8.394773125648499, "train/lr": 8.333360798744496e-05, "train/step_canary_loss": 10.979166984558105, "perf/step_duration_sec": 9.138508805073798, "perf/samples_per_sec": 7.331611910556007, "perf/tokens_per_sec": 5373.305541133465, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 49104.0, "perf/physical_batches": 9.0, "privacy/epsilon": 6.086971689386329, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.28975915908813}}
17
+ {"timestamp": 1773785511.284102, "event": "train_step", "step": 130, "epoch": 2, "metrics": {"train/step_loss": 8.294911356939785, "train/step_real_loss": 8.076740324497223, "train/lr": 6.788751536089739e-05, "train/step_canary_loss": 11.08750057220459, "perf/step_duration_sec": 9.095777447801083, "perf/samples_per_sec": 7.5859375843327, "perf/tokens_per_sec": 6110.967459239838, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 55584.0, "perf/physical_batches": 9.0, "privacy/epsilon": 6.327001481453001, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
18
+ {"timestamp": 1773785603.6243958, "event": "train_step", "step": 140, "epoch": 2, "metrics": {"train/step_loss": 8.052544166792684, "train/step_real_loss": 7.91242903470993, "train/lr": 5.325635332531864e-05, "train/step_canary_loss": 11.041666984558105, "perf/step_duration_sec": 9.209645525086671, "perf/samples_per_sec": 7.274981411336075, "perf/tokens_per_sec": 5759.396477911762, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 53042.0, "perf/physical_batches": 9.0, "privacy/epsilon": 6.557571491578655, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
19
+ {"timestamp": 1773785696.4521918, "event": "train_step", "step": 150, "epoch": 2, "metrics": {"train/step_loss": 8.013610591059146, "train/step_real_loss": 7.757837951183319, "train/lr": 3.981142237826332e-05, "train/step_canary_loss": 11.287500381469727, "perf/step_duration_sec": 9.033272176980972, "perf/samples_per_sec": 7.638428096501862, "perf/tokens_per_sec": 6166.425511006423, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 55703.0, "perf/physical_batches": 9.0, "privacy/epsilon": 6.783113430476456, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
20
+ {"timestamp": 1773785710.7493467, "event": "eval_step", "step": 150, "epoch": 2, "metrics": {"eval/loss": 7.667719158259305, "eval/duration_sec": 14.295333066955209}}
21
+ {"timestamp": 1773785803.0583243, "event": "train_step", "step": 160, "epoch": 2, "metrics": {"train/step_loss": 7.74585654518821, "train/step_real_loss": 7.636352062225342, "train/lr": 2.789391958515183e-05, "train/step_canary_loss": 11.25, "perf/step_duration_sec": 8.934944829903543, "perf/samples_per_sec": 7.386727199379082, "perf/tokens_per_sec": 6129.080933630252, "perf/logical_batch_size": 66.0, "perf/logical_token_count": 54763.0, "perf/physical_batches": 9.0, "privacy/epsilon": 7.0043611451158165, "system/cuda_memory_allocated_gb": 14.17786169052124, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
22
+ {"timestamp": 1773785896.8129106, "event": "train_step", "step": 170, "epoch": 2, "metrics": {"train/step_loss": 7.764082396208351, "train/step_real_loss": 7.601656556129456, "train/lr": 1.7806279893114875e-05, "train/step_canary_loss": 11.229166984558105, "perf/step_duration_sec": 9.581119411159307, "perf/samples_per_sec": 6.992919837943347, "perf/tokens_per_sec": 5193.338885020673, "perf/logical_batch_size": 67.0, "perf/logical_token_count": 49758.0, "perf/physical_batches": 9.0, "privacy/epsilon": 7.2178133573416465, "system/cuda_memory_allocated_gb": 14.238978385925293, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
23
+ {"timestamp": 1773785989.7971883, "event": "train_step", "step": 180, "epoch": 2, "metrics": {"train/step_loss": 7.800547375398524, "train/step_real_loss": 7.582026898860931, "train/lr": 9.804501125681243e-06, "train/step_canary_loss": 11.296875, "perf/step_duration_sec": 9.495344399940223, "perf/samples_per_sec": 7.161404277282253, "perf/tokens_per_sec": 5326.71569030381, "perf/logical_batch_size": 68.0, "perf/logical_token_count": 50579.0, "perf/physical_batches": 9.0, "privacy/epsilon": 7.430040084733278, "system/cuda_memory_allocated_gb": 14.30103349685669, "system/cuda_max_memory_allocated_gb": 73.2897834777832}}
24
+ {"timestamp": 1773786082.775681, "event": "train_step", "step": 190, "epoch": 2, "metrics": {"train/step_loss": 7.886315041694088, "train/step_real_loss": 7.6274334192276, "train/lr": 4.091647429802869e-06, "train/step_canary_loss": 11.199999809265137, "perf/step_duration_sec": 9.168436062987894, "perf/samples_per_sec": 7.525820055455963, "perf/tokens_per_sec": 5850.50706919794, "perf/logical_batch_size": 69.0, "perf/logical_token_count": 53640.0, "perf/physical_batches": 9.0, "privacy/epsilon": 7.633877151395027, "system/cuda_memory_allocated_gb": 14.363087177276611, "system/cuda_max_memory_allocated_gb": 73.29015684127808}}
25
+ {"timestamp": 1773786175.474541, "event": "train_step", "step": 200, "epoch": 2, "metrics": {"train/step_loss": 7.722104996901292, "train/step_real_loss": 7.670887887477875, "train/lr": 8.126960406835249e-07, "train/step_canary_loss": 11.0, "perf/step_duration_sec": 9.673653797712177, "perf/samples_per_sec": 6.719281189840856, "perf/tokens_per_sec": 5154.412287505304, "perf/logical_batch_size": 65.0, "perf/logical_token_count": 49862.0, "perf/physical_batches": 9.0, "privacy/epsilon": 7.837365803470095, "system/cuda_memory_allocated_gb": 14.1148681640625, "system/cuda_max_memory_allocated_gb": 73.29015684127808}}
26
+ {"timestamp": 1773786189.8028603, "event": "eval_step", "step": 200, "epoch": 2, "metrics": {"eval/loss": 7.523608349940993, "eval/duration_sec": 14.325929747894406}}
27
+ {"timestamp": 1773786276.8565547, "event": "train_epoch", "step": 208, "epoch": 2, "metrics": {"train/epoch_loss": 8.035395088671393, "train/epoch_real_loss": 7.897920552785785, "train/epoch_canary_loss": 10.571939474888266, "perf/epoch_duration_sec": 991.9095743251964, "perf/epoch_samples_per_sec": 56.10592078200321, "perf/epoch_tokens_per_sec": 44450.63757953486, "perf/epoch_samples": 55652.0, "perf/epoch_tokens": 44091013.0, "system/cuda_epoch_peak_memory_gb": 73.29015684127808, "eval/loss": 7.523212566971779, "eval/duration_sec": 14.324650165159255, "privacy/epsilon": 7.995186040237391}}
28
+ {"timestamp": 1773786284.9576344, "event": "audit_epoch", "step": 208, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.533352, "audit/loss/empirical_epsilon/0.05": 0.0, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.544592, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0, "perf/audit_duration_sec": 5.583974160254002}}
29
+ {"timestamp": 1773786292.845513, "event": "audit_final", "step": 208, "epoch": 2, "metrics": {"audit/delta": 1e-05, "audit/num_canaries": 500.0, "audit/num_members": 250.0, "audit/paper_guess_fraction": 0.2, "audit/paper_guess_steps": 20.0, "audit/loss/auc": 0.533352, "audit/loss/empirical_epsilon/0.05": 0.0, "audit/loss/empirical_epsilon/0.01": 0.0, "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0, "audit/embedding/auc": 0.544592, "audit/embedding/empirical_epsilon/0.05": 0.0, "audit/embedding/empirical_epsilon/0.01": 0.0, "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0, "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0, "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0}}
30
+ {"timestamp": 1773786293.3942149, "event": "energy_final", "step": 208, "epoch": null, "metrics": {"energy/codecarbon/duration": 2107.286476707086, "energy/codecarbon/emissions": 0.1023234868151428, "energy/codecarbon/emissions_rate": 4.85569892590194e-05, "energy/codecarbon/cpu_power": 72.02878771140225, "energy/codecarbon/gpu_power": 4175.089097615359, "energy/codecarbon/ram_power": 54.0, "energy/codecarbon/cpu_energy": 0.04057807438109415, "energy/codecarbon/gpu_energy": 2.443401743608831, "energy/codecarbon/ram_energy": 0.030419661433661826, "energy/codecarbon/energy_consumed": 2.5143994794235853, "energy/codecarbon/water_consumed": 0.0, "energy/codecarbon/cpu_count": 256.0, "energy/codecarbon/gpu_count": 8.0, "energy/codecarbon/longitude": 16.1885, "energy/codecarbon/latitude": 58.594, "energy/codecarbon/ram_total_size": 1511.49019241333, "energy/codecarbon/cpu_utilization_percent": 3.622827125119416, "energy/codecarbon/gpu_utilization_percent": 80.66583094555874, "energy/codecarbon/ram_utilization_percent": 5.312368672397158, "energy/codecarbon/ram_used_gb": 80.40963100407845, "energy/codecarbon/pue": 1.0, "energy/codecarbon/wue": 0.0}}
deepseek-coder-6.7b/dp8/resolved_config.yaml ADDED
@@ -0,0 +1,101 @@
+ model:
+ name: deepseek-ai/deepseek-coder-6.7b-instruct
+ tokenizer_name: deepseek-ai/deepseek-coder-6.7b-instruct
+ max_length: 1024
+ dtype: bfloat16
+ trust_remote_code: true
+ use_fast_tokenizer: true
+ cache_dir: null
+ local_files_only: false
+ low_cpu_mem_usage: true
+ tie_word_embeddings: true
+ gradient_checkpointing: false
+ use_chat_template: false
+ dataset:
+ name: melihcatal/codedp-cpt
+ split: train
+ mode: cpt
+ text_column: text
+ validation_ratio: 0.05
+ max_samples: -1
+ lora:
+ enabled: true
+ r: 16
+ alpha: 32
+ dropout: 0.05
+ target_modules:
+ - q_proj
+ - k_proj
+ - v_proj
+ - o_proj
+ modules_to_save:
+ - lm_head
+ bias: none
+ training:
+ seed: 42
+ epochs: 2
+ warmup_steps: null
+ warmup_ratio: 0.05
+ mixed_precision: false
+ mixed_precision_dtype: bfloat16
+ batch_size: 8
+ eval_batch_size: 8
+ eval_every_steps: 50
+ eval_every_epochs: 1
+ learning_rate: 0.0002
+ optimizer: adamw
+ lr_scheduler: cosine
+ adam_beta1: 0.9
+ adam_beta2: 0.999
+ adam_epsilon: 1.0e-08
+ sgd_momentum: 0.9
+ weight_decay: 0.01
+ max_grad_norm: 1.0
+ log_every: 10
+ gradient_accumulation_steps: 8
+ num_workers: 4
+ output_dir: runs/cpt/deepseek-coder-6.7b/dp8
+ distributed:
+ strategy: dpddp
+ backend: nccl
+ devices: null
+ dp:
+ module_validator: auto
+ target_delta: 1.0e-05
+ noise_multiplier: null
+ max_grad_norm: 1.0
+ grad_sample_mode: hooks
+ clipping: flat
+ secure_mode: false
+ enabled: true
+ target_epsilon: 8.0
+ audit:
+ enabled: true
+ run_every_epoch: true
+ epoch_device: cuda
+ q_canary: auto
+ num_canaries: 500
+ prefix_length: 49
+ num_digits: 12
+ batch_size: 32
+ delta: 1.0e-05
+ p_values:
+ - 0.05
+ - 0.01
+ paper_guess_fraction: 0.2
+ paper_guess_steps: 20
+ enable_holdout_empirical_epsilon: false
+ holdout_seed: 42
+ tie_seed: 42
+ tracking:
+ enabled: true
+ tensorboard: true
+ wandb: false
+ wandb_project: codedp-finetune-h200-audit
+ wandb_run_name: deepseek-coder-6.7b-cpt-dp8
+ wandb_mode: online
+ codecarbon: true
+ codecarbon_output_file: codecarbon.csv
+ codecarbon_measure_power_secs: 15
+ codecarbon_country_iso_code: null
+ codecarbon_project_name: codedp-deepseek-coder-6.7b-cpt-dp8
deepseek-coder-6.7b/dp8/scalars.csv ADDED
@@ -0,0 +1,386 @@
1
+ timestamp,event,step,epoch,key,value
2
+ 1773784344.144435,train_step,10,1,train/step_loss,14.774163795239998
3
+ 1773784344.144435,train_step,10,1,train/step_real_loss,14.892106413841248
4
+ 1773784344.144435,train_step,10,1,train/lr,0.00018181818181818183
5
+ 1773784344.144435,train_step,10,1,train/step_canary_loss,11.0
6
+ 1773784344.144435,train_step,10,1,perf/step_duration_sec,9.184927014168352
7
+ 1773784344.144435,train_step,10,1,perf/samples_per_sec,7.185685841399793
8
+ 1773784344.144435,train_step,10,1,perf/tokens_per_sec,5831.728430435432
9
+ 1773784344.144435,train_step,10,1,perf/logical_batch_size,66.0
10
+ 1773784344.144435,train_step,10,1,perf/logical_token_count,53564.0
11
+ 1773784344.144435,train_step,10,1,perf/physical_batches,9.0
12
+ 1773784344.144435,train_step,10,1,privacy/epsilon,2.347414329116491
13
+ 1773784344.144435,train_step,10,1,system/cuda_memory_allocated_gb,14.177838802337646
14
+ 1773784344.144435,train_step,10,1,system/cuda_max_memory_allocated_gb,73.28973627090454
15
+ 1773784437.5361402,train_step,20,1,train/step_loss,14.265574847950655
16
+ 1773784437.5361402,train_step,20,1,train/step_real_loss,14.47943890094757
17
+ 1773784437.5361402,train_step,20,1,train/lr,0.00019897180218885507
18
+ 1773784437.5361402,train_step,20,1,train/step_canary_loss,10.84375
19
+ 1773784437.5361402,train_step,20,1,perf/step_duration_sec,9.109625170007348
20
+ 1773784437.5361402,train_step,20,1,perf/samples_per_sec,7.46463204917411
21
+ 1773784437.5361402,train_step,20,1,perf/tokens_per_sec,6030.434729726173
22
+ 1773784437.5361402,train_step,20,1,perf/logical_batch_size,68.0
23
+ 1773784437.5361402,train_step,20,1,perf/logical_token_count,54935.0
24
+ 1773784437.5361402,train_step,20,1,perf/physical_batches,9.0
25
+ 1773784437.5361402,train_step,20,1,privacy/epsilon,2.9091472084164676
26
+ 1773784437.5361402,train_step,20,1,system/cuda_memory_allocated_gb,14.301010608673096
27
+ 1773784437.5361402,train_step,20,1,system/cuda_max_memory_allocated_gb,73.28973627090454
28
+ 1773784530.4163961,train_step,30,1,train/step_loss,13.518484404592803
29
+ 1773784530.4163961,train_step,30,1,train/step_real_loss,13.940937042236328
30
+ 1773784530.4163961,train_step,30,1,train/lr,0.00019544467510209388
31
+ 1773784530.4163961,train_step,30,1,train/step_canary_loss,0.0
32
+ 1773784530.4163961,train_step,30,1,perf/step_duration_sec,9.479914616327733
33
+ 1773784530.4163961,train_step,30,1,perf/samples_per_sec,6.751115657705354
34
+ 1773784530.4163961,train_step,30,1,perf/tokens_per_sec,5596.14744932698
35
+ 1773784530.4163961,train_step,30,1,perf/logical_batch_size,64.0
36
+ 1773784530.4163961,train_step,30,1,perf/logical_token_count,53051.0
37
+ 1773784530.4163961,train_step,30,1,perf/physical_batches,10.0
38
+ 1773784530.4163961,train_step,30,1,privacy/epsilon,3.362310066272621
39
+ 1773784530.4163961,train_step,30,1,system/cuda_memory_allocated_gb,14.547617435455322
40
+ 1773784530.4163961,train_step,30,1,system/cuda_max_memory_allocated_gb,73.28973627090454
41
+ 1773784622.9840994,train_step,40,1,train/step_loss,13.427360419071082
42
+ 1773784622.9840994,train_step,40,1,train/step_real_loss,13.384074807167053
43
+ 1773784622.9840994,train_step,40,1,train/lr,0.00018949541262593762
44
+ 1773784622.9840994,train_step,40,1,train/step_canary_loss,14.8125
45
+ 1773784622.9840994,train_step,40,1,perf/step_duration_sec,9.461411211173981
46
+ 1773784622.9840994,train_step,40,1,perf/samples_per_sec,6.975703573908047
47
+ 1773784622.9840994,train_step,40,1,perf/tokens_per_sec,5377.844685569534
48
+ 1773784622.9840994,train_step,40,1,perf/logical_batch_size,66.0
49
+ 1773784622.9840994,train_step,40,1,perf/logical_token_count,50882.0
50
+ 1773784622.9840994,train_step,40,1,perf/physical_batches,9.0
51
+ 1773784622.9840994,train_step,40,1,privacy/epsilon,3.7564957912533217
52
+ 1773784622.9840994,train_step,40,1,system/cuda_memory_allocated_gb,14.17786169052124
53
+ 1773784622.9840994,train_step,40,1,system/cuda_max_memory_allocated_gb,73.2897834777832
54
+ 1773784716.151862,train_step,50,1,train/step_loss,12.773963928222656
55
+ 1773784716.151862,train_step,50,1,train/step_real_loss,12.735275864601135
56
+ 1773784716.151862,train_step,50,1,train/lr,0.00018127499143005268
57
+ 1773784716.151862,train_step,50,1,train/step_canary_loss,15.25
58
+ 1773784716.151862,train_step,50,1,perf/step_duration_sec,9.381330342032015
59
+ 1773784716.151862,train_step,50,1,perf/samples_per_sec,6.928654852795735
60
+ 1773784716.151862,train_step,50,1,perf/tokens_per_sec,5950.009003510849
61
+ 1773784716.151862,train_step,50,1,perf/logical_batch_size,65.0
62
+ 1773784716.151862,train_step,50,1,perf/logical_token_count,55819.0
63
+ 1773784716.151862,train_step,50,1,perf/physical_batches,9.0
64
+ 1773784716.151862,train_step,50,1,privacy/epsilon,4.113499817895439
65
+ 1773784716.151862,train_step,50,1,system/cuda_memory_allocated_gb,14.1148681640625
66
+ 1773784716.151862,train_step,50,1,system/cuda_max_memory_allocated_gb,73.2897834777832
67
+ 1773784730.6923964,eval_step,50,1,eval/loss,12.672339336438613
68
+ 1773784730.6923964,eval_step,50,1,eval/duration_sec,14.538249942008406
69
+ 1773784823.310222,train_step,60,1,train/step_loss,11.997090639386858
70
+ 1773784823.310222,train_step,60,1,train/step_real_loss,12.100333452224731
71
+ 1773784823.310222,train_step,60,1,train/lr,0.0001709920242324663
72
+ 1773784823.310222,train_step,60,1,train/step_canary_loss,10.895833969116211
73
+ 1773784823.310222,train_step,60,1,perf/step_duration_sec,9.064209748990834
74
+ 1773784823.310222,train_step,60,1,perf/samples_per_sec,7.722680954927535
75
+ 1773784823.310222,train_step,60,1,perf/tokens_per_sec,6211.903912116425
76
+ 1773784823.310222,train_step,60,1,perf/logical_batch_size,70.0
77
+ 1773784823.310222,train_step,60,1,perf/logical_token_count,56306.0
78
+ 1773784823.310222,train_step,60,1,perf/physical_batches,9.0
79
+ 1773784823.310222,train_step,60,1,privacy/epsilon,4.44246859480526
80
+ 1773784823.310222,train_step,60,1,system/cuda_memory_allocated_gb,14.425142288208008
81
+ 1773784823.310222,train_step,60,1,system/cuda_max_memory_allocated_gb,73.2897834777832
82
+ 1773784915.821449,train_step,70,1,train/step_loss,11.382361131555895
83
+ 1773784915.821449,train_step,70,1,train/step_real_loss,11.408211827278137
84
+ 1773784915.821449,train_step,70,1,train/lr,0.00015890746575622231
85
+ 1773784915.821449,train_step,70,1,train/step_canary_loss,10.96875
86
+ 1773784915.821449,train_step,70,1,perf/step_duration_sec,9.208213192876428
87
+ 1773784915.821449,train_step,70,1,perf/samples_per_sec,7.384711732413572
88
+ 1773784915.821449,train_step,70,1,perf/tokens_per_sec,5687.531218381818
89
+ 1773784915.821449,train_step,70,1,perf/logical_batch_size,68.0
90
+ 1773784915.821449,train_step,70,1,perf/logical_token_count,52372.0
91
+ 1773784915.821449,train_step,70,1,perf/physical_batches,9.0
92
+ 1773784915.821449,train_step,70,1,privacy/epsilon,4.751375515961613
93
+ 1773784915.821449,train_step,70,1,system/cuda_memory_allocated_gb,14.30103349685669
94
+ 1773784915.821449,train_step,70,1,system/cuda_max_memory_allocated_gb,73.2897834777832
95
+ 1773785010.2598872,train_step,80,1,train/step_loss,10.664647719439339
96
+ 1773785010.2598872,train_step,80,1,train/step_real_loss,10.645641326904297
97
+ 1773785010.2598872,train_step,80,1,train/lr,0.00014532799038330385
98
+ 1773785010.2598872,train_step,80,1,train/step_canary_loss,10.96875
99
+ 1773785010.2598872,train_step,80,1,perf/step_duration_sec,9.077163262758404
100
+ 1773785010.2598872,train_step,80,1,perf/samples_per_sec,7.491327194586108
101
+ 1773785010.2598872,train_step,80,1,perf/tokens_per_sec,5665.206024329354
102
+ 1773785010.2598872,train_step,80,1,perf/logical_batch_size,68.0
103
+ 1773785010.2598872,train_step,80,1,perf/logical_token_count,51424.0
104
+ 1773785010.2598872,train_step,80,1,perf/physical_batches,9.0
105
+ 1773785010.2598872,train_step,80,1,privacy/epsilon,5.041931555886533
106
+ 1773785010.2598872,train_step,80,1,system/cuda_memory_allocated_gb,14.30103349685669
107
+ 1773785010.2598872,train_step,80,1,system/cuda_max_memory_allocated_gb,73.2897834777832
108
+ 1773785104.2668839,train_step,90,1,train/step_loss,10.064055400108224
109
+ 1773785104.2668839,train_step,90,1,train/step_real_loss,10.030925154685974
110
+ 1773785104.2668839,train_step,90,1,train/lr,0.00013059820956358998
111
+ 1773785104.2668839,train_step,90,1,train/step_canary_loss,10.770833969116211
112
+ 1773785104.2668839,train_step,90,1,perf/step_duration_sec,9.01999548682943
113
+ 1773785104.2668839,train_step,90,1,perf/samples_per_sec,7.427941632324565
114
+ 1773785104.2668839,train_step,90,1,perf/tokens_per_sec,6063.528532786976
115
+ 1773785104.2668839,train_step,90,1,perf/logical_batch_size,67.0
116
+ 1773785104.2668839,train_step,90,1,perf/logical_token_count,54693.0
117
+ 1773785104.2668839,train_step,90,1,perf/physical_batches,9.0
118
+ 1773785104.2668839,train_step,90,1,privacy/epsilon,5.3191578546045415
119
+ 1773785104.2668839,train_step,90,1,system/cuda_memory_allocated_gb,14.238978385925293
120
+ 1773785104.2668839,train_step,90,1,system/cuda_max_memory_allocated_gb,73.2897834777832
121
+ 1773785196.997746,train_step,100,1,train/step_loss,9.603301625856211
122
+ 1773785196.997746,train_step,100,1,train/step_real_loss,9.47983455657959
123
+ 1773785196.997746,train_step,100,1,train/lr,0.00011509192648058249
124
+ 1773785196.997746,train_step,100,1,train/step_canary_loss,10.73214340209961
125
+ 1773785196.997746,train_step,100,1,perf/step_duration_sec,9.839758691843599
126
+ 1773785196.997746,train_step,100,1,perf/samples_per_sec,7.215624104568085
127
+ 1773785196.997746,train_step,100,1,perf/tokens_per_sec,4844.224486878061
128
+ 1773785196.997746,train_step,100,1,perf/logical_batch_size,71.0
129
+ 1773785196.997746,train_step,100,1,perf/logical_token_count,47666.0
130
+ 1773785196.997746,train_step,100,1,perf/physical_batches,9.0
131
+ 1773785196.997746,train_step,100,1,privacy/epsilon,5.584672549637297
132
+ 1773785196.997746,train_step,100,1,system/cuda_memory_allocated_gb,14.487197399139404
133
+ 1773785196.997746,train_step,100,1,system/cuda_max_memory_allocated_gb,73.2897834777832
134
+ 1773785211.3789763,eval_step,100,1,eval/loss,9.217194291678343
135
+ 1773785211.3789763,eval_step,100,1,eval/duration_sec,14.378479943610728
136
+ 1773785262.4304821,train_epoch,104,1,train/epoch_loss,12.341963201017878
137
+ 1773785262.4304821,train_epoch,104,1,train/epoch_real_loss,12.414478586751462
138
+ 1773785262.4304821,train_epoch,104,1,train/epoch_canary_loss,10.728888959316615
139
+ 1773785262.4304821,train_epoch,104,1,perf/epoch_duration_sec,998.3496765969321
140
+ 1773785262.4304821,train_epoch,104,1,perf/epoch_samples_per_sec,55.96235598577414
141
+ 1773785262.4304821,train_epoch,104,1,perf/epoch_tokens_per_sec,44175.03409259433
142
+ 1773785262.4304821,train_epoch,104,1,perf/epoch_samples,55870.0
143
+ 1773785262.4304821,train_epoch,104,1,perf/epoch_tokens,44102131.0
144
+ 1773785262.4304821,train_epoch,104,1,system/cuda_epoch_peak_memory_gb,73.2897834777832
145
+ 1773785262.4304821,train_epoch,104,1,eval/loss,8.922484595667232
146
+ 1773785262.4304821,train_epoch,104,1,eval/duration_sec,14.388765855692327
147
+ 1773785262.4304821,train_epoch,104,1,privacy/epsilon,5.688895252675942
148
+ 1773785270.4813497,audit_epoch,104,1,audit/delta,1e-05
149
+ 1773785270.4813497,audit_epoch,104,1,audit/num_canaries,500.0
150
+ 1773785270.4813497,audit_epoch,104,1,audit/num_members,250.0
151
+ 1773785270.4813497,audit_epoch,104,1,audit/paper_guess_fraction,0.2
152
+ 1773785270.4813497,audit_epoch,104,1,audit/paper_guess_steps,20.0
153
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/auc,0.5184
154
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon/0.05,0.0
155
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon/0.01,0.0
156
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/epsilon,0.0
157
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/num_guesses,0.0
158
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.05/correct_guesses,0.0
159
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
160
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
161
+ 1773785270.4813497,audit_epoch,104,1,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
162
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/auc,0.53976
163
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon/0.05,0.0
164
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon/0.01,0.0
165
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
166
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
167
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
168
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
169
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
170
+ 1773785270.4813497,audit_epoch,104,1,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
171
+ 1773785270.4813497,audit_epoch,104,1,perf/audit_duration_sec,5.523622438777238
172
+ 1773785326.5610142,train_step,110,2,train/step_loss,8.857723698471531
173
+ 1773785326.5610142,train_step,110,2,train/step_real_loss,8.790777564048767
174
+ 1773785326.5610142,train_step,110,2,train/lr,9.920264990753837e-05
175
+ 1773785326.5610142,train_step,110,2,train/step_canary_loss,11.0
176
+ 1773785326.5610142,train_step,110,2,perf/step_duration_sec,9.301705885212868
177
+ 1773785326.5610142,train_step,110,2,perf/samples_per_sec,7.095472681513365
178
+ 1773785326.5610142,train_step,110,2,perf/tokens_per_sec,5950.3063935818445
179
+ 1773785326.5610142,train_step,110,2,perf/logical_batch_size,66.0
180
+ 1773785326.5610142,train_step,110,2,perf/logical_token_count,55348.0
181
+ 1773785326.5610142,train_step,110,2,perf/physical_batches,9.0
182
+ 1773785326.5610142,train_step,110,2,privacy/epsilon,5.840075127464654
183
+ 1773785326.5610142,train_step,110,2,system/cuda_memory_allocated_gb,14.17786169052124
184
+ 1773785326.5610142,train_step,110,2,system/cuda_max_memory_allocated_gb,73.28975915908813
185
+ 1773785418.9144747,train_step,120,2,train/step_loss,8.510492253659377
186
+ 1773785418.9144747,train_step,120,2,train/step_real_loss,8.394773125648499
187
+ 1773785418.9144747,train_step,120,2,train/lr,8.333360798744496e-05
188
+ 1773785418.9144747,train_step,120,2,train/step_canary_loss,10.979166984558105
189
+ 1773785418.9144747,train_step,120,2,perf/step_duration_sec,9.138508805073798
190
+ 1773785418.9144747,train_step,120,2,perf/samples_per_sec,7.331611910556007
191
+ 1773785418.9144747,train_step,120,2,perf/tokens_per_sec,5373.305541133465
192
+ 1773785418.9144747,train_step,120,2,perf/logical_batch_size,67.0
193
+ 1773785418.9144747,train_step,120,2,perf/logical_token_count,49104.0
194
+ 1773785418.9144747,train_step,120,2,perf/physical_batches,9.0
195
+ 1773785418.9144747,train_step,120,2,privacy/epsilon,6.086971689386329
196
+ 1773785418.9144747,train_step,120,2,system/cuda_memory_allocated_gb,14.238978385925293
197
+ 1773785418.9144747,train_step,120,2,system/cuda_max_memory_allocated_gb,73.28975915908813
198
+ 1773785511.284102,train_step,130,2,train/step_loss,8.294911356939785
199
+ 1773785511.284102,train_step,130,2,train/step_real_loss,8.076740324497223
200
+ 1773785511.284102,train_step,130,2,train/lr,6.788751536089739e-05
201
+ 1773785511.284102,train_step,130,2,train/step_canary_loss,11.08750057220459
202
+ 1773785511.284102,train_step,130,2,perf/step_duration_sec,9.095777447801083
203
+ 1773785511.284102,train_step,130,2,perf/samples_per_sec,7.5859375843327
204
+ 1773785511.284102,train_step,130,2,perf/tokens_per_sec,6110.967459239838
205
+ 1773785511.284102,train_step,130,2,perf/logical_batch_size,69.0
206
+ 1773785511.284102,train_step,130,2,perf/logical_token_count,55584.0
207
+ 1773785511.284102,train_step,130,2,perf/physical_batches,9.0
208
+ 1773785511.284102,train_step,130,2,privacy/epsilon,6.327001481453001
209
+ 1773785511.284102,train_step,130,2,system/cuda_memory_allocated_gb,14.363087177276611
210
+ 1773785511.284102,train_step,130,2,system/cuda_max_memory_allocated_gb,73.2897834777832
211
+ 1773785603.6243958,train_step,140,2,train/step_loss,8.052544166792684
212
+ 1773785603.6243958,train_step,140,2,train/step_real_loss,7.91242903470993
213
+ 1773785603.6243958,train_step,140,2,train/lr,5.325635332531864e-05
214
+ 1773785603.6243958,train_step,140,2,train/step_canary_loss,11.041666984558105
215
+ 1773785603.6243958,train_step,140,2,perf/step_duration_sec,9.209645525086671
216
+ 1773785603.6243958,train_step,140,2,perf/samples_per_sec,7.274981411336075
217
+ 1773785603.6243958,train_step,140,2,perf/tokens_per_sec,5759.396477911762
218
+ 1773785603.6243958,train_step,140,2,perf/logical_batch_size,67.0
219
+ 1773785603.6243958,train_step,140,2,perf/logical_token_count,53042.0
220
+ 1773785603.6243958,train_step,140,2,perf/physical_batches,9.0
221
+ 1773785603.6243958,train_step,140,2,privacy/epsilon,6.557571491578655
222
+ 1773785603.6243958,train_step,140,2,system/cuda_memory_allocated_gb,14.238978385925293
223
+ 1773785603.6243958,train_step,140,2,system/cuda_max_memory_allocated_gb,73.2897834777832
224
+ 1773785696.4521918,train_step,150,2,train/step_loss,8.013610591059146
225
+ 1773785696.4521918,train_step,150,2,train/step_real_loss,7.757837951183319
226
+ 1773785696.4521918,train_step,150,2,train/lr,3.981142237826332e-05
227
+ 1773785696.4521918,train_step,150,2,train/step_canary_loss,11.287500381469727
228
+ 1773785696.4521918,train_step,150,2,perf/step_duration_sec,9.033272176980972
229
+ 1773785696.4521918,train_step,150,2,perf/samples_per_sec,7.638428096501862
230
+ 1773785696.4521918,train_step,150,2,perf/tokens_per_sec,6166.425511006423
231
+ 1773785696.4521918,train_step,150,2,perf/logical_batch_size,69.0
232
+ 1773785696.4521918,train_step,150,2,perf/logical_token_count,55703.0
233
+ 1773785696.4521918,train_step,150,2,perf/physical_batches,9.0
234
+ 1773785696.4521918,train_step,150,2,privacy/epsilon,6.783113430476456
235
+ 1773785696.4521918,train_step,150,2,system/cuda_memory_allocated_gb,14.363087177276611
236
+ 1773785696.4521918,train_step,150,2,system/cuda_max_memory_allocated_gb,73.2897834777832
237
+ 1773785710.7493467,eval_step,150,2,eval/loss,7.667719158259305
238
+ 1773785710.7493467,eval_step,150,2,eval/duration_sec,14.295333066955209
239
+ 1773785803.0583243,train_step,160,2,train/step_loss,7.74585654518821
240
+ 1773785803.0583243,train_step,160,2,train/step_real_loss,7.636352062225342
241
+ 1773785803.0583243,train_step,160,2,train/lr,2.789391958515183e-05
242
+ 1773785803.0583243,train_step,160,2,train/step_canary_loss,11.25
243
+ 1773785803.0583243,train_step,160,2,perf/step_duration_sec,8.934944829903543
244
+ 1773785803.0583243,train_step,160,2,perf/samples_per_sec,7.386727199379082
245
+ 1773785803.0583243,train_step,160,2,perf/tokens_per_sec,6129.080933630252
246
+ 1773785803.0583243,train_step,160,2,perf/logical_batch_size,66.0
247
+ 1773785803.0583243,train_step,160,2,perf/logical_token_count,54763.0
248
+ 1773785803.0583243,train_step,160,2,perf/physical_batches,9.0
249
+ 1773785803.0583243,train_step,160,2,privacy/epsilon,7.0043611451158165
250
+ 1773785803.0583243,train_step,160,2,system/cuda_memory_allocated_gb,14.17786169052124
251
+ 1773785803.0583243,train_step,160,2,system/cuda_max_memory_allocated_gb,73.2897834777832
252
+ 1773785896.8129106,train_step,170,2,train/step_loss,7.764082396208351
253
+ 1773785896.8129106,train_step,170,2,train/step_real_loss,7.601656556129456
254
+ 1773785896.8129106,train_step,170,2,train/lr,1.7806279893114875e-05
255
+ 1773785896.8129106,train_step,170,2,train/step_canary_loss,11.229166984558105
256
+ 1773785896.8129106,train_step,170,2,perf/step_duration_sec,9.581119411159307
257
+ 1773785896.8129106,train_step,170,2,perf/samples_per_sec,6.992919837943347
258
+ 1773785896.8129106,train_step,170,2,perf/tokens_per_sec,5193.338885020673
259
+ 1773785896.8129106,train_step,170,2,perf/logical_batch_size,67.0
260
+ 1773785896.8129106,train_step,170,2,perf/logical_token_count,49758.0
261
+ 1773785896.8129106,train_step,170,2,perf/physical_batches,9.0
262
+ 1773785896.8129106,train_step,170,2,privacy/epsilon,7.2178133573416465
263
+ 1773785896.8129106,train_step,170,2,system/cuda_memory_allocated_gb,14.238978385925293
264
+ 1773785896.8129106,train_step,170,2,system/cuda_max_memory_allocated_gb,73.2897834777832
265
+ 1773785989.7971883,train_step,180,2,train/step_loss,7.800547375398524
266
+ 1773785989.7971883,train_step,180,2,train/step_real_loss,7.582026898860931
267
+ 1773785989.7971883,train_step,180,2,train/lr,9.804501125681243e-06
268
+ 1773785989.7971883,train_step,180,2,train/step_canary_loss,11.296875
269
+ 1773785989.7971883,train_step,180,2,perf/step_duration_sec,9.495344399940223
270
+ 1773785989.7971883,train_step,180,2,perf/samples_per_sec,7.161404277282253
271
+ 1773785989.7971883,train_step,180,2,perf/tokens_per_sec,5326.71569030381
272
+ 1773785989.7971883,train_step,180,2,perf/logical_batch_size,68.0
273
+ 1773785989.7971883,train_step,180,2,perf/logical_token_count,50579.0
274
+ 1773785989.7971883,train_step,180,2,perf/physical_batches,9.0
275
+ 1773785989.7971883,train_step,180,2,privacy/epsilon,7.430040084733278
276
+ 1773785989.7971883,train_step,180,2,system/cuda_memory_allocated_gb,14.30103349685669
277
+ 1773785989.7971883,train_step,180,2,system/cuda_max_memory_allocated_gb,73.2897834777832
278
+ 1773786082.775681,train_step,190,2,train/step_loss,7.886315041694088
279
+ 1773786082.775681,train_step,190,2,train/step_real_loss,7.6274334192276
280
+ 1773786082.775681,train_step,190,2,train/lr,4.091647429802869e-06
281
+ 1773786082.775681,train_step,190,2,train/step_canary_loss,11.199999809265137
282
+ 1773786082.775681,train_step,190,2,perf/step_duration_sec,9.168436062987894
283
+ 1773786082.775681,train_step,190,2,perf/samples_per_sec,7.525820055455963
284
+ 1773786082.775681,train_step,190,2,perf/tokens_per_sec,5850.50706919794
285
+ 1773786082.775681,train_step,190,2,perf/logical_batch_size,69.0
286
+ 1773786082.775681,train_step,190,2,perf/logical_token_count,53640.0
287
+ 1773786082.775681,train_step,190,2,perf/physical_batches,9.0
288
+ 1773786082.775681,train_step,190,2,privacy/epsilon,7.633877151395027
289
+ 1773786082.775681,train_step,190,2,system/cuda_memory_allocated_gb,14.363087177276611
290
+ 1773786082.775681,train_step,190,2,system/cuda_max_memory_allocated_gb,73.29015684127808
291
+ 1773786175.474541,train_step,200,2,train/step_loss,7.722104996901292
292
+ 1773786175.474541,train_step,200,2,train/step_real_loss,7.670887887477875
293
+ 1773786175.474541,train_step,200,2,train/lr,8.126960406835249e-07
294
+ 1773786175.474541,train_step,200,2,train/step_canary_loss,11.0
295
+ 1773786175.474541,train_step,200,2,perf/step_duration_sec,9.673653797712177
296
+ 1773786175.474541,train_step,200,2,perf/samples_per_sec,6.719281189840856
297
+ 1773786175.474541,train_step,200,2,perf/tokens_per_sec,5154.412287505304
298
+ 1773786175.474541,train_step,200,2,perf/logical_batch_size,65.0
299
+ 1773786175.474541,train_step,200,2,perf/logical_token_count,49862.0
300
+ 1773786175.474541,train_step,200,2,perf/physical_batches,9.0
301
+ 1773786175.474541,train_step,200,2,privacy/epsilon,7.837365803470095
302
+ 1773786175.474541,train_step,200,2,system/cuda_memory_allocated_gb,14.1148681640625
303
+ 1773786175.474541,train_step,200,2,system/cuda_max_memory_allocated_gb,73.29015684127808
304
+ 1773786189.8028603,eval_step,200,2,eval/loss,7.523608349940993
305
+ 1773786189.8028603,eval_step,200,2,eval/duration_sec,14.325929747894406
306
+ 1773786276.8565547,train_epoch,208,2,train/epoch_loss,8.035395088671393
307
+ 1773786276.8565547,train_epoch,208,2,train/epoch_real_loss,7.897920552785785
308
+ 1773786276.8565547,train_epoch,208,2,train/epoch_canary_loss,10.571939474888266
309
+ 1773786276.8565547,train_epoch,208,2,perf/epoch_duration_sec,991.9095743251964
310
+ 1773786276.8565547,train_epoch,208,2,perf/epoch_samples_per_sec,56.10592078200321
311
+ 1773786276.8565547,train_epoch,208,2,perf/epoch_tokens_per_sec,44450.63757953486
312
+ 1773786276.8565547,train_epoch,208,2,perf/epoch_samples,55652.0
313
+ 1773786276.8565547,train_epoch,208,2,perf/epoch_tokens,44091013.0
314
+ 1773786276.8565547,train_epoch,208,2,system/cuda_epoch_peak_memory_gb,73.29015684127808
315
+ 1773786276.8565547,train_epoch,208,2,eval/loss,7.523212566971779
316
+ 1773786276.8565547,train_epoch,208,2,eval/duration_sec,14.324650165159255
317
+ 1773786276.8565547,train_epoch,208,2,privacy/epsilon,7.995186040237391
318
+ 1773786284.9576344,audit_epoch,208,2,audit/delta,1e-05
319
+ 1773786284.9576344,audit_epoch,208,2,audit/num_canaries,500.0
320
+ 1773786284.9576344,audit_epoch,208,2,audit/num_members,250.0
321
+ 1773786284.9576344,audit_epoch,208,2,audit/paper_guess_fraction,0.2
322
+ 1773786284.9576344,audit_epoch,208,2,audit/paper_guess_steps,20.0
323
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/auc,0.533352
324
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon/0.05,0.0
325
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon/0.01,0.0
326
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/epsilon,0.0
327
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,0.0
328
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,0.0
329
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
330
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
331
+ 1773786284.9576344,audit_epoch,208,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
332
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/auc,0.544592
333
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon/0.05,0.0
334
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon/0.01,0.0
335
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
336
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
337
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
338
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
339
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
340
+ 1773786284.9576344,audit_epoch,208,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
341
+ 1773786284.9576344,audit_epoch,208,2,perf/audit_duration_sec,5.583974160254002
342
+ 1773786292.845513,audit_final,208,2,audit/delta,1e-05
343
+ 1773786292.845513,audit_final,208,2,audit/num_canaries,500.0
344
+ 1773786292.845513,audit_final,208,2,audit/num_members,250.0
345
+ 1773786292.845513,audit_final,208,2,audit/paper_guess_fraction,0.2
346
+ 1773786292.845513,audit_final,208,2,audit/paper_guess_steps,20.0
347
+ 1773786292.845513,audit_final,208,2,audit/loss/auc,0.533352
348
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon/0.05,0.0
349
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon/0.01,0.0
350
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/epsilon,0.0
351
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/num_guesses,0.0
352
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.05/correct_guesses,0.0
353
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/epsilon,0.0
354
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/num_guesses,0.0
355
+ 1773786292.845513,audit_final,208,2,audit/loss/empirical_epsilon_details/0.01/correct_guesses,0.0
356
+ 1773786292.845513,audit_final,208,2,audit/embedding/auc,0.544592
357
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon/0.05,0.0
358
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon/0.01,0.0
359
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/epsilon,0.0
360
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/num_guesses,0.0
361
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.05/correct_guesses,0.0
362
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/epsilon,0.0
363
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/num_guesses,0.0
364
+ 1773786292.845513,audit_final,208,2,audit/embedding/empirical_epsilon_details/0.01/correct_guesses,0.0
365
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/duration,2107.286476707086
366
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/emissions,0.1023234868151428
367
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/emissions_rate,4.85569892590194e-05
368
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/cpu_power,72.02878771140225
369
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/gpu_power,4175.089097615359
370
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/ram_power,54.0
371
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/cpu_energy,0.04057807438109415
372
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/gpu_energy,2.443401743608831
373
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/ram_energy,0.030419661433661826
374
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/energy_consumed,2.5143994794235853
375
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/water_consumed,0.0
376
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/cpu_count,256.0
377
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/gpu_count,8.0
378
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/longitude,16.1885
379
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/latitude,58.594
380
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/ram_total_size,1511.49019241333
381
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/cpu_utilization_percent,3.622827125119416
382
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/gpu_utilization_percent,80.66583094555874
383
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/ram_utilization_percent,5.312368672397158
384
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/ram_used_gb,80.40963100407845
385
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/pue,1.0
386
+ 1773786293.3942149,energy_final,208,,energy/codecarbon/wue,0.0
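The scalars.csv files in this upload use a long format, one (timestamp, event, step, epoch, key, value) row per metric (the header is visible in the granite scalars.csv further down), so a single metric such as the accountant's privacy budget can be pulled out directly. A minimal sketch, assuming pandas is installed and using the dp8 run path from this upload:

import pandas as pd

# Long-format metrics: one (timestamp, event, step, epoch, key, value) row per scalar.
df = pd.read_csv("deepseek-coder-6.7b/dp8/scalars.csv")

# Track privacy/epsilon as training progresses.
eps = df[(df["event"] == "train_step") & (df["key"] == "privacy/epsilon")]
for _, row in eps.iterrows():
    print(f"step={int(row['step'])}  epsilon={row['value']:.3f}")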
deepseek-coder-6.7b/dp8/summary.json ADDED
@@ -0,0 +1,72 @@
1
+ {
2
+ "audit/delta": 1e-05,
3
+ "audit/embedding/auc": 0.544592,
4
+ "audit/embedding/empirical_epsilon/0.01": 0.0,
5
+ "audit/embedding/empirical_epsilon/0.05": 0.0,
6
+ "audit/embedding/empirical_epsilon_details/0.01/correct_guesses": 0.0,
7
+ "audit/embedding/empirical_epsilon_details/0.01/epsilon": 0.0,
8
+ "audit/embedding/empirical_epsilon_details/0.01/num_guesses": 0.0,
9
+ "audit/embedding/empirical_epsilon_details/0.05/correct_guesses": 0.0,
10
+ "audit/embedding/empirical_epsilon_details/0.05/epsilon": 0.0,
11
+ "audit/embedding/empirical_epsilon_details/0.05/num_guesses": 0.0,
12
+ "audit/loss/auc": 0.533352,
13
+ "audit/loss/empirical_epsilon/0.01": 0.0,
14
+ "audit/loss/empirical_epsilon/0.05": 0.0,
15
+ "audit/loss/empirical_epsilon_details/0.01/correct_guesses": 0.0,
16
+ "audit/loss/empirical_epsilon_details/0.01/epsilon": 0.0,
17
+ "audit/loss/empirical_epsilon_details/0.01/num_guesses": 0.0,
18
+ "audit/loss/empirical_epsilon_details/0.05/correct_guesses": 0.0,
19
+ "audit/loss/empirical_epsilon_details/0.05/epsilon": 0.0,
20
+ "audit/loss/empirical_epsilon_details/0.05/num_guesses": 0.0,
21
+ "audit/num_canaries": 500.0,
22
+ "audit/num_members": 250.0,
23
+ "audit/paper_guess_fraction": 0.2,
24
+ "audit/paper_guess_steps": 20.0,
25
+ "energy/codecarbon/cpu_count": 256.0,
26
+ "energy/codecarbon/cpu_energy": 0.04057807438109415,
27
+ "energy/codecarbon/cpu_power": 72.02878771140225,
28
+ "energy/codecarbon/cpu_utilization_percent": 3.622827125119416,
29
+ "energy/codecarbon/duration": 2107.286476707086,
30
+ "energy/codecarbon/emissions": 0.1023234868151428,
31
+ "energy/codecarbon/emissions_rate": 4.85569892590194e-05,
32
+ "energy/codecarbon/energy_consumed": 2.5143994794235853,
33
+ "energy/codecarbon/gpu_count": 8.0,
34
+ "energy/codecarbon/gpu_energy": 2.443401743608831,
35
+ "energy/codecarbon/gpu_power": 4175.089097615359,
36
+ "energy/codecarbon/gpu_utilization_percent": 80.66583094555874,
37
+ "energy/codecarbon/latitude": 58.594,
38
+ "energy/codecarbon/longitude": 16.1885,
39
+ "energy/codecarbon/pue": 1.0,
40
+ "energy/codecarbon/ram_energy": 0.030419661433661826,
41
+ "energy/codecarbon/ram_power": 54.0,
42
+ "energy/codecarbon/ram_total_size": 1511.49019241333,
43
+ "energy/codecarbon/ram_used_gb": 80.40963100407845,
44
+ "energy/codecarbon/ram_utilization_percent": 5.312368672397158,
45
+ "energy/codecarbon/water_consumed": 0.0,
46
+ "energy/codecarbon/wue": 0.0,
47
+ "eval/duration_sec": 14.324650165159255,
48
+ "eval/loss": 7.523212566971779,
49
+ "perf/audit_duration_sec": 5.583974160254002,
50
+ "perf/epoch_duration_sec": 991.9095743251964,
51
+ "perf/epoch_samples": 55652.0,
52
+ "perf/epoch_samples_per_sec": 56.10592078200321,
53
+ "perf/epoch_tokens": 44091013.0,
54
+ "perf/epoch_tokens_per_sec": 44450.63757953486,
55
+ "perf/logical_batch_size": 65.0,
56
+ "perf/logical_token_count": 49862.0,
57
+ "perf/physical_batches": 9.0,
58
+ "perf/samples_per_sec": 6.719281189840856,
59
+ "perf/step_duration_sec": 9.673653797712177,
60
+ "perf/tokens_per_sec": 5154.412287505304,
61
+ "privacy/epsilon": 7.995186040237391,
62
+ "system/cuda_epoch_peak_memory_gb": 73.29015684127808,
63
+ "system/cuda_max_memory_allocated_gb": 73.29015684127808,
64
+ "system/cuda_memory_allocated_gb": 14.1148681640625,
65
+ "train/epoch_canary_loss": 10.571939474888266,
66
+ "train/epoch_loss": 8.035395088671393,
67
+ "train/epoch_real_loss": 7.897920552785785,
68
+ "train/lr": 8.126960406835249e-07,
69
+ "train/step_canary_loss": 11.0,
70
+ "train/step_loss": 7.722104996901292,
71
+ "train/step_real_loss": 7.670887887477875
72
+ }
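Each run directory carries a flat key/value summary.json of its final metrics, so comparing runs amounts to loading one JSON per directory. A minimal sketch using relative paths from this upload; keys absent in a run (for example privacy/epsilon in a non-DP run) simply come back as None:

import json
from pathlib import Path

for run in ("deepseek-coder-6.7b/dp8", "granite-4.0-h-tiny/base"):
    summary = json.loads(Path(run, "summary.json").read_text())
    print(run, summary.get("eval/loss"), summary.get("privacy/epsilon"))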
deepseek-coder-6.7b/dp8/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
deepseek-coder-6.7b/dp8/train.log ADDED
@@ -0,0 +1,24 @@
1
+ 2026-03-17 21:52:24,143 [INFO] new_opacus_codex.train_steps: epoch=1 step=10 loss=14.7407
2
+ 2026-03-17 21:53:57,535 [INFO] new_opacus_codex.train_steps: epoch=1 step=20 loss=14.5101
3
+ 2026-03-17 21:55:30,416 [INFO] new_opacus_codex.train_steps: epoch=1 step=30 loss=14.0244
4
+ 2026-03-17 21:57:02,983 [INFO] new_opacus_codex.train_steps: epoch=1 step=40 loss=13.5601
5
+ 2026-03-17 21:58:36,151 [INFO] new_opacus_codex.train_steps: epoch=1 step=50 loss=12.9428
6
+ 2026-03-17 21:58:50,692 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=50 eval_loss=12.6723 duration_sec=14.54
7
+ 2026-03-17 22:00:23,309 [INFO] new_opacus_codex.train_steps: epoch=1 step=60 loss=12.3388
8
+ 2026-03-17 22:01:55,821 [INFO] new_opacus_codex.train_steps: epoch=1 step=70 loss=11.6780
9
+ 2026-03-17 22:03:30,259 [INFO] new_opacus_codex.train_steps: epoch=1 step=80 loss=10.9401
10
+ 2026-03-17 22:05:04,266 [INFO] new_opacus_codex.train_steps: epoch=1 step=90 loss=10.3183
11
+ 2026-03-17 22:06:36,997 [INFO] new_opacus_codex.train_steps: epoch=1 step=100 loss=9.7867
12
+ 2026-03-17 22:06:51,378 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=1 step=100 eval_loss=9.2172 duration_sec=14.38
13
+ 2026-03-17 22:08:46,560 [INFO] new_opacus_codex.train_steps: epoch=2 step=110 loss=9.0149
14
+ 2026-03-17 22:10:18,914 [INFO] new_opacus_codex.train_steps: epoch=2 step=120 loss=8.6390
15
+ 2026-03-17 22:11:51,283 [INFO] new_opacus_codex.train_steps: epoch=2 step=130 loss=8.3007
16
+ 2026-03-17 22:13:23,623 [INFO] new_opacus_codex.train_steps: epoch=2 step=140 loss=8.1182
17
+ 2026-03-17 22:14:56,451 [INFO] new_opacus_codex.train_steps: epoch=2 step=150 loss=7.9517
18
+ 2026-03-17 22:15:10,749 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=150 eval_loss=7.6677 duration_sec=14.30
19
+ 2026-03-17 22:16:43,057 [INFO] new_opacus_codex.train_steps: epoch=2 step=160 loss=7.8052
20
+ 2026-03-17 22:18:16,812 [INFO] new_opacus_codex.train_steps: epoch=2 step=170 loss=7.8663
21
+ 2026-03-17 22:19:49,796 [INFO] new_opacus_codex.train_steps: epoch=2 step=180 loss=7.8266
22
+ 2026-03-17 22:21:22,775 [INFO] new_opacus_codex.train_steps: epoch=2 step=190 loss=7.7819
23
+ 2026-03-17 22:22:55,474 [INFO] new_opacus_codex.train_steps: epoch=2 step=200 loss=7.7509
24
+ 2026-03-17 22:23:09,802 [INFO] new_opacus_codex.train_steps: eval event=eval_step epoch=2 step=200 eval_loss=7.5236 duration_sec=14.33
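The train.log lines above follow a fixed pattern (epoch=… step=… loss=…), so a small regex is enough to recover the loss curve without touching the CSV. A minimal sketch; the path is the dp8 log from this upload, and eval lines (eval_loss=…) are intentionally not matched:

import re

STEP_RE = re.compile(r"epoch=(\d+) step=(\d+) loss=([\d.]+)")

with open("deepseek-coder-6.7b/dp8/train.log") as fh:
    for line in fh:
        match = STEP_RE.search(line)
        if match:
            epoch, step, loss = int(match[1]), int(match[2]), float(match[3])
            print(epoch, step, loss)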
granite-4.0-h-tiny/base/canary_meta.json ADDED
The diff for this file is too large to render. See raw diff
 
granite-4.0-h-tiny/base/resolved_config.yaml ADDED
@@ -0,0 +1,110 @@
1
+ model:
2
+ name: ibm-granite/granite-4.0-h-tiny
3
+ tokenizer_name: ibm-granite/granite-4.0-h-tiny
4
+ max_length: 1024
5
+ dtype: bfloat16
6
+ trust_remote_code: true
7
+ use_fast_tokenizer: true
8
+ cache_dir: null
9
+ local_files_only: false
10
+ low_cpu_mem_usage: true
11
+ tie_word_embeddings: true
12
+ gradient_checkpointing: false
13
+ use_chat_template: false
14
+ dataset:
15
+ name: melihcatal/codedp-cpt
16
+ split: train
17
+ mode: cpt
18
+ text_column: text
19
+ validation_ratio: 0.05
20
+ max_samples: -1
21
+ lora:
22
+ enabled: true
23
+ r: 16
24
+ alpha: 32
25
+ dropout: 0.05
26
+ target_modules:
27
+ - q_proj
28
+ - k_proj
29
+ - v_proj
30
+ - o_proj
31
+ modules_to_save:
32
+ - lm_head
33
+ bias: none
34
+ training:
35
+ seed: 42
36
+ epochs: 2
37
+ warmup_steps: null
38
+ warmup_ratio: 0.05
39
+ mixed_precision: false
40
+ mixed_precision_dtype: bfloat16
41
+ batch_size: 8
42
+ eval_batch_size: 8
43
+ eval_every_steps: 50
44
+ eval_every_epochs: 1
45
+ learning_rate: 0.0001
46
+ optimizer: adamw
47
+ lr_scheduler: cosine
48
+ adam_beta1: 0.9
49
+ adam_beta2: 0.999
50
+ adam_epsilon: 1.0e-08
51
+ sgd_momentum: 0.9
52
+ weight_decay: 0.01
53
+ max_grad_norm: 1.0
54
+ log_every: 10
55
+ gradient_accumulation_steps: 4
56
+ num_workers: 4
57
+ output_dir: runs/cpt/granite-4.0-h-tiny/base
58
+ distributed:
59
+ strategy: dpddp
60
+ backend: nccl
61
+ devices: null
62
+ dp:
63
+ module_validator: auto
64
+ target_delta: 1.0e-05
65
+ noise_multiplier: null
66
+ max_grad_norm: 1.0
67
+ grad_sample_mode: hooks
68
+ clipping: flat
69
+ secure_mode: false
70
+ enabled: false
71
+ target_epsilon: 8.0
72
+ audit:
73
+ enabled: true
74
+ run_every_epoch: true
75
+ epoch_device: cuda
76
+ q_canary: auto
77
+ num_canaries: 500
78
+ prefix_length: 49
79
+ num_digits: 12
80
+ batch_size: 32
81
+ delta: 1.0e-05
82
+ p_values:
83
+ - 0.05
84
+ - 0.01
85
+ paper_guess_fraction: 0.2
86
+ paper_guess_steps: 20
87
+ enable_holdout_empirical_epsilon: false
88
+ holdout_seed: 42
89
+ tie_seed: 42
90
+ tracking:
91
+ enabled: true
92
+ tensorboard: true
93
+ wandb: false
94
+ wandb_project: codedp-finetune-h200-audit
95
+ wandb_run_name: granite-4.0-h-tiny-cpt-base
96
+ wandb_mode: online
97
+ codecarbon: true
98
+ codecarbon_output_file: codecarbon.csv
99
+ codecarbon_measure_power_secs: 15
100
+ codecarbon_country_iso_code: null
101
+ codecarbon_project_name: codedp-granite-4.0-h-tiny-cpt-base
102
+ moe:
103
+ output_router_logits: false
104
+ router_aux_loss_coef: 0.0
105
+ freeze_router: true
106
+ profile:
107
+ enabled: false
108
+ num_batches: 8
109
+ top_experts: 8
110
+ output_file: moe_expert_profile.json
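resolved_config.yaml is the fully materialized run configuration, so downstream tooling can read the hyperparameters back instead of re-deriving them. A minimal sketch, assuming PyYAML and the granite base path from this upload:

import yaml

with open("granite-4.0-h-tiny/base/resolved_config.yaml") as fh:
    cfg = yaml.safe_load(fh)

# A few of the fields recorded above.
print(cfg["model"]["name"])                               # ibm-granite/granite-4.0-h-tiny
print(cfg["lora"]["r"], cfg["lora"]["alpha"])             # 16 32
print(cfg["dp"]["enabled"], cfg["dp"]["target_epsilon"])  # False 8.0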
granite-4.0-h-tiny/base/scalars.csv ADDED
@@ -0,0 +1,35 @@
1
+ timestamp,event,step,epoch,key,value
2
+ 1773822187.1417096,train_step,10,1,train/step_loss,4.135385597453398
3
+ 1773822187.1417096,train_step,10,1,train/step_real_loss,3.4641597270965576
4
+ 1773822187.1417096,train_step,10,1,train/lr,4.545454545454546e-05
5
+ 1773822187.1417096,train_step,10,1,train/step_canary_loss,14.875
6
+ 1773822187.1417096,train_step,10,1,perf/step_duration_sec,4.6632686029188335
7
+ 1773822187.1417096,train_step,10,1,perf/samples_per_sec,7.291023291842704
8
+ 1773822187.1417096,train_step,10,1,perf/tokens_per_sec,5700.722446774896
9
+ 1773822187.1417096,train_step,10,1,perf/logical_batch_size,34.0
10
+ 1773822187.1417096,train_step,10,1,perf/logical_token_count,26584.0
11
+ 1773822187.1417096,train_step,10,1,perf/gradient_accumulation_steps,4.0
12
+ 1773822187.1417096,train_step,10,1,system/cuda_memory_allocated_gb,16.85233783721924
13
+ 1773822187.1417096,train_step,10,1,system/cuda_max_memory_allocated_gb,60.90630769729614
14
+ 1773822228.5490818,train_step,20,1,train/step_loss,2.740965247154236
15
+ 1773822228.5490818,train_step,20,1,train/step_real_loss,2.740965247154236
16
+ 1773822228.5490818,train_step,20,1,train/lr,9.090909090909092e-05
17
+ 1773822228.5490818,train_step,20,1,perf/step_duration_sec,3.847110118251294
18
+ 1773822228.5490818,train_step,20,1,perf/samples_per_sec,8.317931906390456
19
+ 1773822228.5490818,train_step,20,1,perf/tokens_per_sec,6799.649398102124
20
+ 1773822228.5490818,train_step,20,1,perf/logical_batch_size,32.0
21
+ 1773822228.5490818,train_step,20,1,perf/logical_token_count,26159.0
22
+ 1773822228.5490818,train_step,20,1,perf/gradient_accumulation_steps,4.0
23
+ 1773822228.5490818,train_step,20,1,system/cuda_memory_allocated_gb,16.85233783721924
24
+ 1773822228.5490818,train_step,20,1,system/cuda_max_memory_allocated_gb,60.90630769729614
25
+ 1773822269.2440736,train_step,30,1,train/step_loss,1.4690485894680023
26
+ 1773822269.2440736,train_step,30,1,train/step_real_loss,1.4690485894680023
27
+ 1773822269.2440736,train_step,30,1,train/lr,9.990789447882137e-05
28
+ 1773822269.2440736,train_step,30,1,perf/step_duration_sec,3.921983283944428
29
+ 1773822269.2440736,train_step,30,1,perf/samples_per_sec,8.159137273990844
30
+ 1773822269.2440736,train_step,30,1,perf/tokens_per_sec,6951.839930480011
31
+ 1773822269.2440736,train_step,30,1,perf/logical_batch_size,32.0
32
+ 1773822269.2440736,train_step,30,1,perf/logical_token_count,27265.0
33
+ 1773822269.2440736,train_step,30,1,perf/gradient_accumulation_steps,4.0
34
+ 1773822269.2440736,train_step,30,1,system/cuda_memory_allocated_gb,16.85233783721924
35
+ 1773822269.2440736,train_step,30,1,system/cuda_max_memory_allocated_gb,60.90630769729614
granite-4.0-h-tiny/base/summary.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "perf/gradient_accumulation_steps": 4.0,
3
+ "perf/logical_batch_size": 32.0,
4
+ "perf/logical_token_count": 27265.0,
5
+ "perf/samples_per_sec": 8.159137273990844,
6
+ "perf/step_duration_sec": 3.921983283944428,
7
+ "perf/tokens_per_sec": 6951.839930480011,
8
+ "system/cuda_max_memory_allocated_gb": 60.90630769729614,
9
+ "system/cuda_memory_allocated_gb": 16.85233783721924,
10
+ "train/lr": 9.990789447882137e-05,
11
+ "train/step_canary_loss": 14.875,
12
+ "train/step_loss": 1.4690485894680023,
13
+ "train/step_real_loss": 1.4690485894680023
14
+ }
granite-4.0-h-tiny/base/tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,118 @@
1
+ {%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' %}
2
+ {%- set tools_system_message_suffix = '\n</tools>\n\nFor each tool call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %}
3
+ {%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within <documents></documents> XML tags:\n<documents>' %}
4
+ {%- set documents_system_message_suffix = '\n</documents>\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %}
5
+ {%- set g4_default_system_message = 'You are a helpful assistant. Please ensure responses are professional, accurate, and safe.' %}
6
+ {%- if available_tools is defined and available_tools %}
7
+ {%- set tools = available_tools %}
8
+ {%- endif %}
9
+ {%- set ns = namespace(tools_system_message=tools_system_message_prefix,
10
+ documents_system_message=documents_system_message_prefix,
11
+ default_system_message=g4_default_system_message,
12
+ system_message=''
13
+ ) %}
14
+ {%- if tools %}
15
+ {%- for tool in tools %}
16
+ {%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %}
17
+ {%- endfor %}
18
+ {%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %}
19
+ {%- else %}
20
+ {%- set ns.tools_system_message = '' %}
21
+ {%- endif %}
22
+ {%- if documents %}
23
+ {%- for document in documents %}
24
+ {%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %}
25
+ {%- endfor %}
26
+ {%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %}
27
+ {%- else %}
28
+ {%- set ns.documents_system_message = '' %}
29
+ {%- endif %}
30
+ {%- if messages[0].role == 'system' %}
31
+ {%- if messages[0].content is string %}
32
+ {%- set ns.system_message = messages[0].content %}
33
+ {%- elif messages[0].content is iterable %}
34
+ {%- for entry in messages[0].content %}
35
+ {%- if entry.type== 'text' %}
36
+ {%- if ns.system_message != '' %}
37
+ {%- set ns.system_message = ns.system_message + '\n' %}
38
+ {%- endif %}
39
+ {%- set ns.system_message = ns.system_message + entry.text %}
40
+ {%- endif %}
41
+ {%- endfor %}
42
+ {%- endif %}
43
+ {%- if tools and documents %}
44
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message + '\n\n' + ns.documents_system_message %}
45
+ {%- elif tools %}
46
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %}
47
+ {%- elif documents %}
48
+ {%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %}
49
+ {%- endif %}
50
+ {%- else %}
51
+ {%- if tools and documents %}
52
+ {%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message %}
53
+ {%- elif tools %}
54
+ {%- set ns.system_message = ns.tools_system_message %}
55
+ {%- elif documents %}
56
+ {%- set ns.system_message = ns.documents_system_message %}
57
+ {%- endif %}
58
+ {%- endif %}
59
+ {%- if ns.system_message %}
60
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }}
61
+ {%- else %}
62
+ {{- '<|start_of_role|>system<|end_of_role|>' + ns.default_system_message + '<|end_of_text|>\n' }}
63
+ {%- endif %}
64
+ {%- for message in messages %}
65
+ {%- set content = namespace(val='') %}
66
+ {%- if message.content is string %}
67
+ {%- set content.val = message.content %}
68
+ {%- else %}
69
+ {%- if message.content is iterable %}
70
+ {%- for entry in message.content %}
71
+ {%- if entry.type== 'text' %}
72
+ {%- if content.val != '' %}
73
+ {%- set content.val = content.val + '\n' %}
74
+ {%- endif %}
75
+ {%- set content.val = content.val + entry.text %}
76
+ {%- endif %}
77
+ {%- endfor %}
78
+ {%- endif %}
79
+ {%- endif %}
80
+ {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}
81
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val + '<|end_of_text|>\n' }}
82
+ {%- elif message.role == 'assistant' %}
83
+ {{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }}
84
+ {%- if message.tool_calls %}
85
+ {%- for tool_call in message.tool_calls %}
86
+ {%- if (loop.first and content.val) or (not loop.first) %}
87
+ {{- '\n' }}
88
+ {%- endif %}
89
+ {%- if tool_call.function %}
90
+ {%- set tool_call = tool_call.function %}
91
+ {%- endif %}
92
+ {{- '<tool_call>\n{"name": "' }}
93
+ {{- tool_call.name }}
94
+ {{- '", "arguments": ' }}
95
+ {%- if tool_call.arguments is string %}
96
+ {{- tool_call.arguments }}
97
+ {%- else %}
98
+ {{- tool_call.arguments | tojson }}
99
+ {%- endif %}
100
+ {{- '}\n</tool_call>' }}
101
+ {%- endfor %}
102
+ {%- endif %}
103
+ {{- '<|end_of_text|>\n' }}
104
+ {%- elif message.role == 'tool' %}
105
+ {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
106
+ {{- '<|start_of_role|>user<|end_of_role|>' }}
107
+ {%- endif %}
108
+ {{- '\n<tool_response>\n' }}
109
+ {{- content.val }}
110
+ {{- '\n</tool_response>' }}
111
+ {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
112
+ {{- '<|end_of_text|>\n' }}
113
+ {%- endif %}
114
+ {%- endif %}
115
+ {%- endfor %}
116
+ {%- if add_generation_prompt %}
117
+ {{- '<|start_of_role|>assistant<|end_of_role|>' }}
118
+ {%- endif %}
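Although this run sets use_chat_template: false, the saved template can still be exercised through the standard transformers API. A rough sketch, assuming a recent transformers release that picks up chat_template.jinja from the tokenizer directory; the local path is from this upload:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("granite-4.0-h-tiny/base/tokenizer")
messages = [
    {"role": "user", "content": "Write a function that reverses a string."},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Expected shape: default system preamble, then <|start_of_role|>user<|end_of_role|>...,
# then the assistant generation header, per the template above.
print(prompt)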
granite-4.0-h-tiny/base/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
granite-4.0-h-tiny/base/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,516 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|end_of_text|>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|end_of_text|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "865331112869",
10
+ "569765693871",
11
+ "485177821815",
12
+ "135441121756",
13
+ "367459894796",
14
+ "877482678543",
15
+ "457919547633",
16
+ "765474393376",
17
+ "114848338811",
18
+ "746285987371",
19
+ "649291669397",
20
+ "927914615679",
21
+ "445925149649",
22
+ "691587454538",
23
+ "143777992227",
24
+ "997981281989",
25
+ "425949483533",
26
+ "982993456429",
27
+ "718726519731",
28
+ "172599315861",
29
+ "643489267333",
30
+ "282322838685",
31
+ "781653545886",
32
+ "796415361892",
33
+ "841991688488",
34
+ "211411365397",
35
+ "698218415444",
36
+ "355977139358",
37
+ "682564697312",
38
+ "383837596997",
39
+ "689362171782",
40
+ "749966767285",
41
+ "753159165157",
42
+ "795693824762",
43
+ "669689115557",
44
+ "327491773134",
45
+ "983569279932",
46
+ "612128769512",
47
+ "374327157578",
48
+ "311632789559",
49
+ "523918658846",
50
+ "765981581453",
51
+ "794825141891",
52
+ "873898736873",
53
+ "447445629421",
54
+ "473822473819",
55
+ "181439694557",
56
+ "592538279337",
57
+ "668134915514",
58
+ "643692393748",
59
+ "696651276628",
60
+ "853859348234",
61
+ "778466723723",
62
+ "929826356991",
63
+ "272362973463",
64
+ "694235616268",
65
+ "281673864127",
66
+ "479676316326",
67
+ "646979124677",
68
+ "922327493433",
69
+ "883685933161",
70
+ "264259917554",
71
+ "836746273134",
72
+ "658481324922",
73
+ "481884157827",
74
+ "587787496812",
75
+ "579184949249",
76
+ "912193598348",
77
+ "529679678956",
78
+ "795838284624",
79
+ "159337222655",
80
+ "173781362446",
81
+ "773687856563",
82
+ "535787224917",
83
+ "351885857332",
84
+ "578827344666",
85
+ "198462689911",
86
+ "722618266242",
87
+ "952872416512",
88
+ "517778845323",
89
+ "749665846687",
90
+ "661436365453",
91
+ "259666844669",
92
+ "242851284913",
93
+ "514532995959",
94
+ "161588262349",
95
+ "742765629356",
96
+ "225164373623",
97
+ "676539973863",
98
+ "826214551218",
99
+ "182345464792",
100
+ "232776999554",
101
+ "337326533813",
102
+ "676676697292",
103
+ "929185622831",
104
+ "545512344383",
105
+ "499444466686",
106
+ "314697386682",
107
+ "517379856925",
108
+ "379557332953",
109
+ "614797267726",
110
+ "429781429464",
111
+ "922466849763",
112
+ "721737645236",
113
+ "479227349997",
114
+ "136931728327",
115
+ "259533577263",
116
+ "488538864842",
117
+ "937495658852",
118
+ "489991411364",
119
+ "499148455254",
120
+ "441373944925",
121
+ "899151413682",
122
+ "467893531755",
123
+ "527117488925",
124
+ "928335588653",
125
+ "374439448821",
126
+ "879425227932",
127
+ "867678158885",
128
+ "399749397872",
129
+ "129693547287",
130
+ "689285841825",
131
+ "771619544974",
132
+ "724883568652",
133
+ "516968424863",
134
+ "733737988257",
135
+ "852347289392",
136
+ "296953381169",
137
+ "377273562477",
138
+ "262296912232",
139
+ "547149832394",
140
+ "298464134954",
141
+ "216667245274",
142
+ "843998562287",
143
+ "572154333646",
144
+ "124589118494",
145
+ "841824384614",
146
+ "232896526252",
147
+ "295448593321",
148
+ "123741461297",
149
+ "653573457168",
150
+ "196735786156",
151
+ "377338713663",
152
+ "964342468552",
153
+ "586855179568",
154
+ "484773717614",
155
+ "894885246797",
156
+ "677896358599",
157
+ "848845611563",
158
+ "851852651677",
159
+ "398549545767",
160
+ "454244839926",
161
+ "799364566435",
162
+ "967114116556",
163
+ "817378986438",
164
+ "233795848681",
165
+ "824387273757",
166
+ "916198946615",
167
+ "563117729724",
168
+ "951794811935",
169
+ "374598961236",
170
+ "922867396683",
171
+ "765737843639",
172
+ "175469284871",
173
+ "231853711778",
174
+ "662426712668",
175
+ "711412347158",
176
+ "753466987363",
177
+ "513361312532",
178
+ "712992815957",
179
+ "971621888444",
180
+ "829235161526",
181
+ "585544633356",
182
+ "582471228164",
183
+ "678666359123",
184
+ "557533689478",
185
+ "632962475133",
186
+ "484489193824",
187
+ "489562189822",
188
+ "589547936288",
189
+ "363214487524",
190
+ "244885399387",
191
+ "431751228368",
192
+ "433581868192",
193
+ "486391569221",
194
+ "185438575221",
195
+ "126574388585",
196
+ "741757479784",
197
+ "529854679937",
198
+ "996116119839",
199
+ "616248973917",
200
+ "763531783491",
201
+ "955456118295",
202
+ "364196983365",
203
+ "195792996468",
204
+ "151859598873",
205
+ "399223169721",
206
+ "938488813964",
207
+ "961981959227",
208
+ "183368827562",
209
+ "533417736566",
210
+ "786391632558",
211
+ "665661658354",
212
+ "693281533643",
213
+ "475794684356",
214
+ "652154162978",
215
+ "753233719644",
216
+ "668514843129",
217
+ "819162623892",
218
+ "941169431859",
219
+ "877385381798",
220
+ "752644929761",
221
+ "881136466196",
222
+ "275597777299",
223
+ "731681792655",
224
+ "961133895172",
225
+ "864718285734",
226
+ "963852916563",
227
+ "319584985416",
228
+ "563365646341",
229
+ "811371928234",
230
+ "837131396371",
231
+ "267514771964",
232
+ "944513428457",
233
+ "117298239631",
234
+ "158142752582",
235
+ "252867443568",
236
+ "839269684865",
237
+ "612788593128",
238
+ "145669731981",
239
+ "121557291859",
240
+ "245416776926",
241
+ "799417897197",
242
+ "997958836435",
243
+ "892336777248",
244
+ "158929292238",
245
+ "581976444672",
246
+ "897784492783",
247
+ "492373714791",
248
+ "512659818733",
249
+ "881112998642",
250
+ "619454958782",
251
+ "431149748713",
252
+ "624221476921",
253
+ "125866399464",
254
+ "339882449689",
255
+ "186198784585",
256
+ "943193294691",
257
+ "955668961269",
258
+ "232787996724",
259
+ "215671314196",
260
+ "286173241916",
261
+ "745977673725",
262
+ "556976448182",
263
+ "599961512792",
264
+ "766294538337",
265
+ "934912591213",
266
+ "295118729589",
267
+ "529455466433",
268
+ "196119929397",
269
+ "379571934299",
270
+ "251789649997",
271
+ "564544131355",
272
+ "244371196654",
273
+ "384598329253",
274
+ "887753195844",
275
+ "364947325679",
276
+ "655517954651",
277
+ "673948786567",
278
+ "857231548835",
279
+ "816115936673",
280
+ "644234165531",
281
+ "182782912224",
282
+ "234316622259",
283
+ "421369185549",
284
+ "434632855397",
285
+ "921889371893",
286
+ "415956914763",
287
+ "598916996413",
288
+ "773671349113",
289
+ "952465217972",
290
+ "117657531962",
291
+ "729825168745",
292
+ "691315125346",
293
+ "768461952319",
294
+ "664847713559",
295
+ "953267689786",
296
+ "886464195129",
297
+ "824488329416",
298
+ "837873762491",
299
+ "532833541879",
300
+ "669183782449",
301
+ "941976537588",
302
+ "739394546916",
303
+ "267954879268",
304
+ "637551427887",
305
+ "217756494954",
306
+ "524444658383",
307
+ "117783274348",
308
+ "138218735276",
309
+ "814611949491",
310
+ "711641973413",
311
+ "499156317423",
312
+ "515856611931",
313
+ "454164859837",
314
+ "345271433112",
315
+ "462294118988",
316
+ "511785788222",
317
+ "497294727353",
318
+ "866519986723",
319
+ "334513529294",
320
+ "549946382131",
321
+ "284445431422",
322
+ "396521188476",
323
+ "421435255895",
324
+ "133373659361",
325
+ "322683334381",
326
+ "228358422847",
327
+ "291762694874",
328
+ "143182978129",
329
+ "511923256573",
330
+ "327158398268",
331
+ "879764613759",
332
+ "564395222747",
333
+ "451161679736",
334
+ "538631466654",
335
+ "221762325616",
336
+ "218391991184",
337
+ "322589379462",
338
+ "876537814263",
339
+ "152676556624",
340
+ "332522971941",
341
+ "884354318946",
342
+ "513349618943",
343
+ "116639746413",
344
+ "635185846287",
345
+ "993832498489",
346
+ "813981174797",
347
+ "438745114173",
348
+ "983493951323",
349
+ "724492262421",
350
+ "622553389126",
351
+ "889965243135",
352
+ "364492359246",
353
+ "154962668224",
354
+ "179564995814",
355
+ "418412875665",
356
+ "718951851413",
357
+ "699446724178",
358
+ "624266421831",
359
+ "815458725125",
360
+ "455423278865",
361
+ "393741199486",
362
+ "328552864359",
363
+ "211662639865",
364
+ "218784516525",
365
+ "762486672996",
366
+ "142799718159",
367
+ "858146415154",
368
+ "767858144912",
369
+ "571317457151",
370
+ "635127952696",
371
+ "116427191984",
372
+ "268921994538",
373
+ "523937669294",
374
+ "165429152138",
375
+ "739246183345",
376
+ "591464355756",
377
+ "212985874612",
378
+ "191887635211",
379
+ "967214577653",
380
+ "119342152414",
381
+ "946444632795",
382
+ "618423867817",
383
+ "228565148417",
384
+ "729116422489",
385
+ "527874729936",
386
+ "739784153482",
387
+ "387763951128",
388
+ "331369926711",
389
+ "562716493614",
390
+ "739667844957",
391
+ "562389434565",
392
+ "256497188281",
393
+ "859927364588",
394
+ "417668946583",
395
+ "357621613582",
396
+ "438435178228",
397
+ "485692541169",
398
+ "825815739116",
399
+ "342221452223",
400
+ "697747991249",
401
+ "716763689965",
402
+ "141499982867",
403
+ "818479319499",
404
+ "336813343298",
405
+ "594688742928",
406
+ "472129283475",
407
+ "514354144759",
408
+ "349249721685",
409
+ "546276298359",
410
+ "353755529131",
411
+ "315534574435",
412
+ "523723475786",
413
+ "215826764872",
414
+ "367968398551",
415
+ "569853653352",
416
+ "389715484387",
417
+ "293847485454",
418
+ "714738141818",
419
+ "178478368922",
420
+ "581493616981",
421
+ "589439538674",
422
+ "846657726193",
423
+ "722339992679",
424
+ "138154781148",
425
+ "757785319772",
426
+ "492516914298",
427
+ "919181521716",
428
+ "985781138935",
429
+ "476969195485",
430
+ "313145133463",
431
+ "758963111966",
432
+ "147541537162",
433
+ "557163366873",
434
+ "144373897488",
435
+ "522515164754",
436
+ "724964923582",
437
+ "284776712475",
438
+ "375429755114",
439
+ "181233596124",
440
+ "948585673431",
441
+ "243165586174",
442
+ "396847976144",
443
+ "997724962668",
444
+ "558837194455",
445
+ "163165456396",
446
+ "378749551722",
447
+ "161238482259",
448
+ "754978243758",
449
+ "195388849133",
450
+ "229775525672",
451
+ "262437452884",
452
+ "441377892146",
453
+ "451885565366",
454
+ "981277526855",
455
+ "762495822823",
456
+ "368763327262",
457
+ "757422791351",
458
+ "636324136426",
459
+ "214193645583",
460
+ "412843856172",
461
+ "179386156569",
462
+ "756916173536",
463
+ "892697125149",
464
+ "625334487352",
465
+ "941861857715",
466
+ "887417525236",
467
+ "649516938598",
468
+ "717628619782",
469
+ "438124184139",
470
+ "547563892268",
471
+ "856317483891",
472
+ "313313831273",
473
+ "371496153876",
474
+ "587541149322",
475
+ "265847332563",
476
+ "449549215429",
477
+ "163497196769",
478
+ "861342291298",
479
+ "268433315926",
480
+ "774679513717",
481
+ "851254219729",
482
+ "583527834464",
483
+ "488496781997",
484
+ "556814553861",
485
+ "482829231639",
486
+ "618878266619",
487
+ "147444452794",
488
+ "949235426629",
489
+ "357299947518",
490
+ "175528632226",
491
+ "645527857972",
492
+ "186872457894",
493
+ "552738847828",
494
+ "626748382482",
495
+ "921894985642",
496
+ "943878645871",
497
+ "859289776479",
498
+ "614583493135",
499
+ "933775286797",
500
+ "332234613346",
501
+ "325196781219",
502
+ "142526557681",
503
+ "356722692178",
504
+ "449318681694",
505
+ "687284547244",
506
+ "947262995132",
507
+ "893974619684",
508
+ "797238311233"
509
+ ],
510
+ "is_local": false,
511
+ "model_max_length": 1000000000000000019884624838656,
512
+ "pad_token": "<|pad|>",
513
+ "padding_side": "left",
514
+ "tokenizer_class": "GPT2Tokenizer",
515
+ "unk_token": "<|unk|>"
516
+ }
granite-4.0-h-tiny/base/train.log ADDED
@@ -0,0 +1,3 @@
1
+ 2026-03-18 08:23:07,141 [INFO] new_opacus_codex.train_steps: epoch=1 step=10 loss=3.7489
2
+ 2026-03-18 08:23:48,548 [INFO] new_opacus_codex.train_steps: epoch=1 step=20 loss=3.2296
3
+ 2026-03-18 08:24:29,243 [INFO] new_opacus_codex.train_steps: epoch=1 step=30 loss=2.0376