AdityaNarayan committed on
Commit
311f565
·
verified ·
1 Parent(s): 11bb043

Delete training_info.json

Browse files
Files changed (1) hide show
  1. training_info.json +0 -140
training_info.json DELETED
@@ -1,140 +0,0 @@
1
- {
2
- "model": {
3
- "base_model": "zai-org/GLM-4.5-Air",
4
- "final_model_path": "outputs_fsdp/final_model"
5
- },
6
- "training_config": {
7
- "lora_r": 128,
8
- "lora_alpha": 256,
9
- "lora_dropout": 0.05,
10
- "lora_target_modules": [
11
- "q_proj",
12
- "k_proj",
13
- "v_proj",
14
- "o_proj"
15
- ],
16
- "learning_rate": 2.5e-05,
17
- "lr_scheduler_type": "cosine",
18
- "micro_batch_size": 1,
19
- "gradient_accumulation_steps": 2,
20
- "effective_batch_size": 32,
21
- "sequence_length": 16384,
22
- "chunk_overlap": 2048,
23
- "weight_decay": 0.01,
24
- "max_grad_norm": 1.0,
25
- "warmup_ratio": 0.1,
26
- "eval_split": 0.05,
27
- "bf16": true,
28
- "seed": 42
29
- },
30
- "hardware": {
31
- "num_gpus": 16,
32
- "gpu_name": "NVIDIA H200",
33
- "num_nodes": 1,
34
- "gpus_per_node": 8
35
- },
36
- "phases": [
37
- {
38
- "phase": 1,
39
- "name": "phase1_foundation",
40
- "description": "Foundation: Learn codebase structure and file patterns",
41
- "dataset": "dataset/phase1_foundation.jsonl",
42
- "epochs": 2,
43
- "learning_rate": 2.5e-05,
44
- "warmup_ratio": 0.15,
45
- "num_train_samples": 9293,
46
- "num_eval_samples": 512,
47
- "num_chunks": 9805,
48
- "train_metrics": {
49
- "train_runtime": 45748.92132782936,
50
- "train_runtime_minutes": 762.4820221304893,
51
- "train_steps": 581,
52
- "train_loss": 0.5921854273129171,
53
- "train_perplexity": 1.8079352121008547,
54
- "samples_per_second": 0.40626094475136876,
55
- "steps_per_second": 0.012699752980767526
56
- },
57
- "eval_metrics": {
58
- "eval_loss": 0.36529209305808763,
59
- "eval_perplexity": 1.4409348337482015,
60
- "eval_accuracy": 88.77101374493351,
61
- "best_eval_loss": 0.36561795309899026
62
- }
63
- },
64
- {
65
- "phase": 2,
66
- "name": "phase2_evolution",
67
- "description": "Evolution: Learn commit patterns and code changes",
68
- "dataset": "dataset/phase2_evolution.jsonl",
69
- "epochs": 2,
70
- "learning_rate": 2e-05,
71
- "warmup_ratio": 0.1,
72
- "num_train_samples": 16622,
73
- "num_eval_samples": 1545,
74
- "num_chunks": 18167,
75
- "train_metrics": {
76
- "train_runtime": 88820.11419820786,
77
- "train_runtime_minutes": 1480.3352366367976,
78
- "train_steps": 1039,
79
- "train_loss": 0.790716471444525,
80
- "train_perplexity": 2.204975662547615,
81
- "samples_per_second": 0.37428458970243894,
82
- "steps_per_second": 0.0116978007670808
83
- },
84
- "eval_metrics": {
85
- "eval_loss": 2.551615942151948,
86
- "eval_perplexity": 12.827816051917177,
87
- "eval_accuracy": 40.84345327062199,
88
- "best_eval_loss": 2.5516352893463
89
- }
90
- },
91
- {
92
- "phase": 3,
93
- "name": "phase3_pr_mastery",
94
- "description": "PR Mastery: Learn PR review patterns and discussions",
95
- "dataset": "dataset/phase3_pr_mastery.jsonl",
96
- "epochs": 1,
97
- "learning_rate": 1.5e-05,
98
- "warmup_ratio": 0.05,
99
- "num_train_samples": 9797,
100
- "num_eval_samples": 509,
101
- "num_chunks": 10306,
102
- "train_metrics": {
103
- "train_runtime": 24744.46716451645,
104
- "train_runtime_minutes": 412.40778607527415,
105
- "train_steps": 306,
106
- "train_loss": 0.49508867293498876,
107
- "train_perplexity": 1.6406437133004639,
108
- "samples_per_second": 0.3959268928631,
109
- "steps_per_second": 0.012366400859049565
110
- },
111
- "eval_metrics": {
112
- "eval_loss": 0.5012174650255474,
113
- "eval_perplexity": 1.6507297535648182,
114
- "eval_accuracy": 90.171501333015,
115
- "best_eval_loss": 0.5012283607793506
116
- }
117
- }
118
- ],
119
- "phase_checkpoints": [
120
- "outputs_fsdp/phase1_foundation/final",
121
- "outputs_fsdp/phase2_evolution/final",
122
- "outputs_fsdp/phase3_pr_mastery/final"
123
- ],
124
- "summary": {
125
- "initial_train_loss": 0.5921854273129171,
126
- "final_train_loss": 0.49508867293498876,
127
- "initial_eval_loss": 0.36529209305808763,
128
- "final_eval_loss": 0.5012174650255474,
129
- "initial_perplexity": 1.4409348337482015,
130
- "final_perplexity": 1.6507297535648182,
131
- "total_epochs": 5,
132
- "total_phases": 3,
133
- "total_steps": 1926,
134
- "total_training_time_seconds": 161561.2551908493,
135
- "total_training_time_hours": 44.87812644190259
136
- },
137
- "timestamp": "20251211_212051",
138
- "run_name": "glm-air-curriculum-16gpu",
139
- "output_directory": "outputs_fsdp"
140
- }