AdityaNarayan committed on
Commit
7fc0d01
·
verified ·
1 Parent(s): 3922567

added training_info.json

Browse files
Files changed (1) hide show
  1. training_info.json +86 -0
training_info.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "training_metadata": {
3
+ "timestamp": "20251021_084709",
4
+ "training_date": "2025-10-21",
5
+ "training_time": "15:45:44",
6
+ "final_epoch": 5.0,
7
+ "total_steps": 2355,
8
+ "status": "completed"
9
+ },
10
+ "model_config": {
11
+ "base_model": "Kwaipilot/KAT-Dev",
12
+ "model_type": "causal_lm",
13
+ "architecture": "Qwen2ForCausalLM"
14
+ },
15
+ "lora_config": {
16
+ "r": 64,
17
+ "lora_alpha": 128,
18
+ "lora_dropout": 0.05,
19
+ "target_modules": [
20
+ "q_proj",
21
+ "k_proj",
22
+ "v_proj",
23
+ "o_proj",
24
+ "gate_proj",
25
+ "up_proj",
26
+ "down_proj"
27
+ ]
28
+ },
29
+ "training_config": {
30
+ "num_epochs": 5,
31
+ "per_device_train_batch_size": 2,
32
+ "per_device_eval_batch_size": 2,
33
+ "gradient_accumulation_steps": 8,
34
+ "effective_batch_size": 32,
35
+ "learning_rate": 5e-05,
36
+ "lr_scheduler_type": "cosine",
37
+ "warmup_ratio": 0.03,
38
+ "weight_decay": 0.1,
39
+ "max_grad_norm": 0.5,
40
+ "bf16": true,
41
+ "gradient_checkpointing": true,
42
+ "optim": "adamw_torch",
43
+ "logging_steps": 10,
44
+ "save_steps": 50,
45
+ "eval_steps": 25
46
+ },
47
+ "dataset_info": {
48
+ "train_samples": 15057,
49
+ "eval_samples": 1674,
50
+ "max_seq_length": 8192,
51
+ "sample_packing": false
52
+ },
53
+ "hardware_config": {
54
+ "num_gpus": 2,
55
+ "gpu_model": "NVIDIA H200",
56
+ "distributed_strategy": "DeepSpeed ZeRO-2",
57
+ "flash_attention": "2.8.3"
58
+ },
59
+ "performance_metrics": {
60
+ "final_train_loss": 0.4799,
61
+ "final_eval_loss": 0.46369343996047974,
62
+ "final_train_perplexity": 1.6159128028327767,
63
+ "final_eval_perplexity": 1.5899354850369571,
64
+ "final_token_accuracy": 0.8899670094251633,
65
+ "initial_loss": 1.6335,
66
+ "initial_perplexity": 5.121769577787628,
67
+ "initial_accuracy": 0.6116928663104773
68
+ },
69
+ "framework_versions": {
70
+ "torch": "2.4.1+cu124",
71
+ "transformers": "4.57.1",
72
+ "peft": "0.17.1",
73
+ "trl": "0.23.1",
74
+ "deepspeed": "0.18.0",
75
+ "flash_attn": "2.8.3",
76
+ "python": "3.12.3"
77
+ },
78
+ "special_features": {
79
+ "flash_attention_2": true,
80
+ "gradient_checkpointing": true,
81
+ "bf16_training": true,
82
+ "sample_packing": false,
83
+ "deepspeed_zero2": true,
84
+ "distributed_training": true
85
+ }
86
+ }