| { | |
| "model": { | |
| "base_model": "zai-org/GLM-4.6", | |
| "final_model_path": "outputs_fsdp/final_model" | |
| }, | |
| "training_config": { | |
| "lora_r": 64, | |
| "lora_alpha": 128, | |
| "lora_dropout": 0.05, | |
| "lora_target_modules": [ | |
| "q_proj", | |
| "k_proj", | |
| "v_proj", | |
| "o_proj" | |
| ], | |
| "learning_rate": 2.5e-05, | |
| "lr_scheduler_type": "cosine", | |
| "micro_batch_size": 1, | |
| "gradient_accumulation_steps": 2, | |
| "effective_batch_size": 32, | |
| "sequence_length": 16384, | |
| "chunk_overlap": 2048, | |
| "weight_decay": 0.01, | |
| "max_grad_norm": 1.0, | |
| "warmup_ratio": 0.1, | |
| "eval_split": 0.05, | |
| "bf16": true, | |
| "seed": 42 | |
| }, | |
| "hardware": { | |
| "num_gpus": 16, | |
| "gpu_name": "NVIDIA H200", | |
| "num_nodes": 2, | |
| "gpus_per_node": 8 | |
| }, | |
| "phases": [ | |
| { | |
| "phase": 1, | |
| "name": "phase1_foundation", | |
| "description": "Foundation: Learn codebase structure and file patterns", | |
| "dataset": "dataset/phase1_foundation.jsonl", | |
| "epochs": 2, | |
| "learning_rate": 2.5e-05, | |
| "warmup_ratio": 0.15, | |
| "num_train_samples": 9293, | |
| "num_eval_samples": 512, | |
| "num_chunks": 9805, | |
| "train_metrics": { | |
| "train_runtime": 116260.86471509933, | |
| "train_runtime_minutes": 1937.6810785849889, | |
| "train_steps": 581, | |
| "train_loss": 0.6090635275339993, | |
| "train_perplexity": 1.838708692210649, | |
| "samples_per_second": 0.1598646289578659, | |
| "steps_per_second": 0.004997382407431405 | |
| }, | |
| "eval_metrics": { | |
| "eval_loss": 0.34885369252151577, | |
| "eval_perplexity": 1.4174417928358451, | |
| "eval_accuracy": 90.55727554179566, | |
| "best_eval_loss": 0.34950478435712284 | |
| } | |
| }, | |
| { | |
| "phase": 2, | |
| "name": "phase2_evolution", | |
| "description": "Evolution: Learn commit patterns and code changes", | |
| "dataset": "dataset/phase2_evolution.jsonl", | |
| "epochs": 2, | |
| "learning_rate": 2e-05, | |
| "warmup_ratio": 0.1, | |
| "num_train_samples": 16622, | |
| "num_eval_samples": 1545, | |
| "num_chunks": 18167, | |
| "train_metrics": { | |
| "train_runtime": 232094.11823368073, | |
| "train_runtime_minutes": 3868.235303894679, | |
| "train_steps": 1039, | |
| "train_loss": 0.7480631428085002, | |
| "train_perplexity": 2.112903658217297, | |
| "samples_per_second": 0.14323499558282102, | |
| "steps_per_second": 0.004476632186576557 | |
| }, | |
| "eval_metrics": { | |
| "eval_loss": 2.45627436399119, | |
| "eval_perplexity": 11.661284805363318, | |
| "eval_accuracy": 42.272774071154785, | |
| "best_eval_loss": 2.456274959661988 | |
| } | |
| }, | |
| { | |
| "phase": 3, | |
| "name": "phase3_pr_mastery", | |
| "description": "PR Mastery: Learn PR review patterns and discussions", | |
| "dataset": "dataset/phase3_pr_mastery.jsonl", | |
| "epochs": 1, | |
| "learning_rate": 1.5e-05, | |
| "warmup_ratio": 0.05, | |
| "num_train_samples": 9797, | |
| "num_eval_samples": 509, | |
| "num_chunks": 10306, | |
| "train_metrics": { | |
| "train_runtime": 63952.77484560013, | |
| "train_runtime_minutes": 1065.8795807600022, | |
| "train_steps": 306, | |
| "train_loss": 0.4651245652436236, | |
| "train_perplexity": 1.592212510874149, | |
| "samples_per_second": 0.15319116369309535, | |
| "steps_per_second": 0.004784780656332263 | |
| }, | |
| "eval_metrics": { | |
| "eval_loss": 0.47181596884547616, | |
| "eval_perplexity": 1.6029023726075684, | |
| "eval_accuracy": 90.83844610286057, | |
| "best_eval_loss": 0.47182859617532813 | |
| } | |
| } | |
| ], | |
| "phase_checkpoints": [ | |
| "outputs_fsdp/phase1_foundation/final", | |
| "outputs_fsdp/phase2_evolution/final", | |
| "outputs_fsdp/phase3_pr_mastery/final" | |
| ], | |
| "summary": { | |
| "initial_train_loss": 0.6090635275339993, | |
| "final_train_loss": 0.4651245652436236, | |
| "initial_eval_loss": 0.34885369252151577, | |
| "final_eval_loss": 0.47181596884547616, | |
| "initial_perplexity": 1.4174417928358451, | |
| "final_perplexity": 1.6029023726075684, | |
| "total_epochs": 5, | |
| "total_phases": 3, | |
| "total_steps": 1926, | |
| "total_training_time_seconds": 419524.9958562851, | |
| "total_training_time_hours": 116.5347210711903 | |
| }, | |
| "timestamp": "20251214_065303", | |
| "run_name": "glm-curriculum-16gpu", | |
| "output_directory": "outputs_fsdp" | |
| } |