{
  "training_metadata": {
    "timestamp": "20251111_014347",
    "training_date": "2025-11-11",
    "training_time": "01:43:47",
    "final_epoch": 3,
    "total_steps": null,
    "status": "completed",
    "run_name": "GLM-4.5-Air_lr0.0002_20251110_102736"
  },
  "model_config": {
    "base_model": "zai-org/GLM-4.5-Air",
    "model_type": "moe_causal_lm",
    "architecture": "Glm4MoeForCausalLM",
    "total_parameters": 13360487424,
    "trainable_parameters": 3956736,
    "trainable_percentage": "0.0296%"
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj"
    ],
    "exclude_modules": [
      "block_sparse_moe",
      "w1",
      "w2",
      "w3",
      "gate"
    ],
    "bias": "none",
    "use_rslora": true
  },
  "training_config": {
    "num_epochs": 3,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 16,
    "effective_batch_size": 128,
    "learning_rate": 0.0002,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "bf16": true,
    "gradient_checkpointing": true,
    "optim": "adafactor",
    "logging_steps": 10,
    "save_steps": 50,
    "eval_steps": 10
  },
  "dataset_info": {
    "train_samples": 16450,
    "eval_samples": 100,
    "max_seq_length": 1024,
    "data_source": "hyperswitch"
  },
  "hardware_config": {
    "num_gpus": 8,
    "gpu_model": "NVIDIA H200",
    "gpu_memory_per_device_gb": 141,
    "distributed_strategy": "FSDP (Fully Sharded Data Parallel)",
    "fsdp_sharding_strategy": "FULL_SHARD",
    "flash_attention": "2.8.3"
  },
  "moe_config": {
    "use_auxiliary_loss": true,
    "auxiliary_loss_weight": 0.001,
    "freeze_router": false,
    "num_experts_per_token": 2,
    "monitor_expert_usage": true
  },
  "performance_metrics": {
    "final_train_loss": 0.5652141638812477,
    "final_train_runtime": 54547.05,
    "final_train_samples_per_second": 0.905,
    "final_train_steps_per_second": 0.007,
    "final_train_perplexity": 1.7598246332402598
  },
  "framework_versions": {
    "torch": "2.5.1+cu121",
    "transformers": "4.57.1",
    "peft": "0.17.1",
    "accelerate": "1.11.0",
    "python": "3.12.3",
    "flash_attn": "2.8.3"
  },
  "special_features": {
    "flash_attention_2": true,
    "gradient_checkpointing": true,
    "bf16_training": true,
    "fsdp_training": true,
    "attention_only_lora": true,
    "frozen_experts": true,
    "eval_accumulation": true
  }
}