# finetune_lfm2_2.6b_FIXED.py
import os
import warnings

import torch
import wandb
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    GPT2Tokenizer,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_from_disk
from dataclasses import dataclass
from typing import Any, Dict, List

warnings.filterwarnings('ignore')

print("=" * 80)
print("LFM2-2.6B FINE-TUNING - FIXED VERSION")
print("=" * 80)
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

if torch.cuda.is_available():
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU Memory: {gpu_memory_gb:.1f} GB")

import bitsandbytes as bnb
print("āœ… BitsAndBytes OK")

# Initialize W&B
wandb.init(
    project="liquid-ai-hackathon-kokorochat",
    name="LFM2-2.6B-counselor-FIXED",
    config={
        "model": "LFM2-2.6B",
        "dataset": "KokoroChat-MultiTurn",
        "task": "psychological-counseling"
    }
)

print("\n" + "=" * 80)
print("LOADING MODEL (WITH FALLBACK)")
print("=" * 80)

LOCAL_MODEL_PATH = "./models/LFM2-2.6B"
HF_MODEL_NAME = "LiquidAI/LFM2-2.6B"

# 1. Load tokenizer with GPT2 fallback
print("\n1. Loading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(
        LOCAL_MODEL_PATH,
        trust_remote_code=True,
        local_files_only=True
    )
    print("   āœ… LFM2 tokenizer loaded!")
except Exception as e:
    print(f"   āš ļø  LFM2 tokenizer failed: {str(e)[:100]}")
    print("   šŸ”„ Using GPT2 tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    print("   āœ… GPT2 tokenizer loaded!")

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 2. QLoRA config
print("\n2. Configuring QLoRA...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# 3. Load model with proper fallback
print("\n3. Loading LFM2-2.6B model...")

# First, try to ensure we have the custom model files
print("   šŸ“„ Checking for custom model files...")

# Check if modeling files exist
custom_files = ["modeling_lfm2.py", "configuration_lfm2.py"]
has_custom_files = all(
    os.path.exists(os.path.join(LOCAL_MODEL_PATH, f)) for f in custom_files
)

if not has_custom_files:
    print("   āš ļø  Custom model files missing in local directory")
    print("   šŸ“„ Need to download from HuggingFace with custom code...")

    # Download with custom code
    from huggingface_hub import snapshot_download

    print("   ā³ Downloading model with custom code (one-time)...")
    snapshot_download(
        repo_id=HF_MODEL_NAME,
        local_dir=LOCAL_MODEL_PATH,
        local_dir_use_symlinks=False,
        ignore_patterns=[]  # Don't ignore anything
    )
    print("   āœ… Model downloaded with custom code!")

# Now load the model
print("   ā³ Loading model (~2-4 minutes)...")
try:
    # Try local first with trust_remote_code
    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_MODEL_PATH,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,  # CRITICAL!
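        # NOTE: trust_remote_code=True lets transformers execute the repo's
        # custom modeling/configuration files (modeling_lfm2.py,
        # configuration_lfm2.py) checked for above, in case the installed
        # transformers version has no built-in LFM2 support. quantization_config
        # applies the 4-bit NF4 setup from step 2, so the 2.6B weights occupy
        # roughly 2.6B * 0.5 bytes ≈ 1.3 GB of VRAM (plus overhead) instead of
        # ~5.2 GB in bf16 (rough estimate, not measured).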
        torch_dtype=torch.bfloat16,
        local_files_only=False  # Allow downloading custom code if needed
    )
    print("   āœ… Model loaded from local!")
except Exception as e:
    print(f"   āš ļø  Local load failed: {str(e)[:100]}")
    print("   šŸ“„ Loading directly from HuggingFace...")

    # Load from HuggingFace Hub
    model = AutoModelForCausalLM.from_pretrained(
        HF_MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )
    print("   āœ… Model loaded from HuggingFace!")

model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
print("   āœ… Model prepared!")

# 4. LoRA - 2.6B configuration
print("\n4. Applying LoRA (2.6B config)...")
lora_config = LoraConfig(
    r=64,  # Higher for 2.6B
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print("\nšŸ“Š Trainable Parameters:")
model.print_trainable_parameters()

# 5. Load dataset
print("\n5. Loading dataset...")
dataset = load_from_disk("./kokorochat_processed_multiturn")
print(f"   āœ… Training: {len(dataset['train']):,}, Val: {len(dataset['test']):,}")

# 6. Data Collator (same as 1.2B)
@dataclass
class DataCollatorForCausalLM:
    tokenizer: Any
    max_length: int = 2048

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        texts = [f["text"] for f in features]
        batch = self.tokenizer(
            texts,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        batch["labels"] = batch["input_ids"].clone()
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100
        return batch

data_collator = DataCollatorForCausalLM(tokenizer=tokenizer)
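
# Optional sanity check (illustrative sketch): run the collator on two dummy
# samples and confirm that pad positions in `labels` are masked to -100, so
# padding never contributes to the loss. The example strings below are
# placeholders, not taken from the KokoroChat dataset.
_demo_batch = data_collator([
    {"text": "Client: I have been feeling anxious lately."},
    {"text": "Counselor: Thank you for sharing that. Can you tell me more?"},
])
assert (_demo_batch["labels"][_demo_batch["input_ids"] == tokenizer.pad_token_id] == -100).all()
print(f"   āœ… Collator check OK: input_ids shape {tuple(_demo_batch['input_ids'].shape)}")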

# 7. Training Configuration - 2.6B optimized
print("\n6. Configuring training (2.6B optimized)...")

gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)

if gpu_memory_gb >= 70:
    per_device_batch = 2
    grad_accum = 16
    print(f"   šŸš€ {gpu_memory_gb:.0f}GB GPU → batch=2, accum=16")
else:
    per_device_batch = 1
    grad_accum = 32
    print(f"   ⚔ {gpu_memory_gb:.0f}GB GPU → batch=1, accum=32")

training_args = TrainingArguments(
    output_dir="./lfm2-2.6b-checkpoints-fixed",

    # Batch (memory-adjusted for 2.6B)
    per_device_train_batch_size=per_device_batch,
    per_device_eval_batch_size=per_device_batch,
    gradient_accumulation_steps=grad_accum,

    # Learning (optimized for 2.6B)
    num_train_epochs=3,    # 2.6B learns faster
    learning_rate=2e-4,    # Lower for stability
    warmup_steps=200,
    lr_scheduler_type="cosine",

    # Optimization
    fp16=False,
    bf16=True,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    optim="paged_adamw_8bit",
    report_to="wandb",
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    logging_dir="./logs",
    remove_unused_columns=False,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
)

effective_batch = per_device_batch * grad_accum
steps_per_epoch = len(dataset['train']) // effective_batch
total_steps = steps_per_epoch * 3

print("\n" + "=" * 80)
print("šŸ“Š 2.6B TRAINING CONFIGURATION")
print("=" * 80)
print("\nāœ… Batch Config:")
print(f"   Per-device: {per_device_batch}")
print(f"   Gradient accum: {grad_accum}")
print(f"   → Effective: {effective_batch}")
print("\nāœ… Learning Config:")
print("   Learning rate: 2e-4 (vs 3e-4 for 1.2B)")
print("   Epochs: 3 (vs 4 for 1.2B)")
print("   LoRA rank: 64 (vs 32 for 1.2B)")
print("\nāœ… Training Stats:")
print(f"   Training samples: {len(dataset['train']):,}")
print(f"   Steps per epoch: {steps_per_epoch:,}")
print(f"   Total steps: {total_steps:,}")
print("\nā±ļø  Estimated Time:")
if gpu_memory_gb >= 80:
    print(f"   ~5-8 hours on {gpu_memory_gb:.0f}GB GPU")
else:
    print(f"   ~8-12 hours on {gpu_memory_gb:.0f}GB GPU")

# 8. Trainer (same as 1.2B)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

# 9. Start training
print("\n" + "=" * 80)
print("šŸš€ STARTING 2.6B TRAINING")
print("=" * 80)
print("šŸ“Š Monitor: https://wandb.ai/sandeeptechiot-ai/liquid-ai-hackathon-kokorochat\n")

try:
    trainer.train()
    print("\nāœ… TRAINING COMPLETE!")
except KeyboardInterrupt:
    print("\nāš ļø  Interrupted - saving...")
    trainer.save_model("./lfm2-2.6b-interrupted")
except Exception as e:
    print(f"\nāŒ Error: {e}")
    import traceback
    traceback.print_exc()
    raise

# 10. Save
output_dir = "./lfm2-2.6b-counselor-final"
lora_dir = "./lfm2-2.6b-counselor-lora"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
model.save_pretrained(lora_dir)
print(f"\nāœ… Model saved to: {output_dir}")

wandb.finish()

print("\n" + "=" * 80)
print("šŸŽ‰ 2.6B TRAINING COMPLETE!")
print("=" * 80)
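
# ---------------------------------------------------------------------------
# Post-training usage (illustrative sketch, not executed by this script):
# load the saved LoRA adapter on top of the base model for a quick generation
# test. The prompt below is a made-up example; the adapter path comes from
# `lora_dir` saved above.
#
#   from peft import PeftModel
#   base = AutoModelForCausalLM.from_pretrained(
#       HF_MODEL_NAME, quantization_config=bnb_config,
#       device_map="auto", trust_remote_code=True, torch_dtype=torch.bfloat16
#   )
#   counselor = PeftModel.from_pretrained(base, "./lfm2-2.6b-counselor-lora")
#   inputs = tokenizer("I have been feeling anxious lately.", return_tensors="pt").to(counselor.device)
#   output = counselor.generate(**inputs, max_new_tokens=128)
#   print(tokenizer.decode(output[0], skip_special_tokens=True))
# ---------------------------------------------------------------------------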