# LoRA fine-tuning script for teaching multiplication of 6-digit numbers by a constant number (7).
# Uses PEFT + TRL for efficient training on Qwen2.5-0.5B.
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import Dataset
import random

import config


def setup_model_and_tokenizer(use_4bit: bool = False):
    """Load model with optional 4-bit quantization."""
    tokenizer = AutoTokenizer.from_pretrained(config.BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if use_4bit:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            # bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            config.BASE_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
        )
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            config.BASE_MODEL,
            # dtype=torch.bfloat16,
            dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

    return model, tokenizer


def generate_training_data(num_samples: int, val_ratio: float = 0.1, seed: int = 42):
    """Build a chat-formatted dataset of unique "6-digit number * 7" examples and split it."""
    random.seed(seed)

    # Generate unique examples; deduplicate on the 6-digit operand since the multiplier is fixed at 7
    examples = []
    seen = set()
    while len(examples) < num_samples:
        a = random.randint(100000, 999999)
        b = 7
        key = a
        if key in seen:
            continue
        seen.add(key)
        result = a * b

        # Vary the prompt format for robustness
        prompt_templates = [f"{a} * {b}", f"{a}* {b}", f"{a} *{b}"]
        prompt = random.choice(prompt_templates) + random.choice(["", "?", " ?"])
        examples.append(
            {
                "item": [
                    {
                        "role": "system",
                        "content": config.SYSTEM_PROMPT,
                    },
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": str(result)},
                ]
            }
        )

    # Shuffle and split into train/validation (Dataset.shuffle returns a new dataset, so reassign it)
    ds = Dataset.from_list(examples)
    ds = ds.shuffle(seed=seed)
    splitted = ds.train_test_split(test_size=val_ratio, seed=seed)
    return splitted
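
# Illustrative record produced by generate_training_data (the exact prompt and operand
# vary with the seed); the assistant turn is just the exact product, e.g. 123456 * 7 = 864192:
#   {"item": [
#       {"role": "system", "content": config.SYSTEM_PROMPT},
#       {"role": "user", "content": "123456 * 7?"},
#       {"role": "assistant", "content": "864192"},
#   ]}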


def main():
    output_dir = config.OUTPUT_DIR / "lora-multiplicator"

    print("Multiplication LoRA Fine-tuning")
    print(f"\nBase model: {config.BASE_MODEL}")

    # Check CUDA
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(
            f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB"
        )

    # Load data
    print("\nGenerating training data...")
    dataset = generate_training_data(config.NUM_SAMPLES)
    print(
        f"train samples: {len(dataset['train'])}, validation samples: {len(dataset['test'])}"
    )

    # Load model
    print(f"\nLoading model: {config.BASE_MODEL}")
    model, tokenizer = setup_model_and_tokenizer(torch.cuda.is_available())

    peft_config = LoraConfig(
        r=config.LORA_R,
        lora_alpha=config.LORA_ALPHA,
        target_modules=config.TARGET_MODULES,
        lora_dropout=config.LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # effective_batch_size = per_device_train_batch_size × gradient_accumulation_steps × num_gpus
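    # e.g. with the values below on a single GPU: 4 × 4 × 1 = 16 samples per optimizer step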
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        num_train_epochs=3,  # Increased from 1 to 3 for better convergence on this arithmetic task
        per_device_train_batch_size=4,  # Increased from 2 for more stable gradients
        gradient_accumulation_steps=4,  # Effective batch size of 16
        gradient_checkpointing=True,  # Trade compute for memory savings
        learning_rate=1e-3,  # Increased from 2e-4; higher LRs tend to work well for LoRA fine-tuning
        lr_scheduler_type="cosine",  # Cosine annealing for better convergence
        bf16=torch.cuda.is_available(),
        warmup_ratio=0.05,
        logging_steps=10,
        save_strategy="steps",  # Save checkpoints during training
        save_steps=200,  # Save every 200 steps (a multiple of eval_steps, required by load_best_model_at_end)
        save_total_limit=2,  # Keep at most 2 checkpoints to save disk space; the best one is retained
        report_to="none",  # No external reporting
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        remove_unused_columns=False,
        max_grad_norm=1.0,  # Gradient clipping for training stability
        # evaluation
        eval_strategy="steps",  # Changed from "epoch" to track loss during training
        eval_steps=100,  # Evaluate every 100 steps
        do_eval=True,
        per_device_eval_batch_size=8,
    )

    def formatter(example):
        # Render the chat messages into a single training string via the tokenizer's chat template
        return tokenizer.apply_chat_template(
            example["item"],
            tokenize=False,  # return string, not tokens
            add_generation_prompt=False,  # false for training
        )
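    # For Qwen2.5 tokenizers this typically renders each example in ChatML form, roughly:
    #   <|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n
    #   <|im_start|>user\n123456 * 7?<|im_end|>\n
    #   <|im_start|>assistant\n864192<|im_end|>\n
    # (illustrative only; the exact layout comes from the tokenizer's chat_template)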

    # Create trainer
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        peft_config=peft_config,
        formatting_func=formatter,
    )

    # Train
    print("\nStarting training...")
    trainer.train()

    # Save final model
    final_path = output_dir / "final"
    print("\nSaving model...")
    trainer.save_model(str(final_path))
    tokenizer.save_pretrained(str(final_path))

    print("\nTraining complete!")
    print(f"Model saved to: {final_path}")


if __name__ == "__main__":
    main()