""" Minimal Working Fine-tuning Script - No Complex Dependencies Filename: finetune_minimal.py """ import torch import os import json from torch.utils.data import Dataset, DataLoader from tqdm import tqdm import numpy as np # Fix the import issues by reinstalling import subprocess import sys def fix_environment(): """Fix the broken environment""" print("Fixing environment...") subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "torchvision"], check=False) subprocess.run([sys.executable, "-m", "pip", "install", "--no-deps", "transformers==4.36.0"], check=False) subprocess.run([sys.executable, "-m", "pip", "install", "peft==0.7.0", "accelerate==0.25.0"], check=False) # Uncomment if needed # fix_environment() # Now import after fixing from transformers import AutoModelForCausalLM, AutoTokenizer from peft import LoraConfig, get_peft_model, TaskType class SimpleDataset(Dataset): def __init__(self, data_path, tokenizer, max_length=1024): self.data = [] with open(data_path, 'r') as f: for line in f: item = json.loads(line) self.data.append(item['text']) self.tokenizer = tokenizer self.max_length = max_length def __len__(self): return len(self.data) def __getitem__(self, idx): text = self.data[idx] encoded = self.tokenizer( text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt' ) return { 'input_ids': encoded['input_ids'].squeeze(), 'attention_mask': encoded['attention_mask'].squeeze() } def train_simple(): """Simple training without complex dependencies""" # Configuration model_name = "LiquidAI/LFM2-2.6B" data_dir = "./kokoro_processed_data" output_dir = "./lfm_minimal_output" batch_size = 4 learning_rate = 2e-4 num_epochs = 2 max_length = 1024 os.makedirs(output_dir, exist_ok=True) print("="*60) print("Minimal Fine-tuning Script") print("="*60) # Device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Device: {device}") # Load tokenizer print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token # Load model print("Loading model...") model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True ) # Apply LoRA print("Applying LoRA...") peft_config = LoraConfig( r=32, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type=TaskType.CAUSAL_LM ) model = get_peft_model(model, peft_config) model.print_trainable_parameters() # Load dataset print("Loading dataset...") train_dataset = SimpleDataset( os.path.join(data_dir, "train.jsonl"), tokenizer, max_length ) train_loader = DataLoader( train_dataset, batch_size=batch_size, shuffle=True ) # Optimizer optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) # Training loop print(f"\nStarting training for {num_epochs} epochs...") model.train() global_step = 0 for epoch in range(num_epochs): print(f"\nEpoch {epoch+1}/{num_epochs}") total_loss = 0 progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}") for batch in progress_bar: global_step += 1 # Move to device input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) # Forward pass outputs = model( input_ids=input_ids, attention_mask=attention_mask, labels=input_ids ) loss = outputs.loss total_loss += loss.item() # Backward pass loss.backward() # Update weights every 4 steps (gradient accumulation) if global_step % 4 == 0: optimizer.step() optimizer.zero_grad() # Update progress bar progress_bar.set_postfix({'loss': loss.item()}) # Save checkpoint if global_step % 500 == 0: print(f"\nSaving checkpoint at step {global_step}...") model.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}")) tokenizer.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}")) avg_loss = total_loss / len(train_loader) print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}") # Save final model print("\nSaving final model...") model.save_pretrained(os.path.join(output_dir, "final_model")) tokenizer.save_pretrained(os.path.join(output_dir, "final_model")) print(f"\n✅ Training complete! Model saved to {output_dir}/final_model") # Test the model print("\nTesting model...") test_model(os.path.join(output_dir, "final_model")) def test_model(model_path): """Test the fine-tuned model""" tokenizer = AutoTokenizer.from_pretrained(model_path) model = AutoModelForCausalLM.from_pretrained( model_path, torch_dtype=torch.bfloat16, device_map="auto" ) test_input = "最近ストレスを感じています。" prompt = f"""### Instruction: あなたは心理カウンセラーです。 ### Input: {test_input} ### Response: """ inputs = tokenizer(prompt, return_tensors="pt") with torch.no_grad(): outputs = model.generate( inputs.input_ids.cuda(), max_new_tokens=100, temperature=0.7, do_sample=True ) response = tokenizer.decode(outputs[0], skip_special_tokens=True) print(f"\nTest Input: {test_input}") print(f"Response: {response.split('### Response:')[-1].strip()}") if __name__ == "__main__": train_simple()