"""
Minimal Working Fine-tuning Script - No Complex Dependencies
Filename: finetune_minimal.py
"""
import torch
import os
import json
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
# Fix the import issues by reinstalling
import subprocess
import sys

def fix_environment():
    """Fix the broken environment"""
    print("Fixing environment...")
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "torchvision"], check=False)
    subprocess.run([sys.executable, "-m", "pip", "install", "--no-deps", "transformers==4.36.0"], check=False)
    subprocess.run([sys.executable, "-m", "pip", "install", "peft==0.7.0", "accelerate==0.25.0"], check=False)

# Uncomment if needed
# fix_environment()
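# Note: fix_environment() is intended to run before the transformers/peft imports
# below. If those packages were already imported in the current session (e.g. in a
# notebook), restart the interpreter after reinstalling so the pinned versions load.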
# Now import after fixing
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

class SimpleDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length=1024):
        self.data = []
        with open(data_path, 'r') as f:
            for line in f:
                item = json.loads(line)
                self.data.append(item['text'])
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        }
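
# The loader above assumes train.jsonl contains one JSON object per line with a
# single "text" field holding the full prompt/response string, e.g.
# (illustrative example only, not taken from the actual dataset):
#   {"text": "### Instruction:\n...\n### Input:\n...\n### Response:\n..."}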

def train_simple():
    """Simple training without complex dependencies"""
    # Configuration
    model_name = "LiquidAI/LFM2-2.6B"
    data_dir = "./kokoro_processed_data"
    output_dir = "./lfm_minimal_output"
    batch_size = 4
    learning_rate = 2e-4
    num_epochs = 2
    max_length = 1024
    os.makedirs(output_dir, exist_ok=True)

    print("=" * 60)
    print("Minimal Fine-tuning Script")
    print("=" * 60)

    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    # Apply LoRA
    print("Applying LoRA...")
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Load dataset
    print("Loading dataset...")
    train_dataset = SimpleDataset(
        os.path.join(data_dir, "train.jsonl"),
        tokenizer,
        max_length
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop with gradient accumulation over 4 micro-batches
    grad_accum_steps = 4
    print(f"\nStarting training for {num_epochs} epochs...")
    model.train()
    global_step = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            global_step += 1
            # Move to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass (labels = input_ids gives the standard causal-LM loss)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass, scaled so the accumulated gradient matches one large batch
            (loss / grad_accum_steps).backward()

            # Update weights every grad_accum_steps steps (gradient accumulation)
            if global_step % grad_accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})

            # Save checkpoint
            if global_step % 500 == 0:
                print(f"\nSaving checkpoint at step {global_step}...")
                model.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}"))
                tokenizer.save_pretrained(os.path.join(output_dir, f"checkpoint-{global_step}"))

        # Flush gradients left over from an incomplete accumulation window
        if global_step % grad_accum_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    # Save final model
    print("\nSaving final model...")
    model.save_pretrained(os.path.join(output_dir, "final_model"))
    tokenizer.save_pretrained(os.path.join(output_dir, "final_model"))
    print(f"\n✅ Training complete! Model saved to {output_dir}/final_model")

    # Test the model
    print("\nTesting model...")
    test_model(os.path.join(output_dir, "final_model"))
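
    # Note: save_pretrained on a PEFT-wrapped model writes only the LoRA adapter
    # (adapter_config.json + adapter weights), not a full standalone checkpoint.
    # A sketch for exporting a merged model, assuming peft's merge_and_unload API:
    #   merged = model.merge_and_unload()
    #   merged.save_pretrained(os.path.join(output_dir, "merged_model"))
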

def test_model(model_path):
    """Test the fine-tuned model"""
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # "final_model" holds the LoRA adapter; with peft installed, recent transformers
    # versions resolve the base model from adapter_config.json when loading it here.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True  # same flag as the training-time load
    )
    model.eval()

    test_input = "最近ストレスを感じています。"  # "I have been feeling stressed lately."
    # Instruction below: "You are a psychological counselor."
    prompt = f"""### Instruction:
あなたは心理カウンセラーです。
### Input:
{test_input}
### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nTest Input: {test_input}")
    print(f"Response: {response.split('### Response:')[-1].strip()}")


if __name__ == "__main__":
    train_simple()