# lfm_complete_code / finetune_trl_supervised.py
# Uploaded via huggingface_hub (commit 27c46c6, verified).
"""
Minimal Working Fine-tuning Script - No Complex Dependencies
Filename: finetune_minimal.py
"""
import torch
import os
import json
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
# Fix the import issues by reinstalling
import subprocess
import sys
def fix_environment():
    """Repair a broken dependency set by reinstalling pinned packages.

    Best-effort: every pip invocation runs with ``check=False`` so a
    failed (un)install does not abort the script.
    """
    print("Fixing environment...")
    pip = [sys.executable, "-m", "pip"]
    for args in (
        ["uninstall", "-y", "torchvision"],
        ["install", "--no-deps", "transformers==4.36.0"],
        ["install", "peft==0.7.0", "accelerate==0.25.0"],
    ):
        subprocess.run(pip + args, check=False)
# Uncomment if needed
# fix_environment()
# Now import after fixing
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
class SimpleDataset(Dataset):
    """Dataset over a JSONL file where each record carries a 'text' field.

    Items are tokenized lazily in ``__getitem__`` and padded/truncated to
    a fixed length, yielding ``input_ids`` and ``attention_mask`` tensors.
    """

    def __init__(self, data_path, tokenizer, max_length=1024):
        # Read every JSON line up front, keeping only the raw text.
        with open(data_path, 'r') as f:
            self.data = [json.loads(line)['text'] for line in f]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.data[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop the leading batch dimension added by return_tensors='pt'.
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        }
def train_simple():
    """Run a minimal LoRA fine-tuning loop over a JSONL text dataset.

    Loads the base model, wraps it with LoRA adapters, trains with a
    simple gradient-accumulation loop, checkpoints every 500 steps, and
    saves the final adapter + tokenizer under ``output_dir``, then runs
    a generation smoke test.
    """
    # --- Configuration -------------------------------------------------
    model_name = "LiquidAI/LFM2-2.6B"
    data_dir = "./kokoro_processed_data"
    output_dir = "./lfm_minimal_output"
    batch_size = 4
    learning_rate = 2e-4
    num_epochs = 2
    max_length = 1024
    grad_accum_steps = 4  # effective batch = batch_size * grad_accum_steps

    os.makedirs(output_dir, exist_ok=True)
    print("=" * 60)
    print("Minimal Fine-tuning Script")
    print("=" * 60)

    # Device used for input tensors. NOTE(review): with device_map="auto"
    # the model may be sharded across devices; putting inputs on the
    # default CUDA device is the usual convention — confirm on multi-GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # --- Tokenizer -----------------------------------------------------
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    # --- Model + LoRA --------------------------------------------------
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    print("Applying LoRA...")
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # --- Data ----------------------------------------------------------
    print("Loading dataset...")
    train_dataset = SimpleDataset(
        os.path.join(data_dir, "train.jsonl"),
        tokenizer,
        max_length
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # --- Training loop -------------------------------------------------
    print(f"\nStarting training for {num_epochs} epochs...")
    model.train()
    optimizer.zero_grad()
    global_step = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            global_step += 1
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # FIX: mask padding positions out of the loss. Using raw
            # input_ids as labels trains the model to predict pad tokens;
            # label value -100 is ignored by the cross-entropy loss.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()

            # FIX: scale the backward pass by the accumulation factor so
            # the accumulated gradient matches one large batch (the
            # original applied a 4x-too-large effective learning rate).
            (loss / grad_accum_steps).backward()
            if global_step % grad_accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Show the unscaled per-batch loss for readability.
            progress_bar.set_postfix({'loss': loss.item()})

            if global_step % 500 == 0:
                print(f"\nSaving checkpoint at step {global_step}...")
                ckpt_dir = os.path.join(output_dir, f"checkpoint-{global_step}")
                model.save_pretrained(ckpt_dir)
                tokenizer.save_pretrained(ckpt_dir)

        # FIX: flush any leftover accumulated gradients so the tail of the
        # epoch is not silently dropped or carried into the next epoch.
        if global_step % grad_accum_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    # --- Save + smoke-test ---------------------------------------------
    print("\nSaving final model...")
    final_dir = os.path.join(output_dir, "final_model")
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"\nβœ… Training complete! Model saved to {output_dir}/final_model")
    print("\nTesting model...")
    test_model(final_dir)
def test_model(model_path):
    """Load a saved model and generate a short reply to a fixed Japanese prompt.

    Args:
        model_path: Directory containing the saved model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    test_input = "ζœ€θΏ‘γ‚Ήγƒˆγƒ¬γ‚Ήγ‚’ζ„Ÿγ˜γ¦γ„γΎγ™γ€‚"
    prompt = f"""### Instruction:
あγͺγŸγ―εΏƒη†γ‚«γ‚¦γƒ³γ‚»γƒ©γƒΌγ§γ™γ€‚
### Input:
{test_input}
### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # FIX: was inputs.input_ids.cuda(), which crashes on CPU-only
        # machines even though the training path supports CPU. Move the
        # inputs to wherever the model actually lives, and pass the
        # attention mask so generate() does not warn/misbehave on padding.
        outputs = model.generate(
            inputs.input_ids.to(model.device),
            attention_mask=inputs.attention_mask.to(model.device),
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nTest Input: {test_input}")
    print(f"Response: {response.split('### Response:')[-1].strip()}")
if __name__ == "__main__":
train_simple()