# lfm_complete_code / finetune_trl_supervised.py
# Uploaded via huggingface_hub (commit 27c46c6, verified).
"""
Minimal Working Fine-tuning Script - No Complex Dependencies
Filename: finetune_minimal.py
"""
import torch
import os
import json
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
# Fix the import issues by reinstalling
import subprocess
import sys
def fix_environment():
    """Repair a broken dependency set by reinstalling pinned packages.

    Best-effort: every pip invocation runs with ``check=False`` so a
    failed (un)install does not abort the script.
    """
    print("Fixing environment...")
    pip = [sys.executable, "-m", "pip"]
    for args in (
        ["uninstall", "-y", "torchvision"],
        ["install", "--no-deps", "transformers==4.36.0"],
        ["install", "peft==0.7.0", "accelerate==0.25.0"],
    ):
        subprocess.run(pip + args, check=False)
# Uncomment if needed
# fix_environment()
# Now import after fixing
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
class SimpleDataset(Dataset):
    """Dataset over a JSONL file where each record carries a 'text' field.

    Items are tokenized lazily in ``__getitem__`` and padded/truncated to
    a fixed length, yielding ``input_ids`` and ``attention_mask`` tensors.
    """

    def __init__(self, data_path, tokenizer, max_length=1024):
        # Read every JSON line up front, keeping only the raw text.
        with open(data_path, 'r') as f:
            self.data = [json.loads(line)['text'] for line in f]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.data[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop the leading batch dimension added by return_tensors='pt'.
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        }
def train_simple():
    """Run a minimal LoRA fine-tuning loop over a JSONL text dataset.

    Loads the base model, wraps it with LoRA adapters, trains with a
    simple gradient-accumulation loop, checkpoints every 500 steps, and
    saves the final adapter + tokenizer under ``output_dir``, then runs
    a generation smoke test.
    """
    # --- Configuration -------------------------------------------------
    model_name = "LiquidAI/LFM2-2.6B"
    data_dir = "./kokoro_processed_data"
    output_dir = "./lfm_minimal_output"
    batch_size = 4
    learning_rate = 2e-4
    num_epochs = 2
    max_length = 1024
    grad_accum_steps = 4  # effective batch = batch_size * grad_accum_steps

    os.makedirs(output_dir, exist_ok=True)
    print("=" * 60)
    print("Minimal Fine-tuning Script")
    print("=" * 60)

    # Device used for input tensors. NOTE(review): with device_map="auto"
    # the model may be sharded across devices; putting inputs on the
    # default CUDA device is the usual convention — confirm on multi-GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # --- Tokenizer -----------------------------------------------------
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Causal LMs often ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    # --- Model + LoRA --------------------------------------------------
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    print("Applying LoRA...")
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # --- Data ----------------------------------------------------------
    print("Loading dataset...")
    train_dataset = SimpleDataset(
        os.path.join(data_dir, "train.jsonl"),
        tokenizer,
        max_length
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # --- Training loop -------------------------------------------------
    print(f"\nStarting training for {num_epochs} epochs...")
    model.train()
    optimizer.zero_grad()
    global_step = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in progress_bar:
            global_step += 1
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # FIX: mask padding positions out of the loss. Using raw
            # input_ids as labels trains the model to predict pad tokens;
            # label value -100 is ignored by the cross-entropy loss.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()

            # FIX: scale the backward pass by the accumulation factor so
            # the accumulated gradient matches one large batch (the
            # original applied a 4x-too-large effective learning rate).
            (loss / grad_accum_steps).backward()
            if global_step % grad_accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Show the unscaled per-batch loss for readability.
            progress_bar.set_postfix({'loss': loss.item()})

            if global_step % 500 == 0:
                print(f"\nSaving checkpoint at step {global_step}...")
                ckpt_dir = os.path.join(output_dir, f"checkpoint-{global_step}")
                model.save_pretrained(ckpt_dir)
                tokenizer.save_pretrained(ckpt_dir)

        # FIX: flush any leftover accumulated gradients so the tail of the
        # epoch is not silently dropped or carried into the next epoch.
        if global_step % grad_accum_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    # --- Save + smoke-test ---------------------------------------------
    print("\nSaving final model...")
    final_dir = os.path.join(output_dir, "final_model")
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"\nβœ… Training complete! Model saved to {output_dir}/final_model")
    print("\nTesting model...")
    test_model(final_dir)
def test_model(model_path):
    """Load a saved model and generate a short reply to a fixed Japanese prompt.

    Args:
        model_path: Directory containing the saved model and tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
    test_input = "ζœ€θΏ‘γ‚Ήγƒˆγƒ¬γ‚Ήγ‚’ζ„Ÿγ˜γ¦γ„γΎγ™γ€‚"
    prompt = f"""### Instruction:
あγͺγŸγ―εΏƒη†γ‚«γ‚¦γƒ³γ‚»γƒ©γƒΌγ§γ™γ€‚
### Input:
{test_input}
### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        # FIX: was inputs.input_ids.cuda(), which crashes on CPU-only
        # machines even though the training path supports CPU. Move the
        # inputs to wherever the model actually lives, and pass the
        # attention mask so generate() does not warn/misbehave on padding.
        outputs = model.generate(
            inputs.input_ids.to(model.device),
            attention_mask=inputs.attention_mask.to(model.device),
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nTest Input: {test_input}")
    print(f"Response: {response.split('### Response:')[-1].strip()}")
if __name__ == "__main__":
train_simple()