| """ |
| Minimal Working Fine-tuning Script - No Complex Dependencies |
| Filename: finetune_minimal.py |
| """ |
|
|
| import torch |
| import os |
| import json |
| from torch.utils.data import Dataset, DataLoader |
| from tqdm import tqdm |
| import numpy as np |
|
|
| |
| import subprocess |
| import sys |
|
|
def fix_environment():
    """Repair a broken Python environment by (re)installing pinned packages.

    Removes torchvision (which conflicts in this setup) and installs pinned
    transformers / peft / accelerate releases. Every pip call uses
    ``check=False`` on purpose: a partial failure should not abort the run.
    """
    print("Fixing environment...")
    pip_steps = (
        ["uninstall", "-y", "torchvision"],
        ["install", "--no-deps", "transformers==4.36.0"],
        ["install", "peft==0.7.0", "accelerate==0.25.0"],
    )
    for step in pip_steps:
        subprocess.run([sys.executable, "-m", "pip", *step], check=False)
|
|
| |
| |
|
|
| |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| from peft import LoraConfig, get_peft_model, TaskType |
|
|
class SimpleDataset(Dataset):
    """Dataset of raw text examples loaded from a JSONL file.

    Each non-empty line of the input file must be a JSON object with a
    ``'text'`` key. Examples are tokenized lazily in ``__getitem__`` and
    padded/truncated to a fixed ``max_length`` so every item has the same
    shape.
    """

    def __init__(self, data_path, tokenizer, max_length=1024):
        """
        Args:
            data_path: Path to a JSONL file, one ``{"text": ...}`` object per line.
            tokenizer: Hugging Face-style tokenizer (callable producing
                ``input_ids`` / ``attention_mask`` tensors).
            max_length: Fixed sequence length after padding/truncation.
        """
        self.data = []
        # encoding='utf-8' matters: the training data contains Japanese text
        # and the platform default encoding may not be UTF-8 (e.g. Windows).
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank/trailing lines, which are common in
                    # hand-edited JSONL files and would crash json.loads.
                    continue
                item = json.loads(line)
                self.data.append(item['text'])

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Tokenizer output has shape (1, max_length); squeeze(0) removes only
        # the batch dim (a bare .squeeze() would also collapse a length-1
        # sequence dim and break downstream batching).
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0)
        }
|
|
def train_simple():
    """Fine-tune a causal LM with LoRA using a plain PyTorch training loop.

    Loads the base model and tokenizer, attaches LoRA adapters, trains on
    ``train.jsonl`` with gradient accumulation, saves periodic checkpoints
    and a final adapter, then runs a quick generation smoke test.
    """
    # --- configuration -----------------------------------------------------
    model_name = "LiquidAI/LFM2-2.6B"
    data_dir = "./kokoro_processed_data"
    output_dir = "./lfm_minimal_output"
    batch_size = 4
    learning_rate = 2e-4
    num_epochs = 2
    max_length = 1024
    grad_accum_steps = 4  # effective batch = batch_size * grad_accum_steps

    os.makedirs(output_dir, exist_ok=True)

    print("=" * 60)
    print("Minimal Fine-tuning Script")
    print("=" * 60)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Many causal LMs ship without a pad token; reuse EOS for padding.
        tokenizer.pad_token = tokenizer.eos_token

    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    print("Applying LoRA...")
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    print("Loading dataset...")
    train_dataset = SimpleDataset(
        os.path.join(data_dir, "train.jsonl"),
        tokenizer,
        max_length
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    print(f"\nStarting training for {num_epochs} epochs...")
    model.train()

    global_step = 0
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch+1}/{num_epochs}")

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        optimizer.zero_grad()

        # epoch_step (not global_step) drives the accumulation boundary so
        # one epoch's leftover micro-batches never bleed into the next.
        for epoch_step, batch in enumerate(progress_bar, start=1):
            global_step += 1

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Standard causal-LM objective: labels == input_ids (the model
            # shifts them internally).
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids
            )
            loss = outputs.loss
            total_loss += loss.item()

            # BUG FIX: scale the loss by the accumulation factor so the
            # summed gradient matches one large-batch step; the original
            # unscaled backward effectively multiplied the learning rate
            # by grad_accum_steps.
            (loss / grad_accum_steps).backward()

            if epoch_step % grad_accum_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            progress_bar.set_postfix({'loss': loss.item()})

            if global_step % 500 == 0:
                print(f"\nSaving checkpoint at step {global_step}...")
                ckpt_dir = os.path.join(output_dir, f"checkpoint-{global_step}")
                model.save_pretrained(ckpt_dir)
                tokenizer.save_pretrained(ckpt_dir)

        # BUG FIX: flush gradients left over when the number of batches is
        # not a multiple of grad_accum_steps, so no update is silently lost.
        if epoch_step % grad_accum_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

    print("\nSaving final model...")
    model.save_pretrained(os.path.join(output_dir, "final_model"))
    tokenizer.save_pretrained(os.path.join(output_dir, "final_model"))

    # BUG FIX: the original print contained a mangled multi-byte character
    # that split the source line mid-literal; emit plain ASCII instead.
    print(f"\nTraining complete! Model saved to {output_dir}/final_model")

    print("\nTesting model...")
    test_model(os.path.join(output_dir, "final_model"))
|
|
def test_model(model_path):
    """Smoke-test a fine-tuned model with one Japanese counseling prompt.

    Args:
        model_path: Directory containing the saved model and tokenizer
            (as produced by ``save_pretrained``).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    test_input = "最近ストレスを感じています。"
    prompt = f"""### Instruction:
あなたは心理カウンセラーです。

### Input:
{test_input}

### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    # BUG FIX: the original called .cuda() unconditionally, which crashes on
    # CPU-only hosts even though training supports CPU. Move the inputs to
    # wherever device_map placed the model instead.
    device = next(model.parameters()).device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,  # avoids HF pad/attention warning
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nTest Input: {test_input}")
    print(f"Response: {response.split('### Response:')[-1].strip()}")
|
|
if __name__ == "__main__":
    # Entry point: runs the full fine-tuning pipeline. fix_environment() is
    # defined above but is never invoked automatically; call it manually
    # first if the environment needs repair.
    train_simple()
|
|