|
|
|
|
|
""" |
|
|
Quick Fine-Tuning Script for FinEE |
|
|
=================================== |
|
|
|
|
|
One-command fine-tuning on the 137K dataset. |
|
|
|
|
|
Usage: |
|
|
python scripts/quick_finetune.py --model phi3 # Recommended (2-3 hours, 8GB RAM) |
|
|
python scripts/quick_finetune.py --model qwen3b # Fast (2 hours, 6GB RAM) |
|
|
python scripts/quick_finetune.py --model llama3 # Best quality (8 hours, 20GB RAM) |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import subprocess |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
import json |
|
|
|
|
|
|
|
|
|
|
|
# Registry of supported base models, keyed by the short CLI alias.
# Each entry provides:
#   name:        Hugging Face model id (Transformers/PEFT backend)
#   mlx_name:    4-bit MLX community build (MLX backend on Apple Silicon)
#   description: one-line summary shown to the user
#   memory:      approximate peak RAM needed for fine-tuning
#   time:        rough wall-clock training estimate
MODELS = {
    "phi3": {
        "name": "microsoft/Phi-3-mini-4k-instruct",
        "mlx_name": "mlx-community/Phi-3-mini-4k-instruct-4bit",
        "description": "Phi-3 Mini 3.8B - Best balance of speed and quality",
        "memory": "8GB",
        "time": "2-3 hours",
    },
    "qwen3b": {
        "name": "Qwen/Qwen2.5-3B-Instruct",
        "mlx_name": "mlx-community/Qwen2.5-3B-Instruct-4bit",
        "description": "Qwen 2.5 3B - Fast training",
        "memory": "6GB",
        "time": "2 hours",
    },
    "llama3": {
        "name": "meta-llama/Llama-3.1-8B-Instruct",
        "mlx_name": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
        "description": "Llama 3.1 8B - Highest quality",
        "memory": "20GB",
        "time": "8 hours",
    },
    "mistral": {
        "name": "mistralai/Mistral-7B-Instruct-v0.3",
        "mlx_name": "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
        "description": "Mistral 7B - Good quality",
        "memory": "16GB",
        "time": "6 hours",
    },
}
|
|
|
|
|
|
|
|
def check_mlx():
    """Return True if the MLX training stack (mlx + mlx_lm) is installed.

    Probes availability with ``importlib.util.find_spec`` instead of
    importing the packages, so the check is cheap and has no import-time
    side effects.
    """
    import importlib.util

    return all(
        importlib.util.find_spec(mod) is not None
        for mod in ("mlx", "mlx_lm")
    )
|
|
|
|
|
|
|
|
def check_torch():
    """Return True when PyTorch is installed with a usable accelerator
    (CUDA or Apple MPS)."""
    try:
        import torch
        # Short-circuit: only consult MPS when CUDA is absent.
        has_accelerator = (
            torch.cuda.is_available() or torch.backends.mps.is_available()
        )
    except ImportError:
        # PyTorch not installed at all.
        return False
    return has_accelerator
|
|
|
|
|
|
|
|
def prepare_data():
    """Verify the instruction-tuning data exists and report its size.

    Returns:
        bool: True when ``data/instruction/train.jsonl`` is present,
        False (with guidance printed) otherwise.
    """
    train_file = Path("data/instruction") / "train.jsonl"

    if not train_file.exists():
        print("❌ Training data not found at data/instruction/train.jsonl")
        print(" Run: python scripts/convert_to_instruction.py")
        return False

    # Count samples line-by-line so a large JSONL never has to fit in memory.
    # Explicit encoding avoids platform-dependent defaults on Windows.
    with open(train_file, encoding="utf-8") as f:
        count = sum(1 for _ in f)

    print(f"✅ Training data: {count:,} samples")
    return True
|
|
|
|
|
|
|
|
def train_mlx(model_config: dict, output_dir: str, epochs: int = 1, batch_size: int = 4):
    """Fine-tune with the MLX LoRA trainer (Apple Silicon backend).

    Shells out to ``python -m mlx_lm.lora`` with the 4-bit MLX model and
    the instruction data under data/instruction.

    Returns:
        bool: True when the subprocess exits cleanly, False otherwise.
    """
    model_name = model_config["mlx_name"]

    print(f"\n🚀 Starting MLX fine-tuning with {model_name}")
    print(f" Output: {output_dir}")

    adapter_path = os.path.join(output_dir, "adapters")
    cmd = [sys.executable, "-m", "mlx_lm.lora"]
    cmd += ["--model", model_name]
    cmd += ["--train"]
    cmd += ["--data", "data/instruction"]
    cmd += ["--batch-size", str(batch_size)]
    cmd += ["--num-layers", "8"]
    cmd += ["--learning-rate", "1e-5"]
    cmd += ["--iters", str(epochs * 1000)]
    cmd += ["--save-every", "500"]
    cmd += ["--adapter-path", adapter_path]

    print(f"\n📝 Command: {' '.join(cmd)}")
    print("\n" + "=" * 60)

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"\n❌ Training failed: {e}")
        return False

    print("\n✅ Training complete!")
    return True
|
|
|
|
|
|
|
|
def train_transformers(model_config: dict, output_dir: str, epochs: int = 1) -> bool:
    """Train using Transformers + PEFT.

    Fine-tunes ``model_config["name"]`` with a LoRA adapter on the first
    10,000 samples of data/instruction/train.jsonl, then saves the adapter
    weights and tokenizer under ``output_dir``.

    Args:
        model_config: Entry from MODELS; only the "name" key is used here.
        output_dir: Directory for checkpoints, adapter weights, tokenizer.
        epochs: Number of passes over the (truncated) training set.

    Returns:
        bool: True on success, False when dependencies are missing.
    """
    model_name = model_config["name"]

    print(f"\n🚀 Starting Transformers fine-tuning with {model_name}")
    print(f" Output: {output_dir}")

    # Import lazily so the script still loads (and can report guidance)
    # when the Transformers/PEFT stack is not installed.
    try:
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            TrainingArguments,
            Trainer,
        )
        from peft import LoraConfig, get_peft_model
        import torch
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print(" Install: pip install transformers peft accelerate")
        return False

    print("📥 Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Some models ship without a pad token; reuse EOS so batched padding works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Pick dtype per available hardware. NOTE(review): `device` is only
    # consulted for the fp16 flag below — actual placement is delegated
    # to device_map="auto".
    if torch.cuda.is_available():
        device = "cuda"
        dtype = torch.float16
    elif torch.backends.mps.is_available():
        device = "mps"
        dtype = torch.float16
    else:
        device = "cpu"
        dtype = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto",
    )

    # LoRA: rank-16 adapters on the attention projections only.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    print("📥 Loading training data...")
    from datasets import Dataset

    # Read the JSONL line-by-line into a list of message dicts.
    train_data = []
    with open("data/instruction/train.jsonl") as f:
        for line in f:
            train_data.append(json.loads(line))

    def format_example(example):
        # Render the chat messages with the model's own chat template.
        messages = example["messages"]
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"text": text}

    # Only the first 10k samples are used to keep a quick run tractable.
    dataset = Dataset.from_list(train_data[:10000])
    dataset = dataset.map(format_example)
    # NOTE(review): the dataset holds raw "text" and is never tokenized;
    # Trainer typically needs tokenized inputs or a data collator —
    # verify this runs end-to-end before relying on it.

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch size 16
        learning_rate=1e-5,
        warmup_steps=100,
        logging_steps=50,
        save_steps=500,
        fp16=device == "cuda",  # fp16 flag is CUDA-only; MPS uses dtype above
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    print("🏋️ Starting training...")
    trainer.train()

    # Save adapters separately from the tokenizer so they can be merged
    # or exported independently later.
    model.save_pretrained(os.path.join(output_dir, "lora_adapters"))
    tokenizer.save_pretrained(output_dir)

    print(f"\n✅ Training complete! Model saved to {output_dir}")
    return True
|
|
|
|
|
|
|
|
def main():
    """CLI entry point.

    Parses arguments, validates the dataset, selects a training backend
    (MLX on Apple Silicon, Transformers elsewhere), and runs fine-tuning.

    Returns:
        int: Process exit code — 0 on success, 1 on failure.
    """
    parser = argparse.ArgumentParser(description="Quick fine-tuning for FinEE")
    parser.add_argument(
        "--model",
        choices=list(MODELS.keys()),
        default="phi3",
        help="Model to fine-tune"
    )
    parser.add_argument(
        "--output",
        default="models/finetuned",
        help="Output directory"
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=1,
        help="Number of training epochs"
    )
    parser.add_argument(
        "--backend",
        choices=["auto", "mlx", "transformers"],
        default="auto",
        help="Training backend"
    )

    args = parser.parse_args()

    model_config = MODELS[args.model]

    print("=" * 60)
    print("FINEE QUICK FINE-TUNING")
    print("=" * 60)
    print(f"\n📦 Model: {args.model}")
    print(f" {model_config['description']}")
    print(f" Memory: {model_config['memory']}")
    print(f" Time: {model_config['time']}")

    # Bail out early when the instruction data is missing.
    if not prepare_data():
        return 1

    # Backend selection: prefer MLX (Apple Silicon), fall back to
    # Transformers when an accelerator-capable PyTorch is present.
    if args.backend == "auto":
        if check_mlx():
            backend = "mlx"
        elif check_torch():
            backend = "transformers"
        else:
            print("❌ No training backend available")
            print(" Install: pip install mlx-lm (Mac) or pip install torch (Linux/Windows)")
            return 1
    else:
        backend = args.backend

    print(f"\n🔧 Backend: {backend}")

    output_dir = Path(args.output) / f"finee-{args.model}"
    output_dir.mkdir(parents=True, exist_ok=True)

    if backend == "mlx":
        success = train_mlx(model_config, str(output_dir), args.epochs)
    else:
        success = train_transformers(model_config, str(output_dir), args.epochs)

    if success:
        print("\n" + "=" * 60)
        print("✅ FINE-TUNING COMPLETE!")
        print("=" * 60)
        print(f"\n📁 Model saved to: {output_dir}")
        print("\n📊 Next steps:")
        print(" 1. Run benchmark: python scripts/benchmark.py --test-file data/instruction/test.jsonl")
        # BUG FIX: the next two lines were plain strings, so the literal
        # text "{output_dir}" was printed instead of the actual path.
        print(f" 2. Export to ONNX: python scripts/export_model.py {output_dir}")
        print(f" 3. Upload to HF: huggingface-cli upload Ranjit0034/finee-phi3-4b {output_dir}")
        return 0

    return 1
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
sys.exit(main()) |
|
|
|