finance-entity-extractor / scripts /quick_finetune.py
Ranjit0034's picture
Upload scripts/quick_finetune.py with huggingface_hub
1c67c88 verified
#!/usr/bin/env python3
"""
Quick Fine-Tuning Script for FinEE
===================================

One-command fine-tuning on the 137K dataset.

Usage:
    python scripts/quick_finetune.py --model phi3     # Recommended (2-3 hours, 8GB RAM)
    python scripts/quick_finetune.py --model qwen3b   # Fast (2 hours, 6GB RAM)
    python scripts/quick_finetune.py --model llama3   # Best quality (8 hours, 20GB RAM)
    python scripts/quick_finetune.py --model mistral  # Good quality (6 hours, 16GB RAM)

Author: Ranjit Behera
"""
import os
import sys
import subprocess
import argparse
from pathlib import Path
import json
# Supported base models, keyed by the short name accepted by --model.
# Each entry carries the checkpoint id used by the Transformers backend
# ("name"), the mlx-community checkpoint used by the MLX backend
# ("mlx_name"), and the description / memory / time strings that are shown
# to the user before training starts.
MODELS = {
    "phi3": dict(
        name="microsoft/Phi-3-mini-4k-instruct",
        mlx_name="mlx-community/Phi-3-mini-4k-instruct-4bit",
        description="Phi-3 Mini 3.8B - Best balance of speed and quality",
        memory="8GB",
        time="2-3 hours",
    ),
    "qwen3b": dict(
        name="Qwen/Qwen2.5-3B-Instruct",
        mlx_name="mlx-community/Qwen2.5-3B-Instruct-4bit",
        description="Qwen 2.5 3B - Fast training",
        memory="6GB",
        time="2 hours",
    ),
    "llama3": dict(
        name="meta-llama/Llama-3.1-8B-Instruct",
        mlx_name="mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
        description="Llama 3.1 8B - Highest quality",
        memory="20GB",
        time="8 hours",
    ),
    "mistral": dict(
        name="mistralai/Mistral-7B-Instruct-v0.3",
        mlx_name="mlx-community/Mistral-7B-Instruct-v0.3-4bit",
        description="Mistral 7B - Good quality",
        memory="16GB",
        time="6 hours",
    ),
}
def check_mlx():
    """Report whether the MLX training stack can be imported.

    Returns:
        True when both ``mlx`` and ``mlx_lm`` import without error, False when
        either import raises ImportError.
    """
    try:
        # Imported purely as an availability probe; the names are not used.
        import mlx  # noqa: F401
        import mlx_lm  # noqa: F401
    except ImportError:
        return False
    else:
        return True
def check_torch():
    """Check whether PyTorch is installed with a CUDA or MPS accelerator.

    Returns:
        True when ``torch`` imports and reports either CUDA or MPS as
        available; False when ``torch`` is missing or neither accelerator is
        available.
    """
    try:
        import torch
    except ImportError:
        return False

    if torch.cuda.is_available():
        return True

    # torch.backends.mps was only added in PyTorch 1.12; on older builds a
    # direct attribute access raises AttributeError, so probe defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    return bool(mps_backend is not None and mps_backend.is_available())
def prepare_data():
    """Verify that the instruction-format training set exists and is usable.

    Looks for ``data/instruction/train.jsonl`` relative to the current working
    directory and prints how many samples (non-blank lines) it contains.

    Returns:
        True when the file exists and holds at least one non-blank line;
        False otherwise, after printing a hint on how to generate the data.
    """
    train_file = Path("data/instruction") / "train.jsonl"
    if not train_file.is_file():
        print("❌ Training data not found at data/instruction/train.jsonl")
        print(" Run: python scripts/convert_to_instruction.py")
        return False

    # Count in binary mode: only line boundaries matter here, and this avoids
    # a UnicodeDecodeError when the platform default encoding is not UTF-8.
    with open(train_file, "rb") as f:
        count = sum(1 for line in f if line.strip())

    if count == 0:
        # An empty file would only fail later, deep inside the training run.
        print("❌ Training data at data/instruction/train.jsonl is empty")
        print(" Run: python scripts/convert_to_instruction.py")
        return False

    print(f"✅ Training data: {count:,} samples")
    return True
def train_mlx(model_config: dict, output_dir: str, epochs: int = 1, batch_size: int = 4):
    """Fine-tune with LoRA by running ``mlx_lm.lora`` in a child process.

    Args:
        model_config: Model entry; its "mlx_name" value is passed as --model.
        output_dir: Directory under which the "adapters" folder is placed.
        epochs: Multiplied by 1000 to obtain the --iters value.
        batch_size: Forwarded as --batch-size.

    Returns:
        True when the child process exits successfully, False when it exits
        with a non-zero status.
    """
    mlx_model = model_config["mlx_name"]
    print(f"\n🚀 Starting MLX fine-tuning with {mlx_model}")
    print(f" Output: {output_dir}")

    adapter_dir = os.path.join(output_dir, "adapters")
    total_iters = epochs * 1000

    # Assemble the mlx_lm.lora invocation piece by piece.
    cmd = [sys.executable, "-m", "mlx_lm.lora"]
    cmd += ["--model", mlx_model]
    cmd += ["--train"]
    cmd += ["--data", "data/instruction"]
    cmd += ["--batch-size", str(batch_size)]
    cmd += ["--num-layers", "8"]  # LoRA on 8 layers
    cmd += ["--learning-rate", "1e-5"]
    cmd += ["--iters", str(total_iters)]
    cmd += ["--save-every", "500"]
    cmd += ["--adapter-path", adapter_dir]

    print(f"\n📝 Command: {' '.join(cmd)}")
    print("\n" + "=" * 60)

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as err:
        print(f"\n❌ Training failed: {err}")
        return False

    print("\n✅ Training complete!")
    return True
def _load_jsonl_records(path, limit=None):
    """Read JSON objects from a JSON Lines file, one per non-blank line.

    Stops reading as soon as *limit* records have been collected (when a
    limit is given), so the remainder of a large file is never parsed.
    """
    records = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if limit is not None and len(records) >= limit:
                break
            if line.strip():
                records.append(json.loads(line))
    return records


def train_transformers(model_config: dict, output_dir: str, epochs: int = 1,
                       max_samples: int = 10000, max_seq_length: int = 512):
    """Train LoRA adapters using Transformers + PEFT.

    Args:
        model_config: Model entry; its "name" value is the checkpoint loaded.
        output_dir: Directory for Trainer checkpoints and the tokenizer; the
            adapters are saved in its "lora_adapters" sub-folder.
        epochs: Number of training epochs.
        max_samples: Use at most this many records from
            ``data/instruction/train.jsonl`` (the default keeps the original
            quick-test limit). Pass None to use every record.
        max_seq_length: Tokenized examples are truncated to this many tokens.

    Returns:
        True after training finished and the adapters were saved; False when
        a required package is missing or no training records were found.
    """
    model_name = model_config["name"]
    print(f"\n🚀 Starting Transformers fine-tuning with {model_name}")
    print(f" Output: {output_dir}")

    # Every third-party import lives inside the guard so that any missing
    # package produces the install hint instead of a traceback.
    try:
        from transformers import (
            AutoModelForCausalLM,
            AutoTokenizer,
            DataCollatorForLanguageModeling,
            Trainer,
            TrainingArguments,
        )
        from peft import LoraConfig, get_peft_model
        from datasets import Dataset
        import torch
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        print(" Install: pip install torch transformers peft accelerate datasets")
        return False

    # Load tokenizer and model
    print("📥 Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Padding is required for batching; reuse EOS when no pad token exists.
        tokenizer.pad_token = tokenizer.eos_token

    # Determine device. torch.backends.mps is absent on PyTorch < 1.12, so it
    # is looked up defensively.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.float16
    elif mps_backend is not None and mps_backend.is_available():
        device, dtype = "mps", torch.float16
    else:
        device, dtype = "cpu", torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
        device_map="auto",
    )

    # LoRA config
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Load data
    print("📥 Loading training data...")
    train_data = _load_jsonl_records("data/instruction/train.jsonl", max_samples)
    if not train_data:
        print("❌ No training records found in data/instruction/train.jsonl")
        return False

    def format_example(example):
        # Render the conversation with the model's chat template, then
        # tokenize it. The template already emits the special tokens, so they
        # must not be added a second time.
        text = tokenizer.apply_chat_template(
            example["messages"], tokenize=False, add_generation_prompt=False
        )
        return tokenizer(
            text,
            truncation=True,
            max_length=max_seq_length,
            add_special_tokens=False,
        )

    dataset = Dataset.from_list(train_data)
    # Drop the raw columns: Trainer needs input_ids/attention_mask, and the
    # collator cannot pad free-form "messages" entries.
    dataset = dataset.map(format_example, remove_columns=dataset.column_names)

    # Pads each batch and derives causal-LM labels from input_ids.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Training args
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=1e-5,
        warmup_steps=100,
        logging_steps=50,
        save_steps=500,
        fp16=device == "cuda",
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    print("🏋️ Starting training...")
    trainer.train()

    # Save
    model.save_pretrained(os.path.join(output_dir, "lora_adapters"))
    tokenizer.save_pretrained(output_dir)
    print(f"\n✅ Training complete! Model saved to {output_dir}")
    return True
def main(argv=None):
    """Command-line entry point: pick a model and backend, then fine-tune.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv`` as before.

    Returns:
        0 when training succeeded; 1 when the training data is missing, no
        backend is available, or training failed.
    """
    parser = argparse.ArgumentParser(description="Quick fine-tuning for FinEE")
    parser.add_argument(
        "--model",
        choices=list(MODELS.keys()),
        default="phi3",
        help="Model to fine-tune",
    )
    parser.add_argument(
        "--output",
        default="models/finetuned",
        help="Output directory",
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=1,
        help="Number of training epochs",
    )
    parser.add_argument(
        "--backend",
        choices=["auto", "mlx", "transformers"],
        default="auto",
        help="Training backend",
    )
    args = parser.parse_args(argv)

    # Zero or negative epochs would launch a run that trains nothing.
    if args.epochs < 1:
        parser.error("--epochs must be at least 1")

    model_config = MODELS[args.model]

    print("=" * 60)
    print("FINEE QUICK FINE-TUNING")
    print("=" * 60)
    print(f"\n📦 Model: {args.model}")
    print(f" {model_config['description']}")
    print(f" Memory: {model_config['memory']}")
    print(f" Time: {model_config['time']}")

    # Check data
    if not prepare_data():
        return 1

    # Determine backend: "auto" prefers MLX and falls back to Transformers.
    if args.backend == "auto":
        if check_mlx():
            backend = "mlx"
        elif check_torch():
            backend = "transformers"
        else:
            print("❌ No training backend available")
            print(" Install: pip install mlx-lm (Mac) or pip install torch (Linux/Windows)")
            return 1
    else:
        backend = args.backend
    print(f"\n🔧 Backend: {backend}")

    # Create output directory
    output_dir = Path(args.output) / f"finee-{args.model}"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Train
    if backend == "mlx":
        success = train_mlx(model_config, str(output_dir), args.epochs)
    else:
        success = train_transformers(model_config, str(output_dir), args.epochs)

    if not success:
        return 1

    print("\n" + "=" * 60)
    print("✅ FINE-TUNING COMPLETE!")
    print("=" * 60)
    print(f"\n📁 Model saved to: {output_dir}")
    print("\n📊 Next steps:")
    print(" 1. Run benchmark: python scripts/benchmark.py --test-file data/instruction/test.jsonl")
    # These two lines need the f prefix, otherwise "{output_dir}" is printed
    # literally instead of the actual path.
    print(f" 2. Export to ONNX: python scripts/export_model.py {output_dir}")
    print(f" 3. Upload to HF: huggingface-cli upload Ranjit0034/finee-phi3-4b {output_dir}")
    return 0
# When run as a script, execute the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())