Add training script for financial options expert model

5ef4d59 verified 24 days ago

9.96 kB

	"""
	Financial Options & Market Prediction Expert Model
	===================================================
	Fine-tunes Mistral-7B-Instruct-v0.3 with QLoRA on ~745K financial instruction examples.
	Combines 3 datasets:
	1. sujet-ai/Sujet-Finance-Instruct-177k (sentiment, NER, QA)
	2. gbharti/finance-alpaca (68K financial Q&A including options)
	3. Josephgflowers/Finance-Instruct-500k (500K broad financial instructions)

	The model is trained with a system prompt emphasizing:
	- Options trading analysis
	- Explaining HOW data features affect market predictions
	- Step-by-step reasoning with feature importance
	"""

	import os
	import torch
	import trackio
	from datasets import load_dataset, concatenate_datasets
	from transformers import (
	AutoModelForCausalLM,
	AutoTokenizer,
	BitsAndBytesConfig,
	)
	from peft import LoraConfig, prepare_model_for_kbit_training
	from trl import SFTTrainer, SFTConfig

	# ============================================================================
	# CONFIG
	# ============================================================================
	MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"  # base checkpoint that gets fine-tuned
	HUB_MODEL_ID = "Saksham7772/FinOptions-Mistral-7B"  # Hub repo the trainer pushes to
	OUTPUT_DIR = "./finopt-mistral-7b-qlora"  # local directory for checkpoints and logs
	PROJECT_NAME = "financial-options-expert"  # experiment-tracking project
	RUN_NAME = "qlora-r64-lr2e4-ep2"  # experiment-tracking run label

	# Fallback system message for dataset rows that carry no system prompt of
	# their own. One list entry per output line; entries are joined with "\n".
	SYSTEM_PROMPT = "\n".join(
	    [
	        "You are a quantitative financial analyst and options trading expert. "
	        "For every analysis you provide:",
	        "1. Identify which input data features are most influential "
	        "(e.g., implied volatility, volume, earnings, macro indicators, sentiment)",
	        "2. Explain the directional impact of each feature on the prediction "
	        "(bullish/bearish/neutral and why)",
	        "3. Provide your market prediction or options strategy recommendation with clear reasoning",
	        "4. Express your confidence level and key risk factors",
	        "Think step by step before answering.",
	    ]
	)

	# ============================================================================
	# TRACKIO — Experiment Tracking
	# ============================================================================
	# Open the tracking run before any heavy work starts. The config mapping is a
	# hand-written record of the hyperparameters; it is not read back by the
	# script, so it must be kept in sync with the LoRA / SFT settings manually.
	trackio.init(
	    name=RUN_NAME,
	    project=PROJECT_NAME,
	    config=dict(
	        model=MODEL_ID,
	        lora_r=64,
	        lora_alpha=128,
	        learning_rate=2e-4,
	        epochs=2,
	        batch_size_per_device=2,
	        gradient_accumulation_steps=8,
	        effective_batch_size=16,  # 2 per device x 8 accumulation steps
	        quant="4bit-nf4-double",
	        max_length=2048,
	        datasets=[
	            "sujet-ai/Sujet-Finance-Instruct-177k",
	            "gbharti/finance-alpaca",
	            "Josephgflowers/Finance-Instruct-500k",
	        ],
	    ),
	)

	# ============================================================================
	# DATASET PREPARATION
	# ============================================================================
	# Three-line banner emitted with a single call.
	print(
	    "=" * 60,
	    "Loading and converting datasets to messages format...",
	    "=" * 60,
	    sep="\n",
	)

	# --- Dataset 1: Sujet Finance 177K ---
	ds_sujet = load_dataset(
	    "sujet-ai/Sujet-Finance-Instruct-177k",
	    split="train",
	)
	print(f" Sujet Finance: {ds_sujet.num_rows} rows")

	def convert_sujet(example):
	    """Convert one Sujet-Finance row into a chat ``messages`` record.

	    Args:
	        example: Mapping read through the ``system_prompt``, ``user_prompt``
	            and ``answer`` keys.

	    Returns:
	        ``{"messages": [system, user, assistant]}`` with stripped contents, or
	        ``{"messages": None}`` when the user prompt or the answer is missing,
	        ``None`` or blank (such rows are filtered out by the caller).
	    """
	    # `or ""` covers keys that exist but hold None; `.get(key, "")` alone
	    # would pass None to `.strip()` and raise AttributeError.
	    user = (example.get("user_prompt") or "").strip()
	    answer = (example.get("answer") or "").strip()
	    if not user or not answer:
	        return {"messages": None}
	    # Keep the row's own system prompt when it has one; otherwise use the default.
	    system = (example.get("system_prompt") or "").strip() or SYSTEM_PROMPT
	    return {
	        "messages": [
	            {"role": "system", "content": system},
	            {"role": "user", "content": user},
	            {"role": "assistant", "content": answer},
	        ]
	    }

	# Convert every row, then drop the ones the converter rejected.
	ds_sujet = ds_sujet.map(
	    convert_sujet,
	    remove_columns=ds_sujet.column_names,
	    num_proc=4,
	).filter(lambda x: x["messages"] is not None, num_proc=4)
	print(f" Sujet Finance after conversion: {ds_sujet.num_rows} rows")

	# --- Dataset 2: Finance Alpaca 68K ---
	ds_alpaca = load_dataset(
	    "gbharti/finance-alpaca",
	    split="train",
	)
	print(f" Finance Alpaca: {ds_alpaca.num_rows} rows")

	def convert_alpaca(example):
	    """Convert one Alpaca-style row into a chat ``messages`` record.

	    Args:
	        example: Mapping read through the ``instruction``, ``input`` and
	            ``output`` keys.

	    Returns:
	        ``{"messages": [system, user, assistant]}`` where the system turn is
	        always ``SYSTEM_PROMPT`` and the user turn is the instruction followed,
	        when present, by a blank line and the input. ``{"messages": None}``
	        when the instruction or the output is missing, ``None`` or blank.
	    """
	    # `or ""` covers keys that exist but hold None; `.get(key, "")` alone
	    # would pass None to `.strip()` and raise AttributeError.
	    instruction = (example.get("instruction") or "").strip()
	    output = (example.get("output") or "").strip()
	    if not instruction or not output:
	        return {"messages": None}
	    inp = (example.get("input") or "").strip()
	    user_content = f"{instruction}\n\n{inp}" if inp else instruction
	    return {
	        "messages": [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": user_content},
	            {"role": "assistant", "content": output},
	        ]
	    }

	# Convert every row, then drop the ones the converter rejected.
	ds_alpaca = ds_alpaca.map(
	    convert_alpaca,
	    remove_columns=ds_alpaca.column_names,
	    num_proc=4,
	).filter(lambda x: x["messages"] is not None, num_proc=4)
	print(f" Finance Alpaca after conversion: {ds_alpaca.num_rows} rows")

	# --- Dataset 3: Finance Instruct 500K ---
	ds_500k = load_dataset(
	    "Josephgflowers/Finance-Instruct-500k",
	    split="train",
	)
	print(f" Finance Instruct 500K: {ds_500k.num_rows} rows")

	def convert_500k(example):
	    """Convert one Finance-Instruct-500k row into a chat ``messages`` record.

	    Args:
	        example: Mapping read through the ``system``, ``user`` and
	            ``assistant`` keys.

	    Returns:
	        ``{"messages": [system, user, assistant]}`` with stripped contents, or
	        ``{"messages": None}`` when the user or assistant text is missing,
	        ``None`` or blank (such rows are filtered out by the caller).
	    """
	    # `or ""` covers keys that exist but hold None; `.get(key, "")` alone
	    # would pass None to `.strip()` and raise AttributeError.
	    user = (example.get("user") or "").strip()
	    assistant = (example.get("assistant") or "").strip()
	    if not user or not assistant:
	        return {"messages": None}
	    # Keep the row's own system prompt when it has one; otherwise use the default.
	    system = (example.get("system") or "").strip() or SYSTEM_PROMPT
	    return {
	        "messages": [
	            {"role": "system", "content": system},
	            {"role": "user", "content": user},
	            {"role": "assistant", "content": assistant},
	        ]
	    }

	# Convert every row, then drop the ones the converter rejected.
	ds_500k = ds_500k.map(
	    convert_500k,
	    remove_columns=ds_500k.column_names,
	    num_proc=4,
	).filter(lambda x: x["messages"] is not None, num_proc=4)
	print(f" Finance Instruct 500K after conversion: {ds_500k.num_rows} rows")

	# --- Combine all datasets ---
	# Merge the three converted sets and shuffle with a fixed seed so rows from
	# different sources are interleaved reproducibly.
	combined = concatenate_datasets([ds_sujet, ds_alpaca, ds_500k])
	combined = combined.shuffle(seed=42)
	# "\n" is a real newline here; the doubled backslash printed a literal "\n".
	print(f"\n COMBINED DATASET: {len(combined)} rows")

	# Hold out 1% of the combined rows for evaluation.
	split = combined.train_test_split(test_size=0.01, seed=42)
	train_dataset = split["train"]
	eval_dataset = split["test"]
	# Plain "|" separator: "\|" is an invalid escape sequence and printed a stray backslash.
	print(f" Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
	print("=" * 60)

	# ============================================================================
	# MODEL & TOKENIZER
	# ============================================================================
	print("Loading model with QLoRA 4-bit quantization...")

	# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
	bnb_config = BitsAndBytesConfig(
	    bnb_4bit_compute_dtype=torch.bfloat16,
	    bnb_4bit_use_double_quant=True,
	    bnb_4bit_quant_type="nf4",
	    load_in_4bit=True,
	)

	# Load the quantized base model and immediately prepare it for k-bit training.
	# NOTE(review): trust_remote_code=True is passed for both the model and the
	# tokenizer; confirm it is actually required for this checkpoint.
	model = prepare_model_for_kbit_training(
	    AutoModelForCausalLM.from_pretrained(
	        MODEL_ID,
	        quantization_config=bnb_config,
	        device_map="auto",
	        torch_dtype=torch.bfloat16,
	        trust_remote_code=True,
	    )
	)
	# Generation cache is switched off for training (presumably because gradient
	# checkpointing is enabled in the SFT config).
	model.config.use_cache = False

	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
	# Reuse EOS as the padding token when the tokenizer defines none.
	if tokenizer.pad_token is None:
	    tokenizer.pad_token = tokenizer.eos_token
	tokenizer.padding_side = "right"

	print(f"Model loaded: {MODEL_ID}")
	print(f"Model params: {model.num_parameters():,} (quantized)")

	# ============================================================================
	# LoRA CONFIG — Following Open-FinLLMs recipe: r=64, alpha=128
	# ============================================================================
	# Adapter settings: rank 64 with alpha 128, applied to the attention
	# projections and the MLP projections listed below.
	peft_config = LoraConfig(
	    task_type="CAUSAL_LM",
	    r=64,
	    lora_alpha=128,
	    lora_dropout=0.05,
	    bias="none",
	    target_modules=[
	        # attention projections
	        "q_proj",
	        "k_proj",
	        "v_proj",
	        "o_proj",
	        # MLP projections
	        "gate_proj",
	        "up_proj",
	        "down_proj",
	    ],
	)

	# ============================================================================
	# SFT CONFIG
	# ============================================================================
	sft_config = SFTConfig(
	    # --- run identity and reproducibility ---
	    output_dir=OUTPUT_DIR,
	    run_name=RUN_NAME,
	    seed=42,
	    # --- schedule and batch shape (2 per device x 8 accumulation steps) ---
	    num_train_epochs=2,
	    per_device_train_batch_size=2,
	    per_device_eval_batch_size=2,
	    gradient_accumulation_steps=8,
	    dataloader_num_workers=4,
	    # --- optimizer ---
	    optim="paged_adamw_8bit",
	    learning_rate=2e-4,
	    lr_scheduler_type="cosine",
	    warmup_ratio=0.03,
	    weight_decay=0.001,
	    max_grad_norm=0.3,
	    # --- precision and memory ---
	    bf16=True,
	    fp16=False,
	    gradient_checkpointing=True,
	    gradient_checkpointing_kwargs={"use_reentrant": False},
	    # --- sequence handling ---
	    max_length=2048,
	    packing=False,
	    # --- evaluation and checkpointing ---
	    # eval_steps and save_steps are equal, so every saved checkpoint has an
	    # eval_loss available for best-model selection.
	    # NOTE(review): each evaluation pass covers the full 1% hold-out split at
	    # batch size 2; confirm the time spent evaluating every 500 steps is acceptable.
	    eval_strategy="steps",
	    eval_steps=500,
	    save_strategy="steps",
	    save_steps=500,
	    save_total_limit=3,
	    load_best_model_at_end=True,
	    metric_for_best_model="eval_loss",
	    # --- logging ---
	    disable_tqdm=True,
	    logging_strategy="steps",
	    logging_steps=10,
	    logging_first_step=True,
	    logging_dir=f"{OUTPUT_DIR}/logs",
	    report_to="trackio",
	    # --- Hub upload ---
	    push_to_hub=True,
	    hub_model_id=HUB_MODEL_ID,
	    hub_strategy="every_save",
	)

	# ============================================================================
	# TRAINER
	# ============================================================================
	# Wire the quantized model, the LoRA settings, the tokenizer and both data
	# splits into the trainer.
	trainer = SFTTrainer(
	    model=model,
	    processing_class=tokenizer,
	    peft_config=peft_config,
	    args=sft_config,
	    train_dataset=train_dataset,
	    eval_dataset=eval_dataset,
	)

	# ============================================================================
	# TRAIN
	# ============================================================================
	# Run summary. "\n" is a real newline here; the doubled backslash printed a
	# literal "\n" instead of a blank line. The hyperparameter lines are
	# hard-coded text and must be kept in sync with the configs above by hand.
	print("\n" + "=" * 60)
	print("STARTING TRAINING")
	print(f" Model: {MODEL_ID}")
	print(f" Total train examples: {len(train_dataset)}")
	print(" Epochs: 2")
	print(" Effective batch size: 16")
	print(" LoRA rank: 64, alpha: 128")
	print(" Learning rate: 2e-4")
	print(" Max sequence length: 2048")
	print(f" Push to Hub: {HUB_MODEL_ID}")
	print("=" * 60 + "\n")

	trainer.train()

	# ============================================================================
	# SAVE & PUSH
	# ============================================================================
	print("\nSaving final model...")
	# Write the trained model and the tokenizer to the same output directory.
	trainer.save_model(OUTPUT_DIR)
	tokenizer.save_pretrained(OUTPUT_DIR)

	print(f"\nTraining complete! Model saved to Hub: https://huggingface.co/{HUB_MODEL_ID}")
	# Close the tracking run opened at the top of the script.
	trackio.finish()