# testrun2/run_sft_test.py
# Author: slivk
# Fix: Save model locally before push, add explicit push with error handling
# Commit: a823edb
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# ===== 1. Load Dataset =====
DATASET_NAME = "OliverSlivka/itemset-extraction-v2"

print(f"πŸ’Ύ Loading dataset {DATASET_NAME} from Hugging Face Hub...")
dataset = load_dataset(DATASET_NAME)

# Smoke-test subsets: 50 shuffled training rows; validation is shuffled but kept whole.
shuffled_train = dataset["train"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(50))
eval_dataset = dataset["validation"].shuffle(seed=42)

print(f"βœ… Dataset loaded: {len(train_dataset)} train, {len(eval_dataset)} eval examples for test run.")
print(f" Columns: {train_dataset.column_names}")
# ===== 2. Load Model with 4-bit Quantization =====
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
OUTPUT_DIR = "OliverSlivka/qwen2.5-3b-itemset-test"

print(f"πŸ”₯ Loading {MODEL_NAME} with 4-bit quantization...")

# NF4 double-quantized 4-bit weights; compute in fp32 because the target GPU
# (T4) has no native bf16 support.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,  # fp32 compute path (T4 safe)
)

# Eager attention sidesteps flash-attention code paths that may assume bf16.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="eager",
)

# Upcast any stray bf16 parameters (the non-quantized parts) to fp32.
for p in model.parameters():
    if p.dtype == torch.bfloat16:
        p.data = p.data.float()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Qwen ships no dedicated pad token; fall back to EOS so padded batches work.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("βœ… Model and tokenizer loaded with 4-bit quantization (fp32 compute)")
# ===== 3. LoRA Configuration =====
# Adapters on every attention and MLP projection of the Qwen2.5 blocks.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
print(f"🎯 LoRA config: r={peft_config.r}, alpha={peft_config.lora_alpha}")
# ===== 4. Training Configuration for Test Run =====
# CRITICAL: all mixed precision is disabled below to avoid the GradScaler
# bf16 issue on T4 GPUs — training runs in plain fp32.
LOCAL_OUTPUT_DIR = "./trained_adapter"  # local save directory

training_args = SFTConfig(
    output_dir=LOCAL_OUTPUT_DIR,
    push_to_hub=False,  # pushed manually later with explicit error handling
    # Schedule: a short 12-step smoke run
    max_steps=12,
    num_train_epochs=1,
    warmup_steps=5,
    learning_rate=2e-4,
    # Memory: batch of 1 with 16x accumulation keeps the effective batch at 16
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    max_length=1024,  # shorter sequences to save memory
    # Optimizer
    optim="paged_adamw_8bit",
    max_grad_norm=0.3,
    # CRITICAL: neither fp16 nor bf16 — pure fp32 (T4 safe)
    fp16=False,
    bf16=False,
    # Logging every step; no eval (OOM risk during eval) and no checkpoints
    logging_steps=1,
    report_to="none",
    eval_strategy="no",
    save_strategy="no",
)

print("βœ… Training configuration set for test run (fp32 mode - T4 safe)")
print(f" Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f" Max steps: {training_args.max_steps}")
print("Mixed precision: DISABLED (fp32 training)")
# ===== 5. Initialize Trainer =====
print("🎯 Initializing SFTTrainer...")

# SFTTrainer wraps the quantized base model with the LoRA adapters itself,
# given the peft_config.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("βœ… Trainer initialized")
# ===== 6. Train =====
print("\nπŸš€ Starting test training...")
print("=" * 60)
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"PyTorch CUDA version: {torch.version.cuda}")
if torch.cuda.is_available():
    device_idx = torch.cuda.current_device()
    print(f"Current device: {device_idx}")
    print(f"Device name: {torch.cuda.get_device_name(device_idx)}")

# Sanity-check the dtype of the first trainable (LoRA) parameter only.
print("Model dtype check:")
first_trainable = next(
    ((n, p) for n, p in model.named_parameters() if p.requires_grad), None
)
if first_trainable is not None:
    name, param = first_trainable
    print(f" {name}: {param.dtype}")

trainer.train()

print("=" * 60)
print("βœ… Test training complete!")
# ===== 7. Save model locally =====
# Save the trained adapter to LOCAL_OUTPUT_DIR so it survives even if the
# Hub push below fails.
print("\nπŸ’Ύ Saving model locally...")
trainer.save_model(LOCAL_OUTPUT_DIR)
print(f"βœ“ Model saved to {LOCAL_OUTPUT_DIR}")

# List saved files in deterministic order, skipping subdirectories (e.g.
# checkpoint/log dirs) whose getsize() result would be meaningless.
import os
print("\nπŸ“ Saved files:")
for fname in sorted(os.listdir(LOCAL_OUTPUT_DIR)):
    path = os.path.join(LOCAL_OUTPUT_DIR, fname)
    if os.path.isfile(path):
        size = os.path.getsize(path)
        print(f" {fname}: {size/1024:.1f} KB")
# ===== 8. Push to HuggingFace Hub =====
# Manual push (instead of Trainer's push_to_hub) so a failure never loses the
# locally-saved adapter and the user gets actionable instructions.
print(f"\n⬆️ Pushing to HuggingFace Hub: {OUTPUT_DIR}")
try:
    from huggingface_hub import HfApi, login

    # NOTE: `os` is already imported above — the previous redundant re-import
    # has been removed.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        api = HfApi()
        # Idempotent repo creation; tolerate but surface any error (e.g. 403).
        try:
            api.create_repo(repo_id=OUTPUT_DIR, exist_ok=True, repo_type="model")
        except Exception as e:
            print(f" Repo creation note: {e}")
        # Upload the entire adapter folder in one call.
        api.upload_folder(
            folder_path=LOCAL_OUTPUT_DIR,
            repo_id=OUTPUT_DIR,
            repo_type="model",
        )
        print(f"βœ… Model pushed to: https://huggingface.co/{OUTPUT_DIR}")
    else:
        print("⚠️ HF_TOKEN not found - model saved locally but not pushed to Hub")
        print(f" You can manually push from: {LOCAL_OUTPUT_DIR}")
except Exception as e:
    # Broad catch is deliberate: this is the script's top-level boundary and the
    # adapter is already safe on disk, so report the failure and finish cleanly.
    print(f"❌ Push failed: {e}")
    print(f" Model is saved locally at: {LOCAL_OUTPUT_DIR}")
    print(" You can push manually later using the 'Push Model' tab")

print("\nπŸŽ‰ Quick test run finished successfully!")