# Provenance: uploaded by prometheus04 as training_scripts/phase2_train.py via huggingface_hub (commit 2e31dbb, verified)
#!/usr/bin/env python3
"""Phase 2: SFT training on Qwen3-4B"""
import math
import os
import time
from pathlib import Path

import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
# Config
BASE_MODEL = "Qwen/Qwen3-4B"
DATA_DIR = Path("./qwen3_pipeline/data")
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
CKPT_DIR.mkdir(parents=True, exist_ok=True)
EPOCHS = 1
BATCH_SIZE = 2
GRAD_ACCUM = 8
LR = 2e-4
MAX_SEQ_LEN = 4096
LORA_RANK = 32
LORA_ALPHA = 64
LORA_TARGETS = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
print("="*70)
print("PHASE 2: SFT TRAINING")
print("="*70)
# [1/4] Load model
print(f"\n[1/4] Loading {BASE_MODEL}...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True,
attn_implementation="eager"
)
print(f" Model loaded")
print(f" GPU memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")
# [2/4] Apply LoRA
print(f"\n[2/4] Applying LoRA...")
lora_config = LoraConfig(
r=LORA_RANK,
lora_alpha=LORA_ALPHA,
target_modules=LORA_TARGETS,
lora_dropout=0.0,
bias="none",
task_type="CAUSAL_LM",
init_lora_weights="gaussian",
use_rslora=True,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Enable input gradients for LoRA
model.enable_input_require_grads()
# [3/4] Load and tokenize data
print(f"\n[3/4] Loading and tokenizing data...")
dataset = load_from_disk(str(DATA_DIR / "sft"))
print(f" Dataset: {len(dataset)} samples")
def tokenize_function(examples):
# Format messages using chat template
texts = []
for msg in examples["messages"]:
text = tokenizer.apply_chat_template(
msg,
tokenize=False,
add_generation_prompt=False
)
texts.append(text + tokenizer.eos_token)
# Tokenize with padding and truncation
result = tokenizer(
texts,
truncation=True,
max_length=MAX_SEQ_LEN,
padding="max_length",
return_tensors=None
)
# Labels = input_ids (simple list, not nested)
result["labels"] = result["input_ids"].copy()
return result
print(" Tokenizing...")
tokenized_dataset = dataset.map(
tokenize_function,
batched=True,
remove_columns=dataset.column_names,
desc="Tokenizing",
num_proc=4
)
print(f" Tokenized: {len(tokenized_dataset)} samples")
# [4/4] Train
print(f"\n[4/4] Training...")
steps_per_epoch = len(tokenized_dataset) // (BATCH_SIZE * GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS
print(f" Batch size: {BATCH_SIZE}")
print(f" Grad accum: {GRAD_ACCUM}")
print(f" Effective batch: {BATCH_SIZE * GRAD_ACCUM}")
print(f" Steps per epoch: {steps_per_epoch}")
print(f" Total steps: {total_steps}")
print(f" Learning rate: {LR}")
print(f" Estimated time: ~30-40 min")
training_args = TrainingArguments(
output_dir=str(CKPT_DIR),
num_train_epochs=EPOCHS,
per_device_train_batch_size=BATCH_SIZE,
gradient_accumulation_steps=GRAD_ACCUM,
learning_rate=LR,
lr_scheduler_type="cosine",
warmup_ratio=0.03,
weight_decay=0.01,
bf16=True,
logging_steps=10,
save_strategy="no",
optim="adamw_torch",
gradient_checkpointing=True,
seed=42,
report_to="none",
dataloader_num_workers=4,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
)
print(f"\n{'='*70}")
print("TRAINING STARTED")
print(f"{'='*70}\n")
start = time.time()
trainer.train()
elapsed = (time.time() - start) / 60
print(f"\n{'='*70}")
print(f"✓ TRAINING COMPLETE: {elapsed:.1f} minutes")
print(f"{'='*70}")
# Save
print(f"\nSaving model...")
adapter_path = CKPT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f" ✓ Adapter: {adapter_path}")
# Merge
print(f"\nMerging LoRA weights...")
model = model.merge_and_unload()
merged_path = CKPT_DIR / "merged"
model.save_pretrained(str(merged_path))
tokenizer.save_pretrained(str(merged_path))
print(f" ✓ Merged: {merged_path}")
del model, trainer
torch.cuda.empty_cache()
print(f"\n{'='*70}")
print(f"✓ PHASE 2 COMPLETE")
print(f"{'='*70}")
print(f"\nTime: {elapsed:.1f} minutes")
print(f"Cost: ~${elapsed/60 * 1.15:.2f}")
print(f"\n➡️ Next: python phase3_eval.py")