import logging
import os
import sys
import traceback
from pathlib import Path

import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


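# Report the Python, PyTorch, and CUDA/GPU environment before anything heavy runs.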
def check_environment():
    """Check and report system environment."""
    logger.info("=== Environment Check ===")
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"CUDA version: {torch.version.cuda}")
        logger.info(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            logger.info(f"GPU {i} memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.1f} GB")


def main():
    try:
        check_environment()
        logger.info("Importing required packages...")

        try:
            from datasets import load_dataset
            from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
            from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
            from trl import SFTTrainer
            logger.info("✓ All required packages imported successfully")
        except ImportError as e:
            logger.error(f"Failed to import required packages: {e}")
            logger.error("Please ensure all packages are installed: pip install transformers datasets peft trl")
            sys.exit(1)

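        # Model, output, and training hyperparameters.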
        MODEL_ID = "google/gemma-3-1b-it"
        OUTPUT_DIR = "./results"
        HUB_MODEL_ID = "omark807/gemma3-finetuned-web-accessibility"
        NUM_TRAIN_EPOCHS = 3
        PER_DEVICE_TRAIN_BATCH_SIZE = 2
        GRADIENT_ACCUMULATION_STEPS = 4
        LEARNING_RATE = 2e-4
        SAVE_STEPS = 500
        LOGGING_STEPS = 10
        MAX_SEQ_LENGTH = 512

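        # Create the output directory up front so checkpoint saves cannot fail on a missing path.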
        Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")

        if torch.cuda.is_available():
            logger.info("🚀 CUDA is available! Configuring for GPU training.")

            try:
                import bitsandbytes  # noqa: F401  (only used to confirm bitsandbytes is installed)
                from transformers import BitsAndBytesConfig
                logger.info("✓ BitsAndBytes imported successfully")

                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=torch.bfloat16,
                    bnb_4bit_use_double_quant=False,
                )
                model_dtype = torch.bfloat16
                fp16_arg = False
                bf16_arg = True
                device_map = "auto"
                optimizer_type = "paged_adamw_8bit"
                logger.info("✓ 4-bit quantization configured")

            except ImportError as e:
                logger.warning(f"BitsAndBytes import failed: {e}")
                logger.warning("Falling back to standard GPU configuration without quantization")
                bnb_config = None
                model_dtype = torch.float16
                fp16_arg = True
                bf16_arg = False
                device_map = {"": 0}
                optimizer_type = "adamw_torch"

        else:
            logger.warning("⚠️ CUDA is NOT available. Using CPU configuration.")
            logger.warning("Training will be significantly slower!")
            bnb_config = None
            model_dtype = torch.float32
            fp16_arg = False
            bf16_arg = False
            device_map = "cpu"
            optimizer_type = "adamw_torch"

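        # LoRA adapters on the attention and MLP projection layers; the base weights stay frozen.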
        lora_config = LoraConfig(
            r=16,
            lora_alpha=16,
            target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
            bias="none",
            lora_dropout=0.05,
            task_type="CAUSAL_LM",
        )
        logger.info("✓ LoRA configuration set")

        logger.info("Loading dataset...")
        try:
            ds = load_dataset("omark807/web_a11y_dataset")
            logger.info(f"✓ Dataset loaded. Train samples: {len(ds['train'])}")

            sample = ds['train'][0]
            if 'question' not in sample or 'answer' not in sample:
                logger.error("Dataset must have 'question' and 'answer' columns")
                sys.exit(1)

        except Exception as e:
            logger.error(f"Failed to load dataset: {e}")
            logger.error("Check your internet connection and dataset availability")
            sys.exit(1)

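        # Load the tokenizer and make sure it has a pad token, right padding, and the training max length.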
        logger.info(f"Loading tokenizer: {MODEL_ID}")
        try:
            tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            tokenizer.padding_side = "right"
            tokenizer.model_max_length = MAX_SEQ_LENGTH
            logger.info("✓ Tokenizer loaded and configured")

        except Exception as e:
            logger.error(f"Failed to load tokenizer: {e}")
            sys.exit(1)

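        # Load the base model, attaching the 4-bit quantization config when one was built above.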
        logger.info(f"Loading model: {MODEL_ID}")
        try:
            model_kwargs = {
                "torch_dtype": model_dtype,
                "device_map": device_map,
                "trust_remote_code": True,
                "use_cache": False,
            }

            if bnb_config is not None:
                model_kwargs["quantization_config"] = bnb_config

            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

            if hasattr(model.config, 'pretraining_tp'):
                model.config.pretraining_tp = 1

            logger.info("✓ Model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            logger.error("This might be due to insufficient GPU memory or network issues")
            sys.exit(1)

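        # Prepare the quantized model for training, attach the LoRA adapters, and set gradient flags explicitly.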
        logger.info("Preparing model for training...")
        try:
            if bnb_config is not None:
                model = prepare_model_for_kbit_training(model)
                logger.info("✓ Model prepared for k-bit training")

            model = get_peft_model(model, lora_config)
            logger.info("✓ LoRA applied to model")

            for name, param in model.named_parameters():
                if "lora" in name:
                    param.requires_grad = True
                elif param.requires_grad:
                    param.requires_grad = False

            if hasattr(model, 'lm_head'):
                for param in model.lm_head.parameters():
                    param.requires_grad = True
            elif hasattr(model, 'embed_out'):
                for param in model.embed_out.parameters():
                    param.requires_grad = True
            elif hasattr(model, 'base_model') and hasattr(model.base_model, 'lm_head'):
                for param in model.base_model.lm_head.parameters():
                    param.requires_grad = True

            if hasattr(model, 'get_input_embeddings') and model.get_input_embeddings() is not None:
                model.get_input_embeddings().requires_grad_(False)
            if hasattr(model, 'get_output_embeddings') and model.get_output_embeddings() is not None:
                model.get_output_embeddings().requires_grad_(False)

            model.print_trainable_parameters()
            logger.info("✓ Gradient requirements explicitly set for LoRA and LM head")

        except Exception as e:
            logger.error(f"Failed to prepare model: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

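        # Wrap each Q&A pair in Gemma's chat turn markers and tokenize to a fixed length;
        # labels mirror input_ids so the loss is computed over the full sequence.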
        def tokenize_function(examples):
            formatted_texts = []
            for i in range(len(examples["question"])):
                question = examples["question"][i]
                answer = examples["answer"][i]
                formatted_text = f"<start_of_turn>user\n{question}<end_of_turn>\n<start_of_turn>model\n{answer}<end_of_turn>"
                formatted_texts.append(formatted_text)

            tokenized_inputs = tokenizer(
                formatted_texts,
                max_length=MAX_SEQ_LENGTH,
                truncation=True,
                padding="max_length",
                return_tensors="np",
            )

            tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
            return tokenized_inputs

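        # Pre-tokenize the training split so the trainer receives ready-made input_ids/attention_mask/labels.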
        logger.info("Pre-tokenizing dataset...")
        try:
            tokenized_ds = ds["train"].map(
                tokenize_function,
                batched=True,
                remove_columns=ds["train"].column_names,
                num_proc=os.cpu_count() or 1,
            )
            logger.info(f"✓ Dataset pre-tokenized. New train samples: {len(tokenized_ds)}")
        except Exception as e:
            logger.error(f"Failed to pre-tokenize dataset: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

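        # Training arguments; precision (fp16/bf16) and the optimizer were chosen above to match the hardware.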
        training_args = TrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            optim=optimizer_type,
            learning_rate=LEARNING_RATE,
            fp16=fp16_arg,
            bf16=bf16_arg,
            max_grad_norm=0.3,
            warmup_ratio=0.03,
            lr_scheduler_type="constant",
            logging_steps=LOGGING_STEPS,
            save_steps=SAVE_STEPS,
            save_total_limit=3,
            remove_unused_columns=False,
            push_to_hub=False,
            hub_model_id=HUB_MODEL_ID,
            report_to="tensorboard",
            dataloader_num_workers=0,
            save_safetensors=True,
            gradient_checkpointing=False,
        )
        logger.info("✓ Training arguments configured")

        logger.info("Initializing SFTTrainer...")
        try:
            trainer = SFTTrainer(
                model=model,
                train_dataset=tokenized_ds,
                args=training_args,
            )
            logger.info("✓ SFTTrainer initialized successfully")

        except Exception as e:
            logger.error(f"Failed to initialize trainer: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

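        # Run fine-tuning; the effective batch size is per-device batch size times gradient accumulation steps.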
        logger.info("🚀 Starting fine-tuning...")
        logger.info(f"Training for {NUM_TRAIN_EPOCHS} epochs")
        logger.info(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}, Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
        logger.info(f"Effective batch size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

        try:
            trainer.train()
            logger.info("🎉 Fine-tuning completed successfully!")

        except Exception as e:
            logger.error(f"Training failed: {e}")
            logger.error(f"Full traceback: {traceback.format_exc()}")
            sys.exit(1)

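        # Save the fine-tuned adapter and tokenizer, plus a small text summary of the run settings.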
        logger.info("Saving model and tokenizer...")
        try:
            trainer.save_model(OUTPUT_DIR)
            tokenizer.save_pretrained(OUTPUT_DIR)
            logger.info(f"✓ Model saved to: {os.path.abspath(OUTPUT_DIR)}")

            with open(os.path.join(OUTPUT_DIR, "training_info.txt"), "w") as f:
                f.write(f"Model: {MODEL_ID}\n")
                f.write(f"Epochs: {NUM_TRAIN_EPOCHS}\n")
                f.write(f"Learning rate: {LEARNING_RATE}\n")
                f.write(f"Batch size: {PER_DEVICE_TRAIN_BATCH_SIZE}\n")
                f.write(f"LoRA r: {lora_config.r}\n")
                f.write(f"Device: {'GPU' if torch.cuda.is_available() else 'CPU'}\n")
                f.write(f"Quantization: {bnb_config is not None}\n")

            logger.info("✅ All done! Model ready for use.")

        except Exception as e:
            logger.error(f"Failed to save model: {e}")
            sys.exit(1)

    except KeyboardInterrupt:
        logger.info("Training interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        logger.error(f"Full traceback: {traceback.format_exc()}")
        sys.exit(1)


if __name__ == "__main__":
    main()
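
# A minimal sketch of loading the saved adapter for inference afterwards; the paths, model ID, and the
# example prompt are illustrative and assume the defaults configured above:
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#     from peft import PeftModel
#
#     base = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
#     model = PeftModel.from_pretrained(base, "./results")
#     tokenizer = AutoTokenizer.from_pretrained("./results")
#     prompt = "<start_of_turn>user\nHow do I write good alt text?<end_of_turn>\n<start_of_turn>model\n"
#     inputs = tokenizer(prompt, return_tensors="pt")
#     print(tokenizer.decode(model.generate(**inputs, max_new_tokens=200)[0], skip_special_tokens=True))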