Spaces:

ArshVerma
/

CodeLens

Sleeping

App Files Files Community

CodeLens / scripts /train.py

ArshVerma

Switch dashboard to Light Mode Premium Aesthetic

3fc3cc7 9 days ago

Raw

History Blame Contribute Delete

3.96 kB

	# =================================================================================
	# 🚀 FOOLPROOF KAGGLE TRAINING SCRIPT FOR CODELENS
	#
	# Step 1: Factory Reset your Kaggle environment to clear any broken libraries.
	# Step 2: Create a code cell and run EXACTLY this (do not restart session after):
	# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
	# !pip install --no-deps trl peft accelerate bitsandbytes
	# Step 3: Run this entire script in the next cell!
	# =================================================================================

	import os
	import torch
	from datasets import load_dataset
	from trl import SFTTrainer, SFTConfig
	from transformers import TrainingArguments
	from unsloth import FastLanguageModel
	from unsloth.chat_templates import get_chat_template

	# 1. Load the base model (Qwen2.5-Coder-7B-Instruct) through Unsloth.
	# The three settings below are module-level names; max_seq_length is reused
	# later when the trainer is configured.
	max_seq_length = 2048  # Kept at 2048 to prevent CUDA Out Of Memory on Kaggle T4
	dtype = None  # None = auto-detect float16 / bfloat16
	load_in_4bit = True  # 4-bit quantization so the model fits on Kaggle T4 GPUs

	print("Loading model...")
	model, tokenizer = FastLanguageModel.from_pretrained(
	    model_name="unsloth/Qwen2.5-Coder-7B-Instruct",
	    max_seq_length=max_seq_length,
	    dtype=dtype,
	    load_in_4bit=load_in_4bit,
	)

	# Wrap the loaded model with LoRA adapters on the projection modules
	# listed below; `model` is rebound to the adapter-wrapped model.
	lora_target_modules = [
	    "q_proj", "k_proj", "v_proj", "o_proj",
	    "gate_proj", "up_proj", "down_proj",
	]
	model = FastLanguageModel.get_peft_model(
	    model,
	    r=16,  # LoRA rank: any number > 0. Suggested 8, 16, 32, 64, 128
	    target_modules=lora_target_modules,
	    lora_alpha=16,
	    lora_dropout=0,
	    bias="none",
	    use_gradient_checkpointing="unsloth",
	    random_state=3407,
	    use_rslora=False,
	    loftq_config=None,
	)

	# 2. Format the Dataset
	# Rebind the tokenizer to one that renders conversations in ChatML format.
	tokenizer = get_chat_template(tokenizer, chat_template="chatml")

	def formatting_prompts_func(examples):
	    """Render every conversation of a batch into one chat-formatted string.

	    Uses the module-level ``tokenizer`` and its chat template. Conversations
	    are rendered as plain text (not token ids) and no trailing generation
	    prompt is appended.

	    Args:
	        examples: Batch mapping whose ``"messages"`` entry is a sequence of
	            conversations.

	    Returns:
	        A dict with a single ``"text"`` key holding one rendered string per
	        conversation, in the same order as the input.
	    """
	    conversations = examples["messages"]
	    rendered = [
	        tokenizer.apply_chat_template(
	            conversation,
	            tokenize=False,
	            add_generation_prompt=False,
	        )
	        for conversation in conversations
	    ]
	    return {"text": rendered}

	import glob


	def _find_dataset_file(search_roots, filename="dataset.jsonl"):
	    """Return the path of the first ``filename`` found under ``search_roots``.

	    Roots are searched recursively in the order given, and the first root
	    with at least one match wins. Matches inside a root are sorted because
	    ``glob.glob`` gives no ordering guarantee, and the chosen file should
	    not change from run to run.

	    Args:
	        search_roots: Directories to search, highest priority first.
	        filename: Base name of the dataset file to look for.

	    Returns:
	        The lexicographically first matching path, or ``None`` when no root
	        contains the file.
	    """
	    for root in search_roots:
	        pattern = os.path.join(root, "**", filename)
	        matches = sorted(glob.glob(pattern, recursive=True))
	        if not matches:
	            continue
	        if len(matches) > 1:
	            # Make the choice visible so a stale duplicate is not used silently.
	            print(f"Warning: found {len(matches)} copies of {filename} under {root}; using {matches[0]}")
	        return matches[0]
	    return None


	print("Finding dataset...")
	# Prefer attached Kaggle inputs, then fall back to the working directory.
	data_file_path = _find_dataset_file(["/kaggle/input", "/kaggle/working"])
	if data_file_path is None:
	    raise FileNotFoundError("Could not find dataset.jsonl! Make sure you clicked 'Add Input' and uploaded it.")

	print(f"Loading dataset from: {data_file_path}")
	dataset = load_dataset("json", data_files=data_file_path, split="train")

	# Fail early with a clear message rather than a KeyError raised from
	# inside map() when the file does not have the expected schema.
	if "messages" not in dataset.column_names:
	    raise ValueError(
	        f"{data_file_path} has no 'messages' column (found: {dataset.column_names})."
	    )

	# Adds a "text" column holding the chat-formatted string for each row.
	dataset = dataset.map(formatting_prompts_func, batched=True)

	# 3. Setup Trainer
	# Use Unsloth's bf16 check rather than torch.cuda.is_bf16_supported():
	# recent PyTorch releases count *emulated* bf16 as supported, so the torch
	# check can return True on pre-Ampere GPUs such as the Kaggle T4 and
	# select bf16 on hardware without native support.
	from unsloth import is_bfloat16_supported

	use_bf16 = is_bfloat16_supported()  # evaluated once so fp16/bf16 can never disagree

	trainer = SFTTrainer(
	    model=model,
	    processing_class=tokenizer,
	    train_dataset=dataset,
	    args=SFTConfig(
	        dataset_text_field="text",  # column produced by formatting_prompts_func
	        max_length=max_seq_length,
	        dataset_num_proc=2,
	        packing=False,  # Enabling packing can make training faster for short sequences.
	        per_device_train_batch_size=1,  # Reduced to 1 to prevent OOM
	        gradient_accumulation_steps=8,  # Increased to keep effective batch size the same
	        warmup_steps=5,
	        max_steps=60,  # Increase this to ~300 for a real run
	        learning_rate=2e-4,
	        fp16=not use_bf16,
	        bf16=use_bf16,
	        logging_steps=1,
	        optim="adamw_8bit",
	        weight_decay=0.01,
	        lr_scheduler_type="linear",
	        seed=3407,
	        output_dir="outputs",
	    ),
	)

	# 4. Train!
	print("Starting training...")
	trainer_stats = trainer.train()  # kept so the run statistics can be inspected afterwards

	# 5. Save the fine-tuned model
	# The model and the tokenizer are written to the same output directory.
	print("Saving model to LoRA adapters...")
	for artifact in (model, tokenizer):
	    artifact.save_pretrained("codelens_reviewer_lora")

	print("✅ Training complete! Model saved to codelens_reviewer_lora")
	print("To push to Hugging Face, run: model.push_to_hub('your_username/codelens_reviewer_lora', token='...')")