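# LoRA fine-tuning script for "unsloth/Qwen2.5-Coder-7B-Instruct".
#
# Steps performed below:
#   1. Load the 4-bit base model with Unsloth.
#   2. Attach LoRA adapters to the attention and MLP projections.
#   3. Render a chat-style `dataset.jsonl` (found under /kaggle) to text.
#   4. Run a short supervised fine-tuning job with TRL's SFTTrainer.
#   5. Save the adapters and tokenizer to `codelens_reviewer_lora`.
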
import glob
import os

# NOTE(review): Unsloth's documentation recommends importing it before
# trl / transformers so that its patches are applied — keep these first.
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

import torch
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
|
|
| |
| max_seq_length = 2048 |
| dtype = None |
| load_in_4bit = True |
# --- Load the base model and its tokenizer -----------------------------------
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-7B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# --- Attach LoRA adapters ----------------------------------------------------
# Wrap the base model so that only the adapter weights on the listed
# projection modules are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainer seed below
    use_rslora=False,
    loftq_config=None,
)

# --- Chat template -----------------------------------------------------------
# Install the "chatml" template on the tokenizer; formatting_prompts_func
# relies on it when calling tokenizer.apply_chat_template.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
)


def formatting_prompts_func(examples):
    """Render every conversation in a batch to one training string.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"messages"`` entry is a list of
            conversations (one per row).

    Returns:
        A dict with a single ``"text"`` key holding the rendered strings,
        in the same order as the input conversations.
    """
    convos = examples["messages"]
    # tokenize=False keeps the result as text; no generation prompt is
    # appended because each conversation is rendered as-is for training.
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=False,
        )
        for convo in convos
    ]
    return {"text": texts}


# --- Locate and load the training data ---------------------------------------
print("Finding dataset...")
dataset_files = []
# Prefer an attached Kaggle input; fall back to the working directory.
for search_root in ("/kaggle/input", "/kaggle/working"):
    # Sorted so the file picked below does not depend on filesystem order.
    dataset_files = sorted(
        glob.glob(f"{search_root}/**/dataset.jsonl", recursive=True)
    )
    if dataset_files:
        break
if not dataset_files:
    raise FileNotFoundError("Could not find dataset.jsonl! Make sure you clicked 'Add Input' and uploaded it.")

data_file_path = dataset_files[0]
print(f"Loading dataset from: {data_file_path}")
dataset = load_dataset("json", data_files=data_file_path, split="train")
# Adds a "text" column holding each conversation rendered by the chat template.
dataset = dataset.map(formatting_prompts_func, batched=True)

# --- Build the trainer -------------------------------------------------------
# Query the capability once: bf16 is used when supported, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    args=SFTConfig(
        dataset_text_field="text",  # column written by formatting_prompts_func
        max_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not bf16_supported,
        bf16=bf16_supported,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

# --- Train -------------------------------------------------------------------
print("Starting training...")
# Keep the return value of train() around for later inspection.
trainer_stats = trainer.train()

# --- Save the LoRA adapters and tokenizer ------------------------------------
lora_output_dir = "codelens_reviewer_lora"

print("Saving model to LoRA adapters...")
model.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

# The check-mark emoji was mis-decoded in the original, which split this
# string literal across two lines; it is restored here.
print(f"✅ Training complete! Model saved to {lora_output_dir}")
print(f"To push to Hugging Face, run: model.push_to_hub('your_username/{lora_output_dir}', token='...')")