# GPU pinning + imports. CUDA_VISIBLE_DEVICES is assigned before torch is
# imported so the CUDA runtime only ever sees physical GPU 2.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import inspect
import torch

# Simple device probe, used only for the startup log line below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

from datasets import load_dataset
from huggingface_hub import notebook_login  # NOTE(review): imported but unused here
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
|
# LoRA adapter configuration: low-rank updates on all attention and MLP
# projection layers, no bias parameters trained.
_LORA_TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # scaling numerator (effective scale = alpha / r)
    lora_dropout=0.05,
    bias="none",
    target_modules=_LORA_TARGET_MODULES,
)
| |
|
| |
|
| |
|
# Base checkpoint plus 4-bit NF4 quantization settings for QLoRA-style loading.
model_id = "google/gemma-2-2b-it"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # NormalFloat4 weight format
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16, # matmuls run in bf16
)
| |
|
# Tokenizer setup: right padding with EOS reused as the pad token — the usual
# configuration for causal-LM supervised fine-tuning.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
| |
|
# Load the quantized base model and the Alpaca instruction corpus.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)
# The KV-cache is not used during training; disable it explicitly.
model.config.use_cache = False

dataset = load_dataset("tatsu-lab/alpaca", split="train")
| |
|
| |
|
def format_alpaca_prompt(example):
    """Render one Alpaca record into a single training string.

    The instruction/input/output fields are whitespace-trimmed and laid out
    in the "### Instruction / ### Input / ### Response" template; the
    "### Input" section is omitted when the input field is blank.

    Returns a dict with a single "text" key, as expected by Dataset.map.
    """
    fields = {key: example[key].strip() for key in ("instruction", "input", "output")}

    sections = [f"### Instruction:\n{fields['instruction']}"]
    if fields["input"]:
        sections.append(f"### Input:\n{fields['input']}")
    sections.append(f"### Response:\n{fields['output']}")

    return {"text": "\n\n".join(sections)}
| |
|
| |
|
# Keep a small 100-example subset for this demo run. Select BEFORE mapping:
# the original order formatted all ~52k Alpaca rows only to discard all but
# 100 of them; selecting first makes map() touch just the rows we keep.
train_dataset = dataset.select(range(100)).map(format_alpaca_prompt)
print(train_dataset)
print(train_dataset[0]["text"][:300])
| |
|
| | |
| | |
# Generate baseline (pre-finetuning) completions for a few samples so the
# same prompts can be compared against the model after LoRA training.
num_preview_samples = 3
preview_dataset = train_dataset.select(range(num_preview_samples))
print(f"\nPre-finetuning preview on {num_preview_samples} samples:")

comparison_rows = []
split_token = "### Response:\n"  # boundary between prompt and gold answer
for idx, sample in enumerate(preview_dataset):
    full_text = sample["text"]
    # Everything up to and including the response header is the prompt;
    # the remainder is the ground-truth response.
    head, expected_response = full_text.split(split_token, 1)
    prompt_text = head + split_token

    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    print(f"\n--- Sample {idx + 1} Prompt ---\n{prompt_text}")
    print(f"--- Sample {idx + 1} Base Model Output ---\n{decoded}")
    comparison_rows.append(
        {
            "id": idx + 1,
            "prompt": prompt_text,
            "target": expected_response,
            "before": decoded,
        }
    )
| |
|
# Desired SFT training hyperparameters. Accepted keyword names differ across
# trl versions, so only the kwargs present in the installed SFTConfig
# signature are passed through.
desired_config = dict(
    output_dir="./gemma-2-2b-it-alpaca-lora",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,   # effective batch size 8
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="no",
    optim="paged_adamw_8bit",
    bf16=torch.cuda.is_available(),  # bf16 training only when a GPU is present
    gradient_checkpointing=True,
    packing=True,
    report_to="none",
)
accepted_params = inspect.signature(SFTConfig.__init__).parameters
training_args = SFTConfig(
    **{key: value for key, value in desired_config.items() if key in accepted_params}
)
| |
|
# Build the trainer, again filtering kwargs against the installed trl
# version's SFTTrainer signature (some keys are only valid on older APIs).
candidate_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "peft_config": lora_config,
    "dataset_text_field": "text",
    "max_seq_length": 1024,
}
supported = inspect.signature(SFTTrainer.__init__).parameters
trainer = SFTTrainer(
    **{key: value for key, value in candidate_kwargs.items() if key in supported}
)
| |
|
# Run training, then persist the LoRA adapter weights and the tokenizer
# side by side so the adapter directory is self-contained.
train_result = trainer.train()

adapter_out = "./gemma-2-2b-it-alpaca-lora/final_adapter"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(adapter_out)
print(f"Saved LoRA adapter to: {adapter_out}")
| |
|
# Re-run the exact prompts from the pre-training preview through the
# fine-tuned model and print prompt / gold / before / after side by side.
print("\nPost-finetuning comparison on same samples:")
for row in comparison_rows:
    batch = tokenizer(row["prompt"], return_tensors="pt").to(trainer.model.device)
    with torch.no_grad():
        generation = trainer.model.generate(
            **batch,
            max_new_tokens=120,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
        )
    after_decoded = tokenizer.decode(generation[0], skip_special_tokens=True)

    print(f"\n=== Sample {row['id']} ===")
    print(f"Prompt:\n{row['prompt']}")
    print(f"\nGround Truth Response:\n{row['target']}")
    print(f"\nBefore Fine-tuning:\n{row['before']}")
    print(f"\nAfter Fine-tuning:\n{after_decoded}")
| |
|
# Final smoke test: run one fresh instruction through the fine-tuned model.
prompt = "### Instruction:\nExplain photosynthesis in simple words.\n\n### Response:\n"
encoded_prompt = tokenizer(prompt, return_tensors="pt").to(trainer.model.device)

with torch.no_grad():
    sampled = trainer.model.generate(
        **encoded_prompt,
        max_new_tokens=120,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(sampled[0], skip_special_tokens=True))