Javad Taghia commited on
Commit
d63f23b
·
1 Parent(s): 1b6b5bb
Files changed (4) hide show
  1. README.md +52 -0
  2. environment.yml +27 -0
  3. requirements.txt +12 -0
  4. train_tulu.py +178 -0
README.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tulu Laptop Finetune + W&B
2
+
3
+ Minimal setup to finetune a laptop-friendly Tulu checkpoint with QLoRA and track runs in Weights & Biases.
4
+
5
+ ## Prereqs
6
+ - A recent NVIDIA GPU with CUDA is required for 4-bit (bitsandbytes); set `--use_4bit true`. On CPU/MPS, set `--use_4bit false`, but expect much slower/limited runs.
7
+ - Conda (Miniconda/Anaconda).
8
+ - A Weights & Biases account + API key.
9
+
10
+ ## Setup
11
+ 1) Create the env (Conda)
12
+ ```bash
13
+ conda env create -f environment.yml
14
+ conda activate deeai
15
+ ```
16
+ 2) Add secrets (keep `.env` out of git)
17
+ ```bash
18
+ cp .env.example .env
19
+ # Edit .env with your WANDB_API_KEY / project / entity
20
+ ```
21
+ 3) Or install the packages with pip instead of Conda (optional)
22
+ ```bash
23
+ pip install -r requirements.txt
24
+ ```
25
+
26
+ ## Run a quick finetune
27
+ The defaults use `allenai/tulu-2-7b` with a small instruction dataset (`mlabonne/guanaco-llama2-1k`) and 4-bit QLoRA. This keeps memory needs closer to laptop GPUs.
28
+ ```bash
29
+ python train_tulu.py \
30
+ --output_dir outputs/tulu-lora \
31
+ --max_seq_length 512 \
32
+ --per_device_batch_size 1 \
33
+ --gradient_accumulation_steps 16
34
+ ```
35
+
36
+ Key flags:
37
+ - `--use_4bit false` if bitsandbytes/CUDA are unavailable (will be slower and need more RAM).
38
+ - `--dataset_name` to try another instruction set (any HF dataset with `instruction/input/output` fields).
39
+ - `--model_name` if you want a different Tulu variant (e.g., `allenai/tulu-2-dpo-7b`).
40
+
41
+ ## How W&B is used
42
+ - `train_tulu.py` loads `.env`, logs into W&B, and reports through `Trainer(report_to=["wandb"])`.
43
+ - Ensure `WANDB_API_KEY`, `WANDB_PROJECT`, and (optionally) `WANDB_ENTITY` are set in `.env`.
44
+ - Each run captures hyperparameters and metrics; check the W&B UI for live loss curves and checkpoints.
45
+
46
+ ## Output
47
+ - Finetuned adapters + tokenizer are written to `outputs/tulu-lora` (configurable via `--output_dir`). Push this to the Hub with `huggingface-cli upload` if desired.
48
+
49
+ ## Troubleshooting
50
+ - OOM? Reduce `max_seq_length`, increase `gradient_accumulation_steps`, or switch to a smaller dataset.
51
+ - bitsandbytes import errors on macOS/CPU: run with `--use_4bit false` or use a Linux+CUDA machine.
52
+ - bitsandbytes install error? We pin to `0.42.0`, the latest widely distributed wheel. If you cannot install it (CPU-only/MPS), remove it from `requirements.txt` and set `--use_4bit false`.
environment.yml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: deeai
2
+ channels:
3
+ # Use conda-forge for up-to-date builds of Python and libs.
4
+ - conda-forge
5
+ dependencies:
6
+ # Base interpreter; Python 3.10 has broad wheel support across ML libs.
7
+ - python=3.10
8
+ # Core tooling and a clean pip inside the env.
9
+ - pip
10
+ - pip:
11
+ # Core model + tokenizer stack.
12
+ - transformers>=4.44
13
+ - datasets>=2.19
14
+ # Parameter-efficient finetuning (LoRA).
15
+ - peft>=0.11
16
+ # Multi-GPU/accelerator launcher + config helper.
17
+ - accelerate>=0.33
18
+ # 4-bit quantization backend for laptop-friendly training (CUDA required).
19
+ # 0.42 is the latest widely available pip release.
20
+ - bitsandbytes==0.42.0
21
+ # Logging + experiment tracking.
22
+ - wandb>=0.17
23
+ # Env loader so secrets stay in .env, not code.
24
+ - python-dotenv>=1.0
25
+ # Optional: small utilities.
26
+ - tqdm>=4.66
27
+ - scipy>=1.11
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core model stack
2
+ transformers>=4.44
3
+ datasets>=2.19
4
+ peft>=0.11
5
+ accelerate>=0.33
6
+ bitsandbytes==0.42.0 # CUDA-only; required for 4-bit QLoRA
7
+
8
+ # Tracking and utilities
9
+ wandb>=0.17
10
+ python-dotenv>=1.0
11
+ tqdm>=4.66
12
+ scipy>=1.11
train_tulu.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal QLoRA finetune for a laptop-friendly Tulu checkpoint with W&B logging.
3
+
4
+ Defaults aim to run on a single consumer GPU using 4-bit quantization.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import os
11
+ from dataclasses import dataclass
12
+ from typing import Dict, List
13
+
14
+ import torch
15
+ import wandb
16
+ from datasets import load_dataset
17
+ from dotenv import load_dotenv
18
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
19
+ from transformers import (
20
+ AutoModelForCausalLM,
21
+ AutoTokenizer,
22
+ BitsAndBytesConfig,
23
+ DataCollatorForLanguageModeling,
24
+ Trainer,
25
+ TrainingArguments,
26
+ )
27
+
28
+
29
+ @dataclass
30
+ class ScriptConfig:
31
+ model_name: str = "allenai/tulu-2-7b"
32
+ dataset_name: str = "mlabonne/guanaco-llama2-1k" # small, instruction-style set
33
+ output_dir: str = "outputs/tulu-lora"
34
+ max_seq_length: int = 512
35
+ per_device_batch_size: int = 1
36
+ gradient_accumulation_steps: int = 16
37
+ num_train_epochs: int = 1
38
+ learning_rate: float = 2e-4
39
+ warmup_ratio: float = 0.03
40
+ logging_steps: int = 10
41
+ save_steps: int = 200
42
+ use_4bit: bool = True
43
+
44
+
45
+ def format_chat(example: Dict[str, str]) -> str:
46
+ """Simple instruction->response template that fits Tulu-style tuning."""
47
+ user_input = example.get("input") or "N/A"
48
+ return (
49
+ f"### Instruction:\n{example['instruction']}\n\n"
50
+ f"### Input:\n{user_input}\n\n"
51
+ f"### Response:\n{example['output']}"
52
+ )
53
+
54
+
55
+ def tokenize_example(example: Dict[str, str], tokenizer, max_seq_length: int):
56
+ prompt = format_chat(example)
57
+ # We build labels that are the same as input_ids for causal LM.
58
+ tokenized = tokenizer(
59
+ prompt,
60
+ truncation=True,
61
+ max_length=max_seq_length,
62
+ padding="max_length",
63
+ )
64
+ tokenized["labels"] = tokenized["input_ids"].copy()
65
+ return tokenized
66
+
67
+
68
+ def load_model_and_tokenizer(cfg: ScriptConfig):
69
+ quantization_config = None
70
+ if cfg.use_4bit:
71
+ quantization_config = BitsAndBytesConfig(
72
+ load_in_4bit=True,
73
+ bnb_4bit_compute_dtype=torch.bfloat16,
74
+ bnb_4bit_use_double_quant=True,
75
+ bnb_4bit_quant_type="nf4",
76
+ )
77
+
78
+ tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=False)
79
+ tokenizer.padding_side = "right"
80
+ tokenizer.pad_token = tokenizer.eos_token
81
+
82
+ model = AutoModelForCausalLM.from_pretrained(
83
+ cfg.model_name,
84
+ quantization_config=quantization_config,
85
+ device_map="auto",
86
+ )
87
+ if cfg.use_4bit:
88
+ model = prepare_model_for_kbit_training(model)
89
+
90
+ lora_cfg = LoraConfig(
91
+ r=64,
92
+ lora_alpha=16,
93
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
94
+ lora_dropout=0.05,
95
+ bias="none",
96
+ task_type="CAUSAL_LM",
97
+ )
98
+ model = get_peft_model(model, lora_cfg)
99
+ return model, tokenizer
100
+
101
+
102
+ def init_wandb(cfg: ScriptConfig):
103
+ project = os.getenv("WANDB_PROJECT", "tulu-laptop-run")
104
+ entity = os.getenv("WANDB_ENTITY")
105
+ api_key = os.getenv("WANDB_API_KEY")
106
+ if not api_key:
107
+ raise RuntimeError("WANDB_API_KEY is missing. Put it in your .env before running.")
108
+ wandb.login(key=api_key)
109
+ wandb.init(project=project, entity=entity, config=vars(cfg))
110
+
111
+
112
+ def parse_args() -> ScriptConfig:
113
+ parser = argparse.ArgumentParser(description="Finetune Tulu with QLoRA + W&B")
114
+ parser.add_argument("--model_name", default=ScriptConfig.model_name)
115
+ parser.add_argument("--dataset_name", default=ScriptConfig.dataset_name)
116
+ parser.add_argument("--output_dir", default=ScriptConfig.output_dir)
117
+ parser.add_argument("--max_seq_length", type=int, default=ScriptConfig.max_seq_length)
118
+ parser.add_argument("--per_device_batch_size", type=int, default=ScriptConfig.per_device_batch_size)
119
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=ScriptConfig.gradient_accumulation_steps)
120
+ parser.add_argument("--num_train_epochs", type=float, default=ScriptConfig.num_train_epochs)
121
+ parser.add_argument("--learning_rate", type=float, default=ScriptConfig.learning_rate)
122
+ parser.add_argument("--warmup_ratio", type=float, default=ScriptConfig.warmup_ratio)
123
+ parser.add_argument("--logging_steps", type=int, default=ScriptConfig.logging_steps)
124
+ parser.add_argument("--save_steps", type=int, default=ScriptConfig.save_steps)
125
+ parser.add_argument("--use_4bit", action=argparse.BooleanOptionalAction, default=False)
126
+ args = parser.parse_args()
127
+ return ScriptConfig(**vars(args))
128
+
129
+
130
+ def main():
131
+ load_dotenv()
132
+ cfg = parse_args()
133
+
134
+ init_wandb(cfg)
135
+ model, tokenizer = load_model_and_tokenizer(cfg)
136
+
137
+ use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
138
+ use_fp16 = torch.cuda.is_available() and not use_bf16
139
+
140
+ raw_dataset = load_dataset(cfg.dataset_name)
141
+ tokenized = raw_dataset["train"].map(
142
+ lambda ex: tokenize_example(ex, tokenizer, cfg.max_seq_length),
143
+ remove_columns=raw_dataset["train"].column_names,
144
+ )
145
+
146
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
147
+
148
+ training_args = TrainingArguments(
149
+ output_dir=cfg.output_dir,
150
+ per_device_train_batch_size=cfg.per_device_batch_size,
151
+ gradient_accumulation_steps=cfg.gradient_accumulation_steps,
152
+ num_train_epochs=cfg.num_train_epochs,
153
+ learning_rate=cfg.learning_rate,
154
+ warmup_ratio=cfg.warmup_ratio,
155
+ logging_steps=cfg.logging_steps,
156
+ save_steps=cfg.save_steps,
157
+ bf16=use_bf16,
158
+ fp16=use_fp16,
159
+ report_to=["wandb"],
160
+ optim="paged_adamw_32bit",
161
+ )
162
+
163
+ trainer = Trainer(
164
+ model=model,
165
+ args=training_args,
166
+ train_dataset=tokenized,
167
+ tokenizer=tokenizer,
168
+ data_collator=data_collator,
169
+ )
170
+
171
+ trainer.train()
172
+ trainer.save_model(cfg.output_dir)
173
+ tokenizer.save_pretrained(cfg.output_dir)
174
+ wandb.finish()
175
+
176
+
177
+ if __name__ == "__main__":
178
+ main()