Javad Taghia commited on
Commit
dba87af
·
1 Parent(s): 61c72b6
README.md CHANGED
@@ -109,6 +109,16 @@ python evaluation/compare_lora.py \
109
  --lora_dir outputs/tinyllama-lora \
110
  --prompt "Explain LoRA in one sentence."
111
  ```
 
 
 
 
 
 
 
 
 
 
112
  Optional flags: `--max_new_tokens`, `--temperature`, `--top_p`, `--torch_dtype`.
113
 
114
  ## Troubleshooting
@@ -152,4 +162,21 @@ python train_tulu.py \
152
  --input_field input \
153
  --output_field output
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
 
109
  --lora_dir outputs/tinyllama-lora \
110
  --prompt "Explain LoRA in one sentence."
111
  ```
112
+ ```bash
113
+ python evaluation/compare_lora.py \
114
+ --base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
115
+ --lora_dir outputs/tinyllama-lora \
116
+ --prompt "Explain LoRA in one sentence." \
117
+ --device cpu \
118
+ --torch_dtype float32
119
+
120
+ ```
121
+
122
  Optional flags: `--max_new_tokens`, `--temperature`, `--top_p`, `--torch_dtype`.
123
 
124
  ## Troubleshooting
 
162
  --input_field input \
163
  --output_field output
164
 
165
+ ### CPU-only training
166
+ The command below is intended to run training entirely on CPU (note: `--use_4bit` relies on bitsandbytes CUDA kernels — drop that flag if no GPU is present):
167
+ python train_tulu.py \
168
+ --model_name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
169
+ --output_dir outputs/tinyllama-lora \
170
+ --offload_folder offload \
171
+ --device cpu \
172
+ --torch_dtype auto \
173
+ --max_seq_length 512 \
174
+ --per_device_batch_size 2 \
175
+ --gradient_accumulation_steps 8 \
176
+ --num_train_epochs 1 \
177
+ --use_4bit \
178
+ --instruction_field instruction \
179
+ --input_field input \
180
+ --output_field output
181
+
182
 
evaluation/compare_lora.py CHANGED
@@ -20,6 +20,12 @@ def parse_args():
20
  choices=["auto", "float16", "bfloat16", "float32"],
21
  help="Force dtype for model load.",
22
  )
 
 
 
 
 
 
23
  return p.parse_args()
24
 
25
 
@@ -29,6 +35,10 @@ def resolve_dtype(name: str) -> Optional[torch.dtype]:
29
  return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name]
30
 
31
 
 
 
 
 
32
  def generate(model, tokenizer, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
33
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
34
  with torch.inference_mode():
@@ -44,18 +54,19 @@ def generate(model, tokenizer, prompt: str, max_new_tokens: int, temperature: fl
44
 
45
  def main():
46
  args = parse_args()
47
- torch_dtype = resolve_dtype(args.torch_dtype)
 
48
 
49
  tokenizer = AutoTokenizer.from_pretrained(args.lora_dir, use_fast=False)
50
 
51
  base_model = AutoModelForCausalLM.from_pretrained(
52
  args.base_model,
53
- device_map="auto",
54
  torch_dtype=torch_dtype,
55
  )
56
  lora_wrapped = AutoModelForCausalLM.from_pretrained(
57
  args.base_model,
58
- device_map="auto",
59
  torch_dtype=torch_dtype,
60
  )
61
  lora_wrapped = PeftModel.from_pretrained(lora_wrapped, args.lora_dir)
 
20
  choices=["auto", "float16", "bfloat16", "float32"],
21
  help="Force dtype for model load.",
22
  )
23
+ p.add_argument(
24
+ "--device",
25
+ default="auto",
26
+ choices=["auto", "cpu", "cuda", "mps"],
27
+ help="Force device map; on CPU use this to keep everything on host.",
28
+ )
29
  return p.parse_args()
30
 
31
 
 
35
  return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name]
36
 
37
 
38
def resolve_device_map(device: str):
    """Build the `device_map` for `from_pretrained`.

    The empty-string key maps every module to one device, pinning the whole
    model on host memory when running CPU-only; otherwise defer to "auto".
    """
    return "auto" if device != "cpu" else {"": "cpu"}
40
+
41
+
42
  def generate(model, tokenizer, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
43
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
44
  with torch.inference_mode():
 
54
 
55
  def main():
56
  args = parse_args()
57
+ torch_dtype = resolve_dtype(args.torch_dtype) or (torch.float32 if args.device == "cpu" else None)
58
+ device_map = resolve_device_map(args.device) if args.device != "auto" else "auto"
59
 
60
  tokenizer = AutoTokenizer.from_pretrained(args.lora_dir, use_fast=False)
61
 
62
  base_model = AutoModelForCausalLM.from_pretrained(
63
  args.base_model,
64
+ device_map=device_map,
65
  torch_dtype=torch_dtype,
66
  )
67
  lora_wrapped = AutoModelForCausalLM.from_pretrained(
68
  args.base_model,
69
+ device_map=device_map,
70
  torch_dtype=torch_dtype,
71
  )
72
  lora_wrapped = PeftModel.from_pretrained(lora_wrapped, args.lora_dir)
evaluation/simple_inference.py CHANGED
@@ -1,25 +1,59 @@
 
 
 
1
  import torch
2
  from peft import PeftConfig, PeftModel
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def main():
7
- lora_dir = "outputs/tinyllama-lora" # change to your adapter path
8
- cfg = PeftConfig.from_pretrained(lora_dir)
9
  base_model = cfg.base_model_name_or_path # base model id/path
 
 
10
 
11
- tokenizer = AutoTokenizer.from_pretrained(lora_dir, use_fast=False)
12
  model = AutoModelForCausalLM.from_pretrained(
13
  base_model,
14
- device_map="auto",
15
- torch_dtype=torch.float16,
16
  )
17
- model = PeftModel.from_pretrained(model, lora_dir)
18
 
19
- prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Input:\nN/A\n\n### Response:\n"
20
  inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
21
  with torch.inference_mode():
22
- out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
 
 
 
 
 
 
23
  print(tokenizer.decode(out[0], skip_special_tokens=True))
24
 
25
 
 
1
+ import argparse
2
+ from typing import Optional
3
+
4
  import torch
5
  from peft import PeftConfig, PeftModel
6
  from transformers import AutoModelForCausalLM, AutoTokenizer
7
 
8
 
9
def resolve_dtype(name: str, device: str) -> Optional[torch.dtype]:
    """Translate a dtype name into a torch.dtype for model loading.

    Returns None for "auto" on non-CPU devices so transformers picks the
    dtype itself; on CPU, "auto" falls back to float32.
    """
    if name != "auto":
        named = {
            "float16": torch.float16,
            "bfloat16": torch.bfloat16,
            "float32": torch.float32,
        }
        return named[name]
    # "auto" on CPU: default to fp32; elsewhere let transformers decide.
    return torch.float32 if device == "cpu" else None
14
+
15
+
16
def resolve_device_map(device: str):
    """Return the `device_map` for `from_pretrained`.

    On CPU, the {"" : "cpu"} form pins every module to host memory;
    for any other device choice we let accelerate dispatch with "auto".
    """
    if device == "cpu":
        return {"": "cpu"}
    return "auto"
18
+
19
+
20
def parse_args():
    """Collect CLI options for the quick LoRA inference demo."""
    parser = argparse.ArgumentParser(description="Run a quick LoRA inference.")
    parser.add_argument("--lora_dir", default="outputs/tinyllama-lora", help="Path to LoRA adapter folder.")
    parser.add_argument("--prompt", default="### Instruction:\nExplain LoRA in one sentence.\n\n### Input:\nN/A\n\n### Response:\n")
    # Sampling knobs.
    parser.add_argument("--max_new_tokens", type=int, default=128)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_p", type=float, default=0.9)
    # Placement / precision overrides; "auto" defers to transformers defaults.
    parser.add_argument("--device", default="auto", choices=["auto", "cpu", "cuda", "mps"])
    parser.add_argument("--torch_dtype", default="auto", choices=["auto", "float16", "bfloat16", "float32"])
    return parser.parse_args()
30
+
31
+
32
def main():
    """Load a LoRA adapter onto its base model and print one sampled completion."""
    args = parse_args()
    # The adapter's PEFT config records which base checkpoint it was trained from.
    peft_cfg = PeftConfig.from_pretrained(args.lora_dir)
    base_id = peft_cfg.base_model_name_or_path  # base model id/path
    dtype = resolve_dtype(args.torch_dtype, args.device)
    # resolve_device_map already yields "auto" for non-CPU devices, so the
    # guard only matters for the explicit "auto" choice.
    dev_map = resolve_device_map(args.device) if args.device != "auto" else "auto"

    tokenizer = AutoTokenizer.from_pretrained(args.lora_dir, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        base_id,
        device_map=dev_map,
        torch_dtype=dtype,
    )
    # Wrap the base weights with the trained LoRA adapter.
    model = PeftModel.from_pretrained(model, args.lora_dir)

    encoded = tokenizer(args.prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        generated = model.generate(
            **encoded,
            max_new_tokens=args.max_new_tokens,
            do_sample=True,
            temperature=args.temperature,
            top_p=args.top_p,
        )
    print(tokenizer.decode(generated[0], skip_special_tokens=True))
58
 
59
 
train_tulu.py CHANGED
@@ -283,6 +283,9 @@ def main():
283
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
284
  # Pad/batch causal LM examples.
285
 
 
 
 
286
  training_args = TrainingArguments(
287
  output_dir=cfg.output_dir,
288
  per_device_train_batch_size=cfg.per_device_batch_size,
@@ -295,7 +298,7 @@ def main():
295
  bf16=use_bf16,
296
  fp16=use_fp16,
297
  report_to=["wandb"],
298
- optim="paged_adamw_32bit",
299
  )
300
  # Trainer configuration (logging, saving, optimizer, precision).
301
 
 
283
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
284
  # Pad/batch causal LM examples.
285
 
286
+ # Choose optimizer: paged_adamw_32bit for 4-bit GPU; fall back to AdamW on CPU/no-4bit.
287
+ optim_name = "paged_adamw_32bit" if cfg.use_4bit and not force_cpu else "adamw_torch"
288
+
289
  training_args = TrainingArguments(
290
  output_dir=cfg.output_dir,
291
  per_device_train_batch_size=cfg.per_device_batch_size,
 
298
  bf16=use_bf16,
299
  fp16=use_fp16,
300
  report_to=["wandb"],
301
+ optim=optim_name,
302
  )
303
  # Trainer configuration (logging, saving, optimizer, precision).
304