Javad Taghia committed on
Commit · 61c72b6
1 Parent(s): e86ddb9
updated with evaluation

Browse files
- .env.example +12 -0
- .gitignore +4 -1
- README.md +52 -0
- evaluation/compare_lora.py +87 -0
- evaluation/simple_inference.py +27 -0
.env.example ADDED
@@ -0,0 +1,12 @@
+# License: CC BY-NC-SA 4.0. Rights belong to Javad Taghia (taghia.javad@gmail.com).
+
+# Copy this file to `.env` and fill in secrets.
+# Never commit your real API keys.
+
+WANDB_API_KEY=your_wandb_api_key
+WANDB_PROJECT=tulu-laptop-run
+WANDB_ENTITY=your_wandb_username_or_team
+
+# Optional: where to cache/download base models and datasets
+# e.g., /Volumes/JTQ-s/______GITLAB____/downloaded_base_models
+BASE_MODEL_CACHE=
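How `train_tulu.py` consumes these values is not shown in this commit; if you want to load them from `.env` in your own code, a minimal sketch, assuming the `python-dotenv` package is installed:

```python
# Sketch only, not part of this commit: load .env into the process environment.
# Assumes `pip install python-dotenv`; variable names follow .env.example above.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

wandb_project = os.environ.get("WANDB_PROJECT", "tulu-laptop-run")
cache_dir = os.environ.get("BASE_MODEL_CACHE") or None  # optional cache location

# cache_dir can then be passed along, e.g. from_pretrained(..., cache_dir=cache_dir).
```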
.gitignore CHANGED
@@ -2,7 +2,7 @@
 
 # Secrets and env files
 .env
-
+
 
 # Python caches and artifacts
 __pycache__/
@@ -13,3 +13,6 @@ offload/
 
 # Local W&B runs/artifacts
 wandb/
+
+# Training outputs and adapters
+outputs/
README.md CHANGED
@@ -97,7 +97,59 @@ Key flags:
 - Finetuned adapters + tokenizer are written to `outputs/tulu-lora` (configurable via `--output_dir`).
 - `outputs/` is tracked via Git LFS (`.gitattributes`), so weights can be committed and pushed to the Hub. Run `git lfs install` once, then `git add outputs/...` before committing.
 
+## Evaluation (inference/compare)
+- Quick smoke test with the saved adapter (edit `lora_dir` inside if you used a different path):
+```bash
+python evaluation/simple_inference.py
+```
+- Compare base vs. LoRA outputs side by side:
+```bash
+python evaluation/compare_lora.py \
+  --base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+  --lora_dir outputs/tinyllama-lora \
+  --prompt "Explain LoRA in one sentence."
+```
+Optional flags: `--max_new_tokens`, `--temperature`, `--top_p`, `--torch_dtype`.
+
 ## Troubleshooting
 - OOM? Reduce `max_seq_length`, increase `gradient_accumulation_steps`, or switch to a smaller dataset (e.g., use a tiny instruction set like `mlabonne/guanaco-llama2-1k`, or subset your dataset with `--dataset_name your/dataset --max_train_samples 500` in code/script).
 - bitsandbytes import errors on macOS/CPU: run with `--use_4bit false` or use a Linux+CUDA machine.
 - bitsandbytes install error? We pin to `0.42.0`, the latest widely distributed wheel. If you cannot install it (CPU-only/MPS), remove it from `requirements.txt` and set `--use_4bit false`.
+
+## CUDA (GPU) quickstart
+```bash
+pip install --upgrade "torch==2.2.*" "torchvision==0.17.*" "torchaudio==2.2.*" --index-url https://download.pytorch.org/whl/cu121
+pip install --upgrade "bitsandbytes>=0.43.1"
+pip install --upgrade "transformers>=4.40.0"
+
+python train_tulu.py \
+  --model_name TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+  --output_dir outputs/tinyllama-lora \
+  --offload_folder offload \
+  --device cuda \
+  --torch_dtype auto \
+  --max_seq_length 512 \
+  --per_device_batch_size 2 \
+  --gradient_accumulation_steps 8 \
+  --num_train_epochs 1 \
+  --use_4bit \
+  --instruction_field instruction \
+  --input_field input \
+  --output_field output
+```
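The OOM bullet in Troubleshooting mentions subsetting the dataset in code; a minimal sketch of what that could look like (not part of the commit; assumes the Hugging Face `datasets` library and uses the tiny set the README already names):

```python
# Sketch only: cap training data at a few hundred rows for a quick, OOM-safe run.
from datasets import load_dataset

ds = load_dataset("mlabonne/guanaco-llama2-1k", split="train")
max_train_samples = 500  # mirrors the --max_train_samples example in Troubleshooting
ds = ds.select(range(min(max_train_samples, len(ds))))
print(ds)  # shows the reduced number of rows
```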
evaluation/compare_lora.py ADDED
@@ -0,0 +1,87 @@
+import argparse
+from typing import Optional
+
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def parse_args():
+    p = argparse.ArgumentParser(description="Compare base vs. fine-tuned LoRA outputs.")
+    p.add_argument("--base_model", required=True, help="Base model id or local path.")
+    p.add_argument("--lora_dir", required=True, help="Path to LoRA adapter folder (e.g., outputs/tinyllama-lora).")
+    p.add_argument("--prompt", required=True, help="Prompt to generate with.")
+    p.add_argument("--max_new_tokens", type=int, default=128)
+    p.add_argument("--temperature", type=float, default=0.7)
+    p.add_argument("--top_p", type=float, default=0.9)
+    p.add_argument(
+        "--torch_dtype",
+        default="auto",
+        choices=["auto", "float16", "bfloat16", "float32"],
+        help="Force dtype for model load.",
+    )
+    return p.parse_args()
+
+
+def resolve_dtype(name: str) -> Optional[torch.dtype]:
+    if name == "auto":
+        return None
+    return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name]
+
+
+def generate(model, tokenizer, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.inference_mode():
+        output = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+        )
+    return tokenizer.decode(output[0], skip_special_tokens=True)
+
+
+def main():
+    args = parse_args()
+    torch_dtype = resolve_dtype(args.torch_dtype)
+
+    tokenizer = AutoTokenizer.from_pretrained(args.lora_dir, use_fast=False)
+
+    base_model = AutoModelForCausalLM.from_pretrained(
+        args.base_model,
+        device_map="auto",
+        torch_dtype=torch_dtype,
+    )
+    lora_wrapped = AutoModelForCausalLM.from_pretrained(
+        args.base_model,
+        device_map="auto",
+        torch_dtype=torch_dtype,
+    )
+    lora_wrapped = PeftModel.from_pretrained(lora_wrapped, args.lora_dir)
+
+    base_out = generate(
+        base_model,
+        tokenizer,
+        args.prompt,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_p=args.top_p,
+    )
+    lora_out = generate(
+        lora_wrapped,
+        tokenizer,
+        args.prompt,
+        max_new_tokens=args.max_new_tokens,
+        temperature=args.temperature,
+        top_p=args.top_p,
+    )
+
+    print("=== Base model ===")
+    print(base_out)
+    print("\n=== LoRA model ===")
+    print(lora_out)
+
+
+if __name__ == "__main__":
+    main()
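`compare_lora.py` keeps the adapter attached through `PeftModel`. If you later want a single standalone checkpoint instead, a minimal sketch of merging the adapter into the base weights (not part of this commit; the output path is hypothetical):

```python
# Sketch only: fold the LoRA adapter into the base model and save a plain checkpoint.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16
)
merged = PeftModel.from_pretrained(base, "outputs/tinyllama-lora").merge_and_unload()

out_dir = "outputs/tinyllama-merged"  # hypothetical path, not used elsewhere in the repo
merged.save_pretrained(out_dir)
AutoTokenizer.from_pretrained("outputs/tinyllama-lora").save_pretrained(out_dir)
```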
evaluation/simple_inference.py ADDED
@@ -0,0 +1,27 @@
+import torch
+from peft import PeftConfig, PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def main():
+    lora_dir = "outputs/tinyllama-lora"  # change to your adapter path
+    cfg = PeftConfig.from_pretrained(lora_dir)
+    base_model = cfg.base_model_name_or_path  # base model id/path
+
+    tokenizer = AutoTokenizer.from_pretrained(lora_dir, use_fast=False)
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        device_map="auto",
+        torch_dtype=torch.float16,
+    )
+    model = PeftModel.from_pretrained(model, lora_dir)
+
+    prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Input:\nN/A\n\n### Response:\n"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    with torch.inference_mode():
+        out = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
+    print(tokenizer.decode(out[0], skip_special_tokens=True))
+
+
+if __name__ == "__main__":
+    main()
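The Troubleshooting section calls out macOS/CPU-only machines; there, `torch.float16` plus `device_map="auto"` can be slow or unsupported. A minimal CPU-friendly variant of the load step in `simple_inference.py` (a sketch, not part of this commit):

```python
# Sketch only: CPU-friendly variant of the load in evaluation/simple_inference.py.
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

lora_dir = "outputs/tinyllama-lora"  # same adapter path as the script above
base_id = PeftConfig.from_pretrained(lora_dir).base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(lora_dir, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.float32)  # fp32 on CPU
model = PeftModel.from_pretrained(model, lora_dir)
model.eval()
```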