Upload 13 files

- finetune_lora.py +2 -2
- finetune_lora_origin.py +212 -0
- scripts/__pycache__/prepare_alpaca.cpython-311.pyc +0 -0
- scripts/prepare_alpaca.py +2 -2
- scripts/prepare_alpaca_origin.py +130 -0
finetune_lora.py
CHANGED
@@ -28,7 +28,7 @@ learning_rate = 3e-4
batch_size = 128
micro_batch_size = 4
gradient_accumulation_steps = batch_size // micro_batch_size
-max_iters = 50000 * 3 // micro_batch_size
+max_iters = 10000 #50000 * 3 // micro_batch_size
weight_decay = 0.0
max_seq_length = 256 # see scripts/prepare_alpaca.py
lora_r = 8

@@ -44,7 +44,7 @@ def main(
):

    #fabric = L.Fabric(accelerator="cuda", precision="bf16-true")
-    fabric = L.Fabric(accelerator="cpu", devices=
+    fabric = L.Fabric(accelerator="cpu", devices=1, precision="bf16-true")
    fabric.launch()
    fabric.seed_everything(1337 + fabric.global_rank)
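For orientation, max_iters counts micro-batch iterations, not optimizer steps: the optimizer only steps once every gradient_accumulation_steps iterations. A quick sanity check of what the new value implies (plain Python mirroring the constants above; the numbers are copied from this diff, nothing here is part of the script itself):

    batch_size = 128
    micro_batch_size = 4
    gradient_accumulation_steps = batch_size // micro_batch_size  # 32 micro-batches per optimizer step
    max_iters = 10000  # micro-batch iterations, not weight updates

    optimizer_steps = max_iters // gradient_accumulation_steps
    print(optimizer_steps)  # 312 -> about 312 weight updates over the whole run

So dropping max_iters from 50000 * 3 // micro_batch_size (37,500 iterations) to 10,000 cuts the run to roughly a quarter, which is consistent with the switch to a CPU-only Fabric, presumably for local smoke-testing.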
finetune_lora_origin.py
ADDED
@@ -0,0 +1,212 @@
+"""
+Instruction-tuning with LoRA on the Alpaca dataset.
+
+Note: If you run into a CUDA error "Expected is_sm80 to be true, but got false", uncomment the line
+`torch.backends.cuda.enable_flash_sdp(False)` in the script below (see https://github.com/Lightning-AI/lit-llama/issues/101).
+"""
+import sys
+from pathlib import Path
+import os
+import time
+
+import lightning as L
+import numpy as np
+import torch
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+from generate import generate
+from lit_llama.lora import mark_only_lora_as_trainable, lora, lora_state_dict
+from lit_llama.model import LLaMA, LLaMAConfig
+from lit_llama.tokenizer import Tokenizer
+from scripts.prepare_alpaca import generate_prompt
+
+
+eval_interval = 100
+save_interval = 100
+eval_iters = 100
+log_interval = 1
+
+# Hyperparameters
+learning_rate = 3e-4
+batch_size = 128
+micro_batch_size = 4
+gradient_accumulation_steps = batch_size // micro_batch_size
+max_iters = 50000 * 3 // micro_batch_size
+weight_decay = 0.0
+max_seq_length = 256 # see scripts/prepare_alpaca.py
+lora_r = 8
+lora_alpha = 16
+lora_dropout = 0.05
+warmup_steps = 100
+
+
+def main(
+    data_dir: str = "data/alpaca",
+    pretrained_path: str = "checkpoints/lit-llama/7B/lit-llama.pth",
+    out_dir: str = "out/lora/alpaca",
+):
+
+    fabric = L.Fabric(accelerator="cpu", devices=1, precision="bf16-true")
+    # fabric = L.Fabric(accelerator="cuda", devices=1, precision="bf16-true")
+    fabric.launch()
+    fabric.seed_everything(1337 + fabric.global_rank)
+
+    if fabric.global_rank == 0:
+        os.makedirs(out_dir, exist_ok=True)
+
+    train_data, val_data = load_datasets(data_dir=data_dir)
+
+    config = LLaMAConfig.from_name("7B")
+    config.block_size = max_seq_length
+
+    checkpoint = torch.load(pretrained_path)
+
+    with fabric.init_module(), lora(r=lora_r, alpha=lora_alpha, dropout=lora_dropout, enabled=True):
+        model = LLaMA(config)
+        # strict=False because missing keys due to LoRA weights not contained in checkpoint state
+        model.load_state_dict(checkpoint, strict=False)
+
+    mark_only_lora_as_trainable(model)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
+    model, optimizer = fabric.setup(model, optimizer)
+    train(fabric, model, optimizer, train_data, val_data, out_dir)
+
+    # Save the final LoRA checkpoint at the end of training
+    checkpoint = lora_state_dict(model)
+    fabric.save(os.path.join(out_dir, "lit-llama-lora-finetuned.pth"), checkpoint)
+
+
+def train(
+    fabric: L.Fabric,
+    model: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    train_data: np.ndarray,
+    val_data: np.ndarray,
+    out_dir: str,
+) -> None:
+    """The training loop.
+
+    Loosely based on the nanoGPT implementation: https://github.com/karpathy/nanoGPT.
+    """
+    step_count = 0
+
+    for iter_num in range(max_iters):
+
+        if step_count <= warmup_steps:
+            # linear warmup
+            lr = learning_rate * step_count / warmup_steps
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = lr
+
+        t0 = time.time()
+
+        input_ids, targets = get_batch(fabric, train_data)
+        logits = model(input_ids)
+        loss = loss_fn(logits, targets)
+        fabric.backward(loss)
+
+        if (iter_num + 1) % gradient_accumulation_steps == 0:
+            optimizer.step()
+            optimizer.zero_grad()
+            step_count += 1
+
+            if step_count % eval_interval == 0:
+                val_loss = validate(fabric, model, val_data)
+                fabric.print(f"step {iter_num}: val loss {val_loss:.4f}")
+                fabric.barrier()
+
+            if step_count % save_interval == 0:
+                print(f"Saving LoRA weights to {out_dir}")
+                # We are only saving the LoRA weights
+                # TODO: Provide a function/script to merge the LoRA weights with pretrained weights
+                checkpoint = lora_state_dict(model)
+                fabric.save(os.path.join(out_dir, f"iter-{iter_num:06d}-ckpt.pth"), checkpoint)
+
+        dt = time.time() - t0
+        if iter_num % log_interval == 0:
+            fabric.print(f"iter {iter_num}: loss {loss.item():.4f}, time: {dt*1000:.2f}ms")
+
+
+def generate_response(model, instruction):
+    tokenizer = Tokenizer("checkpoints/lit-llama/tokenizer.model")
+    sample = {"instruction": instruction, "input": ""}
+    prompt = generate_prompt(sample)
+    encoded = tokenizer.encode(prompt, bos=True, eos=False, device=model.device)
+
+    output = generate(
+        model,
+        idx=encoded,
+        max_seq_length=max_seq_length,
+        max_new_tokens=100,
+    )
+    output = tokenizer.decode(output)
+    return output # output.split("### Response:")[1].strip()
+
+
+@torch.no_grad()
+def validate(fabric: L.Fabric, model: torch.nn.Module, val_data: np.ndarray) -> torch.Tensor:
+    fabric.print("Validating ...")
+    model.eval()
+    losses = torch.zeros(eval_iters)
+    for k in range(eval_iters):
+        input_ids, targets = get_batch(fabric, val_data)
+        logits = model(input_ids)
+        loss = loss_fn(logits, targets)
+        losses[k] = loss.item()
+    out = losses.mean()
+
+    # produce an example:
+    instruction = "Recommend a movie for me to watch during the weekend and explain the reason."
+
+    output = generate_response(model, instruction)
+    fabric.print(instruction)
+    fabric.print(output)
+
+    model.train()
+    return out.item()
+
+def loss_fn(logits, targets):
+    # shift the targets such that output n predicts token n+1
+    logits = logits[..., :-1, :].contiguous()
+    targets = targets[..., 1:].contiguous()
+    loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
+    return loss
+
+
+def get_batch(fabric: L.Fabric, data: list):
+    ix = torch.randint(len(data), (micro_batch_size,))
+
+    input_ids = [data[i]["input_ids"].type(torch.int64) for i in ix]
+    labels = [data[i]["labels"].type(torch.int64) for i in ix]
+
+    max_len = max(len(s) for s in input_ids)
+
+    def pad_right(x, pad_id):
+        # pad right based on the longest sequence
+        n = max_len - len(x)
+        return torch.cat((x, torch.full((n,), pad_id, dtype=x.dtype)))
+
+    x = torch.stack([pad_right(x, pad_id=0) for x in input_ids])
+    y = torch.stack([pad_right(x, pad_id=-1) for x in labels])
+    x, y = fabric.to_device((x.pin_memory(), y.pin_memory()))
+    return x, y
+
+
+def load_datasets(data_dir):
+    train_data = torch.load(os.path.join(data_dir, "train.pt"))
+    val_data = torch.load(os.path.join(data_dir, "test.pt"))
+    return train_data, val_data
+
+
+if __name__ == "__main__":
+    # Uncomment this line if you see an error: "Expected is_sm80 to be true, but got false"
+    # torch.backends.cuda.enable_flash_sdp(False)
+    torch.set_float32_matmul_precision("high")
+
+    from jsonargparse.cli import CLI
+
+    CLI(main)
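One detail of loss_fn worth spelling out: dropping the last logit and the first target aligns the model's output at position n with the token at position n+1, and ignore_index=-1 means the label padding inserted by get_batch (pad_id=-1) contributes nothing to the loss. A toy check, independent of the script:

    import torch
    import torch.nn.functional as F

    vocab = 5
    logits = torch.randn(1, 4, vocab)        # outputs for a length-4 sequence
    targets = torch.tensor([[3, 1, 4, -1]])  # last position is right-padding

    # same shift as loss_fn: output n predicts token n+1
    shift_logits = logits[..., :-1, :].contiguous()
    shift_targets = targets[..., 1:].contiguous()

    loss = F.cross_entropy(
        shift_logits.view(-1, vocab),
        shift_targets.view(-1),
        ignore_index=-1,  # padded label positions are skipped entirely
    )
    print(loss)  # scalar loss over the 2 non-ignored positions

Note that the inputs are padded with pad_id=0 rather than -1, so padded positions still produce logits; they are only excluded on the label side.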
scripts/__pycache__/prepare_alpaca.cpython-311.pyc
CHANGED
Binary files a/scripts/__pycache__/prepare_alpaca.cpython-311.pyc and b/scripts/__pycache__/prepare_alpaca.cpython-311.pyc differ
scripts/prepare_alpaca.py
CHANGED
@@ -22,8 +22,8 @@ IGNORE_INDEX = -1
def prepare(
    destination_path: Path = Path("data/alpaca"),
    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
-
-    test_split_size: int = 2,
+    test_split_size: int = 200,
+    #test_split_size: int = 2,
    max_seq_length: int = 256,
    seed: int = 42,
    mask_inputs: bool = False, # as in alpaca-lora
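The jump from test_split_size=2 to 200 matters for the validation numbers in finetune_lora.py: validate() averages eval_iters = 100 random micro-batches of 4 drawn with replacement (get_batch uses torch.randint), so with only 2 held-out samples every validation batch recycled the same two prompts. The split itself is reproducible because random_split is seeded; a minimal sketch with a stand-in list (the real script splits the downloaded Alpaca JSON):

    import torch
    from torch.utils.data import random_split

    data = list(range(1_000))  # stand-in for the loaded JSON records
    test_split_size = 200
    train_set, test_set = random_split(
        data,
        lengths=(len(data) - test_split_size, test_split_size),
        generator=torch.Generator().manual_seed(42),  # same seed -> same split every run
    )
    print(len(train_set), len(test_set))  # 800 200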
scripts/prepare_alpaca_origin.py
ADDED
@@ -0,0 +1,130 @@
+"""Implementation derived from https://github.com/tloen/alpaca-lora"""
+import sys
+from pathlib import Path
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+import torch
+import requests
+import json
+from torch.utils.data import random_split
+from lit_llama.tokenizer import Tokenizer
+from tqdm import tqdm
+
+
+DATA_FILE = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json"
+DATA_FILE_NAME = "alpaca_data_cleaned_archive.json"
+IGNORE_INDEX = -1
+
+
+def prepare(
+    destination_path: Path = Path("data/alpaca"),
+    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
+    test_split_size: int = 2000,
+    max_seq_length: int = 256,
+    seed: int = 42,
+    mask_inputs: bool = False, # as in alpaca-lora
+    data_file_name: str = DATA_FILE_NAME
+) -> None:
+    """Prepare the Alpaca dataset for instruction tuning.
+
+    The output is a training and validation dataset saved as `train.pt` and `val.pt`,
+    which stores the preprocessed and tokenized prompts and labels.
+    """
+
+    destination_path.mkdir(parents=True, exist_ok=True)
+    file_path = destination_path / data_file_name
+    download(file_path)
+
+    # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
+    tokenizer = Tokenizer(tokenizer_path)
+
+    with open(file_path, "r") as file:
+        data = json.load(file)
+
+    # Partition the dataset into train and test
+    train_split_size = len(data) - test_split_size
+    train_set, test_set = random_split(
+        data,
+        lengths=(train_split_size, test_split_size),
+        generator=torch.Generator().manual_seed(seed),
+    )
+    train_set, test_set = list(train_set), list(test_set)
+
+    print(f"train has {len(train_set):,} samples")
+    print(f"val has {len(test_set):,} samples")
+
+    print("Processing train split ...")
+    train_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(train_set)]
+    torch.save(train_set, file_path.parent / "train.pt")
+
+    print("Processing test split ...")
+    test_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(test_set)]
+    torch.save(test_set, file_path.parent / "test.pt")
+
+
+def download(file_path: Path):
+    """Downloads the raw json data file and saves it in the given destination."""
+    if file_path.exists():
+        return
+    with open(file_path, "w") as f:
+        f.write(requests.get(DATA_FILE).text)
+
+
+def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
+    """Processes a single sample.
+
+    Each sample in the dataset consists of:
+    - instruction: A string describing the task
+    - input: A string holding a special input value for the instruction.
+        This only applies to some samples, and in others this is empty.
+    - output: The response string
+
+    This function processes this data to produce a prompt text and a label for
+    supervised training. The prompt text is formed as a single message including both
+    the instruction and the input. The label/target is the same message but with the
+    response attached.
+
+    Finally, both the prompt and the label get tokenized. If desired, all tokens
+    in the label that correspond to the original input prompt get masked out (default).
+    """
+    full_prompt = generate_prompt(example)
+    full_prompt_and_response = full_prompt + example["output"]
+    encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
+    encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)
+
+    # The labels are the full prompt with response, but with the prompt masked out
+    labels = encoded_full_prompt_and_response.clone()
+    if mask_inputs:
+        labels[:len(encoded_full_prompt)] = IGNORE_INDEX
+
+    return {**example, "input_ids": encoded_full_prompt_and_response, "input_ids_no_response": encoded_full_prompt, "labels": labels}
+
+
+def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos=True) -> torch.Tensor:
+    return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)
+
+
+def generate_prompt(example):
+    """Generates a standardized message to prompt the model with an instruction, optional input and a
+    'response' field."""
+
+    if example["input"]:
+        return (
+            "Below is an instruction that describes a task, paired with an input that provides further context. "
+            "Write a response that appropriately completes the request.\n\n"
+            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
+        )
+    return (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        f"### Instruction:\n{example['instruction']}\n\n### Response:"
+    )
+
+
+if __name__ == "__main__":
+    from jsonargparse import CLI
+
+    CLI(prepare)
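To make the prompt template and masking concrete: for a sample without an input field, generate_prompt yields the shorter template, and prepare_sample (when mask_inputs=True) overwrites the prompt portion of the labels with IGNORE_INDEX so that only response tokens carry loss. A self-contained sketch with toy token ids standing in for the tokenizer (the ids are made up; only the masking logic mirrors the script):

    sample = {"instruction": "Name three primary colors.", "input": "", "output": "Red, blue, and yellow."}

    prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{sample['instruction']}\n\n### Response:"
    )

    IGNORE_INDEX = -1
    encoded_prompt = [1, 17, 42, 9]              # toy ids for the tokenized prompt
    encoded_full = encoded_prompt + [88, 23, 2]  # toy ids for prompt + response + eos
    labels = list(encoded_full)
    labels[:len(encoded_prompt)] = [IGNORE_INDEX] * len(encoded_prompt)
    print(labels)  # [-1, -1, -1, -1, 88, 23, 2]

Note the interaction with the defaults: prepare() passes mask_inputs=False, so out of the box the loss is also computed over prompt tokens, even though prepare_sample's docstring describes masking as the default.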