nvan13 commited on
Commit
f4dcc30
·
verified ·
1 Parent(s): 51cbdf4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. generation/control/oldm/hack.py +111 -0
  2. generation/control/oldm/lora.py +1119 -0
  3. generation/control/oldm/lora_ldm.py +343 -0
  4. generation/control/oldm/model.py +28 -0
  5. generation/control/oldm/oft_ldm.py +353 -0
  6. generation/subject/download_dreambooth.sh +4 -0
  7. generation/subject/evaluate.py +462 -0
  8. generation/subject/get_result.py +62 -0
  9. generation/subject/oft_utils/__init__.py +2 -0
  10. generation/subject/oft_utils/attention_processor.py +1036 -0
  11. generation/subject/oft_utils/mhe.py +360 -0
  12. generation/subject/train_dreambooth_hra.py +1123 -0
  13. generation/subject/train_dreambooth_hra.sh +186 -0
  14. llama/data/MATH_test.jsonl +0 -0
  15. llama/data/gsm8k_test.jsonl +0 -0
  16. llama/data/oft/__init__.py +20 -0
  17. llama/data/oft/config.py +119 -0
  18. llama/data/oft/layer.py +388 -0
  19. llama/data/oft/model.py +106 -0
  20. llama/finetune_32.py +368 -0
  21. llama/inference/MATH_inference.py +108 -0
  22. llama/inference/grader.py +141 -0
  23. llama/inference/gsm8k_inference.py +127 -0
  24. llama/inference/util.py +253 -0
  25. llama/merge_adapter_to_base_model.py +27 -0
  26. llama/output/cp1e4/ft/README.md +202 -0
  27. llama/output/cp1e4/ft/adapter_config.json +23 -0
  28. llama/output/cp1e4/ft/added_tokens.json +3 -0
  29. llama/output/cp1e4/ft/special_tokens_map.json +30 -0
  30. llama/output/cp1e4/ft/tokenizer.json +0 -0
  31. llama/output/cp1e4/ft/tokenizer_config.json +51 -0
  32. llama/output/cp1e5/ft/README.md +202 -0
  33. llama/output/cp1e5/ft/adapter_config.json +23 -0
  34. llama/output/cp1e5/trainer_state.json +30 -0
  35. llama/output/cp1e5N/ft/README.md +202 -0
  36. llama/output/cp1e5N/ft/adapter_config.json +23 -0
  37. llama/output/cp1e5N/ft/added_tokens.json +3 -0
  38. llama/output/cp1e5N/ft/special_tokens_map.json +30 -0
  39. llama/output/cp1e5N/ft/tokenizer.json +0 -0
  40. llama/output/cp1e5N/ft/tokenizer_config.json +51 -0
  41. llama/output/cp3e5/ft/README.md +202 -0
  42. llama/output/cp3e5/ft/adapter_config.json +23 -0
  43. llama/output/cp3e5/trainer_state.json +72 -0
  44. llama/output/cp3e5N/ft/README.md +202 -0
  45. llama/output/cp3e5N/ft/adapter_config.json +23 -0
  46. llama/output/cp3e5N/ft/added_tokens.json +3 -0
  47. llama/output/cp3e5N/ft/special_tokens_map.json +30 -0
  48. llama/output/cp3e5N/ft/tokenizer.json +0 -0
  49. llama/output/cp3e5N/ft/tokenizer_config.json +51 -0
  50. llama/output/cpr1/ft/README.md +202 -0
generation/control/oldm/hack.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import einops
3
+
4
+ import ldm.modules.encoders.modules
5
+ import ldm.modules.attention
6
+
7
+ from transformers import logging
8
+ from ldm.modules.attention import default
9
+
10
+
11
def disable_verbosity():
    """Silence transformers warnings by raising its log level to ERROR."""
    logging.set_verbosity_error()
    print('logging improved.')
15
+
16
+
17
def enable_sliced_attention():
    # Globally monkey-patch CrossAttention so every attention layer in the LDM
    # model uses the chunked (memory-saving) implementation defined below.
    ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward
    print('Enabled sliced_attention.')
    return
21
+
22
+
23
def hack_everything(clip_skip=0):
    # Apply all CLIP-related monkey patches at once: quiet transformers
    # logging, plus the long-prompt CLIP forward defined below.
    disable_verbosity()
    ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward
    # clip_skip > 1 makes _hacked_clip_forward take an earlier hidden state
    # instead of the final transformer output (see transformer_encode there).
    ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip
    print('Enabled clip hacks.')
    return
29
+
30
+
31
+ # Written by Lvmin
32
# Written by Lvmin
def _hacked_clip_forward(self, text):
    """Encode prompts longer than CLIP's 77-token window.

    The prompt is tokenized without special tokens, split into three 75-token
    segments, each wrapped with BOS/EOS and padded to 77, encoded separately,
    and the three encodings are concatenated along the sequence axis.
    Tokens beyond 3 * 75 are silently dropped.
    """
    PAD = self.tokenizer.pad_token_id
    EOS = self.tokenizer.eos_token_id
    BOS = self.tokenizer.bos_token_id

    def tokenize(t):
        # No truncation and no BOS/EOS here; they are added per-segment below.
        return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"]

    def transformer_encode(t):
        if self.clip_skip > 1:
            # Take an earlier hidden state and re-apply the final layer norm
            # ("clip skip" as popularized by A1111-style UIs).
            rt = self.transformer(input_ids=t, output_hidden_states=True)
            return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip])
        else:
            return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state

    def split(x):
        # Three consecutive 75-token windows (75 content + BOS + EOS = 77).
        return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3]

    def pad(x, p, i):
        # Right-pad list x with p up to length i (or truncate to i).
        return x[:i] if len(x) >= i else x + [p] * (i - len(x))

    raw_tokens_list = tokenize(text)
    tokens_list = []

    for raw_tokens in raw_tokens_list:
        raw_tokens_123 = split(raw_tokens)
        raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123]
        raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123]
        tokens_list.append(raw_tokens_123)

    tokens_list = torch.IntTensor(tokens_list).to(self.device)

    # (batch, 3, 77) -> (batch*3, 77): encode the three segments as a batch.
    feed = einops.rearrange(tokens_list, 'b f i -> (b f) i')
    y = transformer_encode(feed)
    # (batch*3, 77, dim) -> (batch, 231, dim): concatenate segments in sequence.
    z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3)

    return z
69
+
70
+
71
+ # Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
72
# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py
def _hacked_sliced_attentin_forward(self, x, context=None, mask=None):
    """Memory-efficient CrossAttention.forward replacement.

    Computes attention one (batch*head) slice at a time so only a single
    slice's attention matrix is alive at once, trading speed for peak memory.
    NOTE: the `mask` argument is accepted for signature compatibility but is
    ignored by this implementation.
    """
    h = self.heads

    q = self.to_q(x)
    context = default(context, x)  # self-attention when no context is given
    k = self.to_k(context)
    v = self.to_v(context)
    del context, x  # free references early to reduce peak memory

    # (b, n, h*d) -> (b*h, n, d)
    q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))

    limit = k.shape[0]
    att_step = 1  # slices of size 1 along the (b*h) axis
    q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0))
    k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0))
    v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0))

    # Reverse so pop() (O(1) from the end) yields chunks in original order.
    q_chunks.reverse()
    k_chunks.reverse()
    v_chunks.reverse()
    sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
    del k, q, v  # only the chunk lists are needed from here on
    for i in range(0, limit, att_step):
        q_buffer = q_chunks.pop()
        k_buffer = k_chunks.pop()
        v_buffer = v_chunks.pop()
        # Scaled dot-product scores for this slice only.
        sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale

        del k_buffer, q_buffer
        # attention, what we cannot get enough of, by chunks

        sim_buffer = sim_buffer.softmax(dim=-1)

        # Weighted sum of values for this slice.
        sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer)
        del v_buffer
        sim[i:i + att_step, :, :] = sim_buffer

        del sim_buffer
    # (b*h, n, d) -> (b, n, h*d)
    sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h)
    return self.to_out(sim)
generation/control/oldm/lora.py ADDED
@@ -0,0 +1,1119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script is retrived from lora available at:
3
+ https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
4
+
5
+ Original Author: Simo Ryu
6
+ License: Apache License 2.0
7
+ """
8
+
9
+ import json
10
+ import math
11
+ from itertools import groupby
12
+ from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union
13
+
14
+ import pickle
15
+
16
+ import numpy as np
17
+ import PIL
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+
22
+ try:
23
+ from safetensors.torch import safe_open
24
+ from safetensors.torch import save_file as safe_save
25
+
26
+ safetensors_available = True
27
+ except ImportError:
28
+ from .safe_open import safe_open
29
+
30
+ def safe_save(
31
+ tensors: Dict[str, torch.Tensor],
32
+ filename: str,
33
+ metadata: Optional[Dict[str, str]] = None,
34
+ ) -> None:
35
+ raise EnvironmentError(
36
+ "Saving safetensors requires the safetensors library. Please install with pip or similar."
37
+ )
38
+
39
+ safetensors_available = False
40
+
41
+
42
class LoraInjectedLinear(nn.Module):
    """A frozen-capable nn.Linear plus a trainable low-rank (LoRA) residual.

    Output is ``linear(x) + lora_up(selector(lora_down(x))) * scale``.
    ``lora_up`` starts at zero, so a freshly injected module behaves exactly
    like the wrapped linear layer.
    """

    def __init__(
        self, in_features, out_features, bias=False, r=4, dropout_p=0.1, scale=1.0
    ):
        super().__init__()

        # The low-rank factorization only makes sense for r <= min(dims).
        if r > min(in_features, out_features):
            raise ValueError(
                f"LoRA rank {r} must be less or equal than {min(in_features, out_features)}"
            )
        self.r = r
        self.linear = nn.Linear(in_features, out_features, bias)
        self.lora_down = nn.Linear(in_features, r, bias=False)
        # Registered for checkpoint/interface compatibility; forward() does
        # not apply it (the dropout path is disabled upstream as well).
        self.dropout = nn.Dropout(dropout_p)
        self.lora_up = nn.Linear(r, out_features, bias=False)
        self.scale = scale
        self.selector = nn.Identity()

        # down: small random init; up: zeros -> identity behavior at start.
        nn.init.normal_(self.lora_down.weight, std=1 / r)
        nn.init.zeros_(self.lora_up.weight)

    def forward(self, input):
        base = self.linear(input)
        low_rank = self.lora_up(self.selector(self.lora_down(input)))
        return base + low_rank * self.scale

    def realize_as_lora(self):
        """Return ``(up, down)`` weight tensors with ``scale`` folded into up."""
        return self.lora_up.weight.data * self.scale, self.lora_down.weight.data

    def set_selector_from_diag(self, diag: torch.Tensor):
        """Install a diagonal rank-selector from a 1-D tensor of shape (r,)."""
        assert diag.shape == (self.r,)
        selector = nn.Linear(self.r, self.r, bias=False)
        selector.weight.data = (
            torch.diag(diag)
            .to(self.lora_up.weight.device)
            .to(self.lora_up.weight.dtype)
        )
        self.selector = selector
81
+
82
+
83
class LoraInjectedConv2d(nn.Module):
    """A frozen-capable nn.Conv2d plus a trainable low-rank (LoRA) residual.

    Output is ``conv(x) + lora_up(selector(lora_down(x))) * scale`` where
    ``lora_down`` mirrors the wrapped conv's geometry with r output channels
    and ``lora_up`` is a 1x1 conv back to out_channels. ``lora_up`` starts at
    zero, so a freshly injected module behaves exactly like the wrapped conv.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups: int = 1,
        bias: bool = True,
        r: int = 4,
        dropout_p: float = 0.1,
        scale: float = 1.0,
    ):
        super().__init__()
        if r > min(in_channels, out_channels):
            raise ValueError(
                f"LoRA rank {r} must be less or equal than {min(in_channels, out_channels)}"
            )
        self.r = r
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

        # Same geometry as the wrapped conv so spatial dims line up with it.
        self.lora_down = nn.Conv2d(
            in_channels=in_channels,
            out_channels=r,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=False,
        )
        # Kept for checkpoint/interface compatibility; unused in forward().
        self.dropout = nn.Dropout(dropout_p)
        self.lora_up = nn.Conv2d(
            in_channels=r,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.selector = nn.Identity()
        self.scale = scale

        # down: small random init; up: zeros -> identity behavior at start.
        nn.init.normal_(self.lora_down.weight, std=1 / r)
        nn.init.zeros_(self.lora_up.weight)

    def forward(self, input):
        # BUG FIX: the original (copied from the Linear variant) called
        # self.linear(input), which does not exist on this class and raised
        # AttributeError. The frozen path of a conv wrapper is self.conv.
        return (
            self.conv(input) + self.lora_up(self.selector(self.lora_down(input))) * self.scale
        )

    def realize_as_lora(self):
        """Return ``(up, down)`` weight tensors with ``scale`` folded into up."""
        return self.lora_up.weight.data * self.scale, self.lora_down.weight.data

    def set_selector_from_diag(self, diag: torch.Tensor):
        """Install a diagonal rank-selector from a 1-D tensor of shape (r,)."""
        assert diag.shape == (self.r,)
        self.selector = nn.Conv2d(
            in_channels=self.r,
            out_channels=self.r,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        # BUG FIX: a Conv2d weight is 4-D (out, in, kH, kW); the original
        # assigned the raw 2-D torch.diag(diag) matrix, which broke forward().
        self.selector.weight.data = torch.diag(diag).view(self.r, self.r, 1, 1)

        # same device + dtype as lora_up
        self.selector.weight.data = self.selector.weight.data.to(
            self.lora_up.weight.device
        ).to(self.lora_up.weight.dtype)
167
+
168
+
169
# Class names of modules whose inner Linear/Conv layers receive LoRA injection.
UNET_DEFAULT_TARGET_REPLACE = {"CrossAttention", "Attention", "GEGLU"}

# Extended set: additionally adapts layers inside the UNet ResBlocks.
UNET_EXTENDED_TARGET_REPLACE = {"ResBlock", "CrossAttention", "Attention", "GEGLU"}

TEXT_ENCODER_DEFAULT_TARGET_REPLACE = {"CLIPAttention"}

# NOTE: currently identical to the default text-encoder set.
TEXT_ENCODER_EXTENDED_TARGET_REPLACE = {"CLIPAttention"}

DEFAULT_TARGET_REPLACE = UNET_DEFAULT_TARGET_REPLACE

# Metadata marker identifying Textual Inversion embeds in safetensors files.
EMBED_FLAG = "<embed>"
180
+
181
+
182
+ def _find_children(
183
+ model,
184
+ search_class: List[Type[nn.Module]] = [nn.Linear],
185
+ ):
186
+ """
187
+ Find all modules of a certain class (or union of classes).
188
+ Returns all matching modules, along with the parent of those moduless and the
189
+ names they are referenced by.
190
+ """
191
+ result = []
192
+ for parent in model.modules():
193
+ for name, module in parent.named_children():
194
+ if any([isinstance(module, _class) for _class in search_class]):
195
+ result.append((parent, name, module)) # Append the result to the list
196
+
197
+ return result # Return the list instead of using 'yield'
198
+
199
+
200
def _find_modules_v2(
    model,
    ancestor_class: Optional[Set[str]] = None,
    search_class: List[Type[nn.Module]] = [nn.Linear],
    exclude_children_of: Optional[List[Type[nn.Module]]] = [
        LoraInjectedLinear,
        LoraInjectedConv2d,
    ],
):
    """
    Find all modules of a certain class (or union of classes) that are direct or
    indirect descendants of other modules of a certain class (or union of classes).

    Returns all matching modules, along with the parent of those modules and the
    names they are referenced by.
    """

    # Get the targets we should replace all linears under.
    if ancestor_class is not None:
        ancestors = (
            module
            for module in model.modules()
            if module.__class__.__name__ in ancestor_class
        )
    else:
        # Fall back to naively iterating over every module in the model.
        ancestors = [module for module in model.modules()]

    results = []
    # For each target find every search_class module that isn't a child of an
    # already-injected Lora module.
    for ancestor in ancestors:
        for fullname, module in ancestor.named_modules():
            if any([isinstance(module, _class) for _class in search_class]):
                # Walk down from the ancestor to this module's direct parent.
                *path, name = fullname.split(".")
                parent = ancestor
                while path:
                    parent = parent.get_submodule(path.pop(0))
                # Skip if the direct parent is an excluded (e.g. Lora-injected)
                # module; only the immediate parent is checked, not all ancestors.
                if exclude_children_of and any(
                    [isinstance(parent, _class) for _class in exclude_children_of]
                ):
                    continue
                results.append((parent, name, module))  # Append the result to the list

    return results  # Return the list instead of using 'yield'
245
+
246
def _find_modules_old(
    model,
    ancestor_class: Set[str] = DEFAULT_TARGET_REPLACE,
    search_class: List[Type[nn.Module]] = [nn.Linear],
    exclude_children_of: Optional[List[Type[nn.Module]]] = [LoraInjectedLinear],
):
    """Legacy module search: (ancestor, child_name, child) for every submodule
    of a target-named ancestor whose class is exactly in ``search_class``.

    NOTE: ``exclude_children_of`` is accepted for signature parity with
    _find_modules_v2 but is not consulted here.
    """
    matches = []
    for candidate in model.modules():
        if candidate.__class__.__name__ not in ancestor_class:
            continue
        for child_name, child in candidate.named_modules():
            if child.__class__ in search_class:
                matches.append((candidate, child_name, child))
    return matches
261
+
262
+
263
# Active search implementation; the legacy variant is kept for reference.
_find_modules = _find_modules_v2
# _find_modules = _find_modules_old
265
+
266
def inject_trainable_lora(
    model: nn.Module,
    target_replace_module: Set[str] = DEFAULT_TARGET_REPLACE,
    r: int = 4,
    loras=None,  # path to lora .pt
    verbose: bool = False,
    dropout_p: float = 0.0,
    scale: float = 1.0,
):
    """
    Inject LoRA into every nn.Linear found under the target modules, in place.

    Returns (require_grad_params, names): generators over the new trainable
    lora_up/lora_down parameters, and the replaced child names.
    If ``loras`` is given it must be a .pt path holding a flat
    [up0, down0, up1, down1, ...] list (as written by save_lora_weight);
    weights are consumed in module-discovery order.
    """

    require_grad_params = []
    names = []

    # Idiom fix: compare against None with `is not`, not `!=`.
    if loras is not None:
        loras = torch.load(loras)

    for _module, name, _child_module in _find_modules(
        model, target_replace_module, search_class=[nn.Linear]
    ):

        weight = _child_module.weight
        bias = _child_module.bias
        if verbose:
            print("LoRA Injection : injecting lora into ", name)
            print("LoRA Injection : weight shape", weight.shape)
        _tmp = LoraInjectedLinear(
            _child_module.in_features,
            _child_module.out_features,
            _child_module.bias is not None,
            r=r,
            dropout_p=dropout_p,
            scale=scale,
        )
        # Reuse the original frozen weights so behavior is unchanged at start.
        _tmp.linear.weight = weight
        if bias is not None:
            _tmp.linear.bias = bias

        # switch the module
        _tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
        _module._modules[name] = _tmp

        require_grad_params.append(_module._modules[name].lora_up.parameters())
        require_grad_params.append(_module._modules[name].lora_down.parameters())

        if loras is not None:
            _module._modules[name].lora_up.weight = loras.pop(0)
            _module._modules[name].lora_down.weight = loras.pop(0)

        _module._modules[name].lora_up.weight.requires_grad = True
        _module._modules[name].lora_down.weight.requires_grad = True
        names.append(name)

    return require_grad_params, names
322
+
323
+
324
def inject_trainable_lora_extended(
    model: nn.Module,
    target_replace_module: Set[str] = UNET_EXTENDED_TARGET_REPLACE,
    r: int = 4,
    loras=None,  # path to lora .pt
):
    """
    Inject LoRA into every nn.Linear AND nn.Conv2d found under the target
    modules, in place. Returns (require_grad_params, names) like
    inject_trainable_lora. ``loras``, if given, is a .pt path holding a flat
    [up0, down0, up1, down1, ...] list consumed in discovery order.
    """

    require_grad_params = []
    names = []

    if loras != None:
        loras = torch.load(loras)

    for _module, name, _child_module in _find_modules(
        model, target_replace_module, search_class=[nn.Linear, nn.Conv2d]
    ):
        if _child_module.__class__ == nn.Linear:
            weight = _child_module.weight
            bias = _child_module.bias
            _tmp = LoraInjectedLinear(
                _child_module.in_features,
                _child_module.out_features,
                _child_module.bias is not None,
                r=r,
            )
            # Reuse the frozen weights of the replaced layer.
            _tmp.linear.weight = weight
            if bias is not None:
                _tmp.linear.bias = bias
        elif _child_module.__class__ == nn.Conv2d:
            weight = _child_module.weight
            bias = _child_module.bias
            # Mirror the conv geometry so the injected module is drop-in.
            _tmp = LoraInjectedConv2d(
                _child_module.in_channels,
                _child_module.out_channels,
                _child_module.kernel_size,
                _child_module.stride,
                _child_module.padding,
                _child_module.dilation,
                _child_module.groups,
                _child_module.bias is not None,
                r=r,
            )

            _tmp.conv.weight = weight
            if bias is not None:
                _tmp.conv.bias = bias

        # switch the module
        # NOTE(review): if _child_module were neither class, _tmp/weight would
        # be unbound here — safe only because search_class restricts matches.
        _tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
        if bias is not None:
            _tmp.to(_child_module.bias.device).to(_child_module.bias.dtype)

        _module._modules[name] = _tmp

        require_grad_params.append(_module._modules[name].lora_up.parameters())
        require_grad_params.append(_module._modules[name].lora_down.parameters())

        if loras != None:
            _module._modules[name].lora_up.weight = loras.pop(0)
            _module._modules[name].lora_down.weight = loras.pop(0)

        _module._modules[name].lora_up.weight.requires_grad = True
        _module._modules[name].lora_down.weight.requires_grad = True
        names.append(name)

    return require_grad_params, names
393
+
394
+
395
def extract_lora_ups_down(model, target_replace_module=DEFAULT_TARGET_REPLACE):
    """Collect (lora_up, lora_down) module pairs from every injected layer.

    Raises ValueError if the model contains no injected Lora modules.
    """
    pairs = [
        (child.lora_up, child.lora_down)
        for _parent, _name, child in _find_modules(
            model,
            target_replace_module,
            search_class=[LoraInjectedLinear, LoraInjectedConv2d],
        )
    ]

    if not pairs:
        raise ValueError("No lora injected.")

    return pairs
410
+
411
+
412
def extract_lora_as_tensor(
    model, target_replace_module=DEFAULT_TARGET_REPLACE, as_fp16=True
):
    """Collect (up, down) weight tensors (scale folded in) from every injected
    layer, optionally cast to float16.

    Raises ValueError if the model contains no injected Lora modules.
    """
    extracted = []

    for _parent, _name, child in _find_modules(
        model,
        target_replace_module,
        search_class=[LoraInjectedLinear, LoraInjectedConv2d],
    ):
        up, down = child.realize_as_lora()
        if as_fp16:
            up, down = up.to(torch.float16), down.to(torch.float16)
        extracted.append((up, down))

    if not extracted:
        raise ValueError("No lora injected.")

    return extracted
434
+
435
+
436
def save_lora_weight(
    model,
    path="./lora.pt",
    target_replace_module=DEFAULT_TARGET_REPLACE,
):
    """Serialize all injected LoRA weights to *path* as a flat
    [up0, down0, up1, down1, ...] list of CPU float16 tensors."""
    weights = []
    for up_mod, down_mod in extract_lora_ups_down(
        model, target_replace_module=target_replace_module
    ):
        weights.extend(
            [
                up_mod.weight.to("cpu").to(torch.float16),
                down_mod.weight.to("cpu").to(torch.float16),
            ]
        )

    torch.save(weights, path)
449
+
450
+
451
def save_lora_as_json(model, path="./lora.json"):
    """Dump all injected LoRA up/down weights to *path* as nested JSON lists,
    flat-ordered as [up0, down0, up1, down1, ...]."""
    weights = []
    for _up, _down in extract_lora_ups_down(model):
        weights.append(_up.weight.detach().cpu().numpy().tolist())
        weights.append(_down.weight.detach().cpu().numpy().tolist())

    # Fix: dropped the redundant function-local `import json`; the module
    # already imports json at the top of the file.
    with open(path, "w") as f:
        json.dump(weights, f)
461
+
462
+
463
def save_safeloras_with_embeds(
    modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
    embeds: Dict[str, torch.Tensor] = {},
    outpath="./lora.safetensors",
):
    """
    Saves the Lora from multiple modules in a single safetensor file.

    modelmap is a dictionary of {
        "module name": (module, target_replace_module)
    }
    Tensor keys are "<name>:<i>:up" / "<name>:<i>:down"; each pair's rank is
    stored in metadata as "<name>:<i>:rank", and each embed token is stored
    under its own key with metadata value EMBED_FLAG.
    """
    weights = {}
    metadata = {}

    for name, (model, target_replace_module) in modelmap.items():
        # The target set is recorded so loading can re-inject in the same places.
        metadata[name] = json.dumps(list(target_replace_module))

        for i, (_up, _down) in enumerate(
            extract_lora_as_tensor(model, target_replace_module)
        ):
            # down has shape (r, in_features), so dim 0 is the rank.
            rank = _down.shape[0]

            metadata[f"{name}:{i}:rank"] = str(rank)
            weights[f"{name}:{i}:up"] = _up
            weights[f"{name}:{i}:down"] = _down

    for token, tensor in embeds.items():
        metadata[token] = EMBED_FLAG
        weights[token] = tensor

    print(f"Saving weights to {outpath}")
    safe_save(weights, outpath, metadata)
495
+
496
+
497
def save_safeloras(
    modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
    outpath="./lora.safetensors",
):
    # Convenience wrapper: save Loras only, with no Textual Inversion embeds.
    return save_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
502
+
503
+
504
def convert_loras_to_safeloras_with_embeds(
    modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
    embeds: Dict[str, torch.Tensor] = {},
    outpath="./lora.safetensors",
):
    """
    Converts the Lora from multiple pytorch .pt files into a single safetensor file.

    modelmap is a dictionary of {
        "module name": (pytorch_model_path, target_replace_module, rank)
    }
    Each .pt file is expected to hold a flat [up0, down0, up1, down1, ...]
    list as written by save_lora_weight.
    """

    weights = {}
    metadata = {}

    for name, (path, target_replace_module, r) in modelmap.items():
        metadata[name] = json.dumps(list(target_replace_module))

        lora = torch.load(path)
        for i, weight in enumerate(lora):
            # Even positions are up weights, odd positions are down weights.
            is_up = i % 2 == 0
            i = i // 2  # pair index used in the tensor key

            if is_up:
                # The caller-supplied rank r applies uniformly to every pair.
                metadata[f"{name}:{i}:rank"] = str(r)
                weights[f"{name}:{i}:up"] = weight
            else:
                weights[f"{name}:{i}:down"] = weight

    for token, tensor in embeds.items():
        metadata[token] = EMBED_FLAG
        weights[token] = tensor

    print(f"Saving weights to {outpath}")
    safe_save(weights, outpath, metadata)
539
+
540
+
541
def convert_loras_to_safeloras(
    modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
    outpath="./lora.safetensors",
):
    # Convenience wrapper: convert .pt Loras only, with no embeds.
    convert_loras_to_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
546
+
547
+
548
def parse_safeloras(
    safeloras,
) -> Dict[str, Tuple[List[nn.parameter.Parameter], List[int], List[str]]]:
    """
    Converts a loaded safetensor file that contains a set of module Loras
    into Parameters and other information.

    Output is a dictionary of {
        "module name": (
            [list of weights],
            [list of ranks],
            target_replacement_modules
        )
    }
    Weight lists are flat-ordered [up0, down0, up1, down1, ...], matching
    what monkeypatch_or_replace_lora* consume.
    """
    loras = {}
    metadata = safeloras.metadata()

    # Keys look like "<module name>:<pair index>:<up|down>".
    get_name = lambda k: k.split(":")[0]

    keys = list(safeloras.keys())
    keys.sort(key=get_name)  # groupby requires keys sorted by group key

    for name, module_keys in groupby(keys, get_name):
        info = metadata.get(name)

        if not info:
            raise ValueError(
                f"Tensor {name} has no metadata - is this a Lora safetensor?"
            )

        # Skip Textual Inversion embeds
        if info == EMBED_FLAG:
            continue

        # Handle Loras
        # Extract the targets
        target = json.loads(info)

        # Build the result lists - Python needs us to preallocate lists to insert into them
        module_keys = list(module_keys)
        ranks = [4] * (len(module_keys) // 2)  # 4 is only a placeholder; real rank read below
        weights = [None] * len(module_keys)

        for key in module_keys:
            # Split the model name and index out of the key
            _, idx, direction = key.split(":")
            idx = int(idx)

            # Add the rank
            ranks[idx] = int(metadata[f"{name}:{idx}:rank"])

            # Insert the weight into the list: up at 2*idx, down at 2*idx + 1
            idx = idx * 2 + (1 if direction == "down" else 0)
            weights[idx] = nn.parameter.Parameter(safeloras.get_tensor(key))

        loras[name] = (weights, ranks, target)

    return loras
606
+
607
+
608
def parse_safeloras_embeds(
    safeloras,
) -> Dict[str, torch.Tensor]:
    """
    Converts a loaded safetensor file that contains Textual Inversion embeds
    into a dictionary of embed_token: Tensor. Keys whose metadata is not
    exactly EMBED_FLAG (including keys with no metadata) are ignored.
    """
    metadata = safeloras.metadata()
    return {
        key: safeloras.get_tensor(key)
        for key in safeloras.keys()
        if metadata.get(key) == EMBED_FLAG
    }
627
+
628
+
629
def load_safeloras(path, device="cpu"):
    # Open a Lora safetensors file and return parsed module Loras only.
    safeloras = safe_open(path, framework="pt", device=device)
    return parse_safeloras(safeloras)
632
+
633
+
634
def load_safeloras_embeds(path, device="cpu"):
    # Open a Lora safetensors file and return Textual Inversion embeds only.
    safeloras = safe_open(path, framework="pt", device=device)
    return parse_safeloras_embeds(safeloras)
637
+
638
+
639
def load_safeloras_both(path, device="cpu"):
    # Open once; return (module Loras, Textual Inversion embeds) together.
    safeloras = safe_open(path, framework="pt", device=device)
    return parse_safeloras(safeloras), parse_safeloras_embeds(safeloras)
642
+
643
+
644
def collapse_lora(model, alpha=1.0):
    """Fold every injected LoRA residual into its frozen base weight, in place.

    After this, base_weight += alpha * (up @ down); the Lora modules remain
    attached but effectively double-count unless alpha or their weights are
    reset afterwards. ``alpha`` scales the merged delta.
    NOTE(review): the stored per-module ``scale`` is NOT applied here — the
    raw lora_up.weight is used, so scale != 1 models merge differently than
    they ran. Confirm against the intended checkpoint format.
    """

    for _module, name, _child_module in _find_modules(
        model,
        UNET_EXTENDED_TARGET_REPLACE | TEXT_ENCODER_EXTENDED_TARGET_REPLACE,
        search_class=[LoraInjectedLinear, LoraInjectedConv2d],
    ):

        if isinstance(_child_module, LoraInjectedLinear):
            print("Collapsing Lin Lora in", name)

            _child_module.linear.weight = nn.Parameter(
                _child_module.linear.weight.data
                + alpha
                * (
                    _child_module.lora_up.weight.data
                    @ _child_module.lora_down.weight.data
                )
                .type(_child_module.linear.weight.dtype)
                .to(_child_module.linear.weight.device)
            )

        else:
            print("Collapsing Conv Lora in", name)
            # Flatten the 4-D conv kernels to 2-D, multiply as matrices, then
            # reshape the product back to the conv weight's shape.
            _child_module.conv.weight = nn.Parameter(
                _child_module.conv.weight.data
                + alpha
                * (
                    _child_module.lora_up.weight.data.flatten(start_dim=1)
                    @ _child_module.lora_down.weight.data.flatten(start_dim=1)
                )
                .reshape(_child_module.conv.weight.data.shape)
                .type(_child_module.conv.weight.dtype)
                .to(_child_module.conv.weight.device)
            )
680
+
681
def monkeypatch_or_replace_lora(
    model,
    loras,
    target_replace_module=DEFAULT_TARGET_REPLACE,
    r: Union[int, List[int]] = 4,
):
    """Install the given LoRA weights into the model's Linear layers, in place.

    Replaces each matching nn.Linear (or re-replaces an existing
    LoraInjectedLinear, preserving its frozen base weights) and loads weights
    from ``loras``, a flat [up0, down0, ...] list consumed in discovery order.
    ``r`` may be a single rank or a per-layer list (consumed in order too).
    """
    for _module, name, _child_module in _find_modules(
        model, target_replace_module, search_class=[nn.Linear, LoraInjectedLinear]
    ):
        # If already injected, re-wrap the inner frozen linear, not the wrapper.
        _source = (
            _child_module.linear
            if isinstance(_child_module, LoraInjectedLinear)
            else _child_module
        )

        weight = _source.weight
        bias = _source.bias
        _tmp = LoraInjectedLinear(
            _source.in_features,
            _source.out_features,
            _source.bias is not None,
            r=r.pop(0) if isinstance(r, list) else r,
        )
        _tmp.linear.weight = weight

        if bias is not None:
            _tmp.linear.bias = bias

        # switch the module
        _module._modules[name] = _tmp

        up_weight = loras.pop(0)
        down_weight = loras.pop(0)

        # Cast the loaded weights to the base layer's dtype before installing.
        _module._modules[name].lora_up.weight = nn.Parameter(
            up_weight.type(weight.dtype)
        )
        _module._modules[name].lora_down.weight = nn.Parameter(
            down_weight.type(weight.dtype)
        )

        _module._modules[name].to(weight.device)
724
+
725
def monkeypatch_or_replace_lora_extended(
    model,
    loras,
    target_replace_module=DEFAULT_TARGET_REPLACE,
    r: Union[int, List[int]] = 4,
):
    """Install the given LoRA weights into Linear AND Conv2d layers, in place.

    Like monkeypatch_or_replace_lora but also handles Conv2d /
    LoraInjectedConv2d. The next pending weight's ndim (2 vs 4) is used to
    decide whether the current layer kind should consume it; mismatching
    layers are skipped without consuming weights.
    """
    for _module, name, _child_module in _find_modules(
        model,
        target_replace_module,
        search_class=[nn.Linear, LoraInjectedLinear, nn.Conv2d, LoraInjectedConv2d],
    ):

        if (_child_module.__class__ == nn.Linear) or (
            _child_module.__class__ == LoraInjectedLinear
        ):
            # Linear weights are 2-D; skip if the next pending weight isn't.
            if len(loras[0].shape) != 2:
                continue

            _source = (
                _child_module.linear
                if isinstance(_child_module, LoraInjectedLinear)
                else _child_module
            )

            weight = _source.weight
            bias = _source.bias
            _tmp = LoraInjectedLinear(
                _source.in_features,
                _source.out_features,
                _source.bias is not None,
                r=r.pop(0) if isinstance(r, list) else r,
            )
            _tmp.linear.weight = weight

            if bias is not None:
                _tmp.linear.bias = bias

        elif (_child_module.__class__ == nn.Conv2d) or (
            _child_module.__class__ == LoraInjectedConv2d
        ):
            # Conv weights are 4-D; skip if the next pending weight isn't.
            if len(loras[0].shape) != 4:
                continue
            _source = (
                _child_module.conv
                if isinstance(_child_module, LoraInjectedConv2d)
                else _child_module
            )

            weight = _source.weight
            bias = _source.bias
            _tmp = LoraInjectedConv2d(
                _source.in_channels,
                _source.out_channels,
                _source.kernel_size,
                _source.stride,
                _source.padding,
                _source.dilation,
                _source.groups,
                _source.bias is not None,
                r=r.pop(0) if isinstance(r, list) else r,
            )

            _tmp.conv.weight = weight

            if bias is not None:
                _tmp.conv.bias = bias

        # switch the module
        _module._modules[name] = _tmp

        up_weight = loras.pop(0)
        down_weight = loras.pop(0)

        _module._modules[name].lora_up.weight = nn.Parameter(
            up_weight.type(weight.dtype)
        )
        _module._modules[name].lora_down.weight = nn.Parameter(
            down_weight.type(weight.dtype)
        )

        _module._modules[name].to(weight.device)
806
+
807
+
808
def monkeypatch_or_replace_safeloras(models, safeloras):
    """Apply every LoRA stored in a safetensors archive to its matching model.

    `models` is an object (e.g. a pipeline) whose attributes are named after
    the entries in the archive ("unet", "text_encoder", ...).
    """
    parsed = parse_safeloras(safeloras)

    for name, payload in parsed.items():
        lora_weights, ranks, target = payload
        target_model = getattr(models, name, None)

        if not target_model:
            print(f"No model provided for {name}, contained in Lora")
            continue

        monkeypatch_or_replace_lora_extended(target_model, lora_weights, target, ranks)
819
+
820
+
821
def monkeypatch_remove_lora(model):
    """Strip every LoRA wrapper from `model`, restoring plain layers.

    Each `LoraInjectedLinear`/`LoraInjectedConv2d` is replaced by a fresh
    `nn.Linear`/`nn.Conv2d` that reuses the wrapped layer's weight and bias
    Parameters (no copy) — the low-rank residual is discarded.
    """
    for _module, name, _child_module in _find_modules(
        model, search_class=[LoraInjectedLinear, LoraInjectedConv2d]
    ):
        if isinstance(_child_module, LoraInjectedLinear):
            inner = _child_module.linear
            replacement = nn.Linear(
                inner.in_features, inner.out_features, inner.bias is not None
            )
        else:
            inner = _child_module.conv
            replacement = nn.Conv2d(
                in_channels=inner.in_channels,
                out_channels=inner.out_channels,
                kernel_size=inner.kernel_size,
                stride=inner.stride,
                padding=inner.padding,
                dilation=inner.dilation,
                groups=inner.groups,
                bias=inner.bias is not None,
            )

        # Re-attach the original Parameters so the restored layer is identical.
        replacement.weight = inner.weight
        if inner.bias is not None:
            replacement.bias = inner.bias

        _module._modules[name] = replacement
857
+
858
+
859
def monkeypatch_add_lora(
    model,
    loras,
    target_replace_module=DEFAULT_TARGET_REPLACE,
    alpha: float = 1.0,
    beta: float = 1.0,
):
    """Blend pre-trained LoRA weights into already-injected linear layers.

    For each injected layer, the new weights are
    ``alpha * loaded + beta * existing`` for both `lora_up` and `lora_down`.
    `loras` is consumed pairwise (up, down) in `_find_modules` order.
    """
    for _module, name, _child_module in _find_modules(
        model, target_replace_module, search_class=[LoraInjectedLinear]
    ):
        base_weight = _child_module.linear.weight
        device = base_weight.device
        dtype = base_weight.dtype

        up_weight = loras.pop(0)
        down_weight = loras.pop(0)

        injected = _module._modules[name]
        injected.lora_up.weight = nn.Parameter(
            up_weight.type(dtype).to(device) * alpha
            + injected.lora_up.weight.to(device) * beta
        )
        injected.lora_down.weight = nn.Parameter(
            down_weight.type(dtype).to(device) * alpha
            + injected.lora_down.weight.to(device) * beta
        )

        injected.to(device)
884
+
885
+
886
def tune_lora_scale(model, alpha: float = 1.0):
    """Set the LoRA residual scale `alpha` on every injected layer in `model`."""
    lora_class_names = ("LoraInjectedLinear", "LoraInjectedConv2d")
    for submodule in model.modules():
        if type(submodule).__name__ in lora_class_names:
            submodule.scale = alpha
890
+
891
+
892
def set_lora_diag(model, diag: torch.Tensor):
    """Push a diagonal selector tensor into every injected LoRA layer of `model`."""
    lora_class_names = ("LoraInjectedLinear", "LoraInjectedConv2d")
    for submodule in model.modules():
        if type(submodule).__name__ in lora_class_names:
            submodule.set_selector_from_diag(diag)
896
+
897
+
898
+ def _text_lora_path(path: str) -> str:
899
+ assert path.endswith(".pt"), "Only .pt files are supported"
900
+ return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])
901
+
902
+
903
+ def _ti_lora_path(path: str) -> str:
904
+ assert path.endswith(".pt"), "Only .pt files are supported"
905
+ return ".".join(path.split(".")[:-1] + ["ti", "pt"])
906
+
907
+
908
def apply_learned_embed_in_clip(
    learned_embeds,
    text_encoder,
    tokenizer,
    token: Optional[Union[str, List[str]]] = None,
    idempotent=False,
):
    """Register learned textual-inversion embeddings as tokenizer tokens.

    learned_embeds: mapping of token string -> embedding tensor.
    token: one token, a list matching learned_embeds, or None to use the
        dict's own keys.
    idempotent: if True, an existing token's embedding is overwritten in
        place; if False, the token is renamed ("<tok>" -> "<tok-1>" -> ...)
        until the tokenizer accepts it as new.
    Returns the last token actually registered (possibly renamed).
    """
    if isinstance(token, str):
        trained_tokens = [token]
    elif isinstance(token, list):
        assert len(learned_embeds.keys()) == len(
            token
        ), "The number of tokens and the number of embeds should be the same"
        trained_tokens = token
    else:
        trained_tokens = list(learned_embeds.keys())

    for token in trained_tokens:
        print(token)
        embeds = learned_embeds[token]

        # cast to dtype of text_encoder
        # NOTE(review): `dtype` is computed but never applied to `embeds`
        # below — confirm whether the stored embeds already match.
        dtype = text_encoder.get_input_embeddings().weight.dtype
        num_added_tokens = tokenizer.add_tokens(token)

        i = 1
        if not idempotent:
            # add_tokens returns 0 when the token already exists; keep
            # renaming with an increasing suffix until it is accepted.
            while num_added_tokens == 0:
                print(f"The tokenizer already contains the token {token}.")
                token = f"{token[:-1]}-{i}>"
                print(f"Attempting to add the token {token}.")
                num_added_tokens = tokenizer.add_tokens(token)
                i += 1
        elif num_added_tokens == 0 and idempotent:
            print(f"The tokenizer already contains the token {token}.")
            print(f"Replacing {token} embedding.")

        # resize the token embeddings
        text_encoder.resize_token_embeddings(len(tokenizer))

        # get the id for the token and assign the embeds
        token_id = tokenizer.convert_tokens_to_ids(token)
        text_encoder.get_input_embeddings().weight.data[token_id] = embeds
    return token
952
+
953
+
954
def load_learned_embed_in_clip(
    learned_embeds_path,
    text_encoder,
    tokenizer,
    token: Optional[Union[str, List[str]]] = None,
    idempotent=False,
):
    """Load learned token embeddings from disk and install them into CLIP.

    Thin wrapper around ``apply_learned_embed_in_clip`` that first
    deserializes the embedding dict from `learned_embeds_path`.

    Returns the token actually registered (possibly renamed on collision).
    Fix: the original swallowed ``apply_learned_embed_in_clip``'s return
    value, so callers such as ``patch_pipe`` received ``None`` for `token`.
    """
    learned_embeds = torch.load(learned_embeds_path)
    return apply_learned_embed_in_clip(
        learned_embeds, text_encoder, tokenizer, token, idempotent
    )
965
+
966
+
967
def patch_pipe(
    pipe,
    maybe_unet_path,
    token: Optional[str] = None,
    r: int = 4,
    patch_unet=True,
    patch_text=True,
    patch_ti=True,
    idempotent_token=True,
    unet_target_replace_module=DEFAULT_TARGET_REPLACE,
    text_target_replace_module=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
):
    """Patch a diffusion pipeline in place with LoRA weights and learned tokens.

    `maybe_unet_path` may be a ``.pt`` file (any of the unet / ``.text_encoder.pt``
    / ``.ti.pt`` triplet — the sibling paths are derived), or a single
    ``.safetensors`` archive holding everything.

    Returns the token embedding dict only in the safetensors branch; the
    ``.pt`` branch returns None.
    """
    if maybe_unet_path.endswith(".pt"):
        # torch format

        # Normalize whichever member of the triplet was passed back to the
        # base unet path, then derive the sibling paths from it.
        if maybe_unet_path.endswith(".ti.pt"):
            unet_path = maybe_unet_path[:-6] + ".pt"
        elif maybe_unet_path.endswith(".text_encoder.pt"):
            unet_path = maybe_unet_path[:-16] + ".pt"
        else:
            unet_path = maybe_unet_path

        ti_path = _ti_lora_path(unet_path)
        text_path = _text_lora_path(unet_path)

        if patch_unet:
            print("LoRA : Patching Unet")
            monkeypatch_or_replace_lora(
                pipe.unet,
                torch.load(unet_path),
                r=r,
                target_replace_module=unet_target_replace_module,
            )

        if patch_text:
            print("LoRA : Patching text encoder")
            monkeypatch_or_replace_lora(
                pipe.text_encoder,
                torch.load(text_path),
                target_replace_module=text_target_replace_module,
                r=r,
            )
        if patch_ti:
            print("LoRA : Patching token input")
            token = load_learned_embed_in_clip(
                ti_path,
                pipe.text_encoder,
                pipe.tokenizer,
                token=token,
                idempotent=idempotent_token,
            )

    elif maybe_unet_path.endswith(".safetensors"):
        safeloras = safe_open(maybe_unet_path, framework="pt", device="cpu")
        monkeypatch_or_replace_safeloras(pipe, safeloras)
        tok_dict = parse_safeloras_embeds(safeloras)
        if patch_ti:
            apply_learned_embed_in_clip(
                tok_dict,
                pipe.text_encoder,
                pipe.tokenizer,
                token=token,
                idempotent=idempotent_token,
            )
        return tok_dict
1032
+
1033
+
1034
@torch.no_grad()
def inspect_lora(model):
    """Report the mean |ΔW| contributed by each injected LoRA layer.

    Returns a dict mapping module name -> [mean absolute entry of
    lora_up @ lora_down], i.e. how far each low-rank update has moved
    the effective weights.
    """
    moved = {}
    lora_class_names = ("LoraInjectedLinear", "LoraInjectedConv2d")

    for name, submodule in model.named_modules():
        if type(submodule).__name__ not in lora_class_names:
            continue
        up = submodule.lora_up.weight.data.clone()
        down = submodule.lora_down.weight.data.clone()

        # Effective weight delta of the low-rank factorization.
        delta: torch.Tensor = up.flatten(1) @ down.flatten(1)

        dist = delta.flatten().abs().mean().item()
        moved.setdefault(name, []).append(dist)

    return moved
1052
+
1053
+
1054
def save_all(
    unet,
    text_encoder,
    save_path,
    placeholder_token_ids=None,
    placeholder_tokens=None,
    save_lora=True,
    save_ti=True,
    target_replace_module_text=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
    target_replace_module_unet=DEFAULT_TARGET_REPLACE,
    safe_form=True,
):
    """Persist LoRA weights and learned token embeddings.

    With ``safe_form=True`` (default) everything goes into one
    ``.safetensors`` archive at `save_path`; otherwise separate ``.pt``
    files are written (`save_path`, plus derived ``.text_encoder.pt`` /
    ``.ti.pt`` siblings).
    """
    if not safe_form:
        # save ti
        if save_ti:
            ti_path = _ti_lora_path(save_path)
            learned_embeds_dict = {}
            for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
                learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
                print(
                    f"Current Learned Embeddings for {tok}:, id {tok_id} ",
                    learned_embeds[:4],
                )
                learned_embeds_dict[tok] = learned_embeds.detach().cpu()

            torch.save(learned_embeds_dict, ti_path)
            print("Ti saved to ", ti_path)

        # save text encoder
        if save_lora:

            save_lora_weight(
                unet, save_path, target_replace_module=target_replace_module_unet
            )
            print("Unet saved to ", save_path)

            save_lora_weight(
                text_encoder,
                _text_lora_path(save_path),
                target_replace_module=target_replace_module_text,
            )
            print("Text Encoder saved to ", _text_lora_path(save_path))

    else:
        assert save_path.endswith(
            ".safetensors"
        ), f"Save path : {save_path} should end with .safetensors"

        loras = {}
        embeds = {}

        if save_lora:
            # Models (not weights) are passed; extraction happens inside
            # save_safeloras_with_embeds.
            loras["unet"] = (unet, target_replace_module_unet)
            loras["text_encoder"] = (text_encoder, target_replace_module_text)

        if save_ti:
            for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
                learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
                print(
                    f"Current Learned Embeddings for {tok}:, id {tok_id} ",
                    learned_embeds[:4],
                )
                embeds[tok] = learned_embeds.detach().cpu()

        save_safeloras_with_embeds(loras, embeds, save_path)
generation/control/oldm/lora_ldm.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import einops
2
+ import torch
3
+ import torch as th
4
+ import torch.nn as nn
5
+
6
+ import os
7
+ import sys
8
+
9
+ from ldm.modules.diffusionmodules.util import (
10
+ conv_nd,
11
+ linear,
12
+ zero_module,
13
+ timestep_embedding,
14
+ )
15
+
16
+ from einops import rearrange, repeat
17
+ from torchvision.utils import make_grid
18
+ from ldm.modules.attention import SpatialTransformer
19
+ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Upsample, Downsample, AttentionBlock, normalization
20
+ from ldm.models.diffusion.ddpm import LatentDiffusion
21
+ from ldm.util import log_txt_as_img, exists, instantiate_from_config
22
+ from ldm.models.diffusion.ddim import DDIMSampler
23
+
24
+ from cldm.lora import inject_trainable_lora, extract_lora_ups_down, inject_trainable_lora_extended
25
+
26
def count_parameters(params):
    """Return the total element count of *params* in millions, 1 decimal place."""
    total = 0
    for p in params:
        total += p.numel()
    return round(total / 1e6, 1)
29
+
30
def set_requires_grad(model, requires_grad=True):
    """Toggle gradient tracking for every parameter of *model*."""
    for p in model.parameters():
        p.requires_grad_(requires_grad)
33
+
34
class ControlledUnetModel(UNetModel):
    """UNet whose first input block output receives an additive control residual."""

    def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
        hs = []
        # Sinusoidal timestep features -> learned time embedding.
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        h = x.type(self.dtype)
        for module in self.input_blocks:
            if control is not None:
                h = module(h, emb, context)
                # Inject the control signal once, after the first input block,
                # then clear it so the remaining blocks run unmodified.
                h += control
                control = None
            else:
                h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        for module in self.output_blocks:
            # U-Net skip connection: concatenate the matching encoder feature.
            h = th.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)
        h = h.type(x.dtype)

        # NOTE(review): `only_mid_control` is accepted for API parity but
        # unused in this variant — confirm against the original ControlNet.
        return self.out(h)
56
+
57
+
58
class ControlNet(nn.Module):
    """Hint encoder producing an additive control residual for the UNet.

    Unlike the full ControlNet, this variant only runs the hint image through
    ``input_hint_block``; the many UNet-style constructor arguments are
    validated and stored, but no encoder copy is built here.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
    ):
        super().__init__()
        # Spatial transformer and cross-attention context must be enabled together.
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        # num_res_blocks: a single int applies to every level; otherwise a
        # per-level list matching channel_mult.
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        # Conv stack that downsamples the hint image 8x (three stride-2 convs)
        # to the UNet's working resolution. The final conv is zero-initialised
        # so training starts with a no-op control signal.
        self.input_hint_block = TimestepEmbedSequential(
            conv_nd(dims, hint_channels, 16, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 16, 16, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 16, 32, 3, padding=1, stride=2),
            nn.SiLU(),
            conv_nd(dims, 32, 32, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 32, 96, 3, padding=1, stride=2),
            nn.SiLU(),
            conv_nd(dims, 96, 96, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 96, 256, 3, padding=1, stride=2),
            nn.SiLU(),
            zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
        )

    def forward(self, x, hint, timesteps, context, **kwargs):
        """Encode `hint` into a control tensor.

        NOTE(review): `x` is unused here — kept for interface parity with the
        full ControlNet; confirm callers rely only on the hint encoding.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        guided_hint = self.input_hint_block(hint, emb, context)

        return guided_hint
178
+
179
+
180
class ControlLDM(LatentDiffusion):
    """LatentDiffusion whose epsilon prediction is conditioned on a control hint."""

    def __init__(self, control_stage_config, control_key, only_mid_control, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Hint encoder (ControlNet above), built from its own config section.
        self.control_model = instantiate_from_config(control_stage_config)
        self.control_key = control_key
        self.only_mid_control = only_mid_control
        # Per-level control multipliers (12 encoder outputs + middle block).
        # Currently unused — see the commented line in apply_model.
        self.control_scales = [1.0] * 13

    @torch.no_grad()
    def get_input(self, batch, k, bs=None, *args, **kwargs):
        """Return latents plus a conditioning dict with text and control image.

        NOTE(review): `k` is ignored; `self.first_stage_key` is used instead.
        """
        x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
        control = batch[self.control_key]
        if bs is not None:
            control = control[:bs]
        control = control.to(self.device)
        # Dataset layout HWC -> network layout CHW.
        control = einops.rearrange(control, 'b h w c -> b c h w')
        control = control.to(memory_format=torch.contiguous_format).float()
        return x, dict(c_crossattn=[c], c_concat=[control])

    def apply_model(self, x_noisy, t, cond, *args, **kwargs):
        """Predict noise, injecting the encoded hint when c_concat is present."""
        assert isinstance(cond, dict)
        diffusion_model = self.model.diffusion_model

        cond_txt = torch.cat(cond['c_crossattn'], 1)

        if cond['c_concat'] is None:
            eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control)
        else:
            control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt)
            # control = [c * scale for c, scale in zip(control, self.control_scales)]
            eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)

        return eps

    @torch.no_grad()
    def get_unconditional_conditioning(self, N):
        # Empty-prompt embedding used for classifier-free guidance.
        return self.get_learned_conditioning([""] * N)

    @torch.no_grad()
    def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
                   plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
                   use_ema_scope=True, num_samples=1,
                   **kwargs):
        """Build a dict of visualization tensors for logging.

        NOTE(review): requires kwargs['split'] ('train' or otherwise) —
        raises KeyError if the caller omits it; confirm logger always passes it.
        """
        use_ddim = ddim_steps is not None

        log = dict()
        z, c = self.get_input(batch, self.first_stage_key, bs=N)
        c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
        N = min(z.shape[0], N)
        n_row = min(z.shape[0], n_row)
        log["reconstruction"] = self.decode_first_stage(z)
        # Map control image from [0, 1] to [-1, 1] for display.
        log["control"] = c_cat * 2.0 - 1.0
        log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)

        if plot_diffusion_rows:
            # get diffusion row: progressively noisier versions of the input.
            diffusion_row = list()
            z_start = z[:n_row]
            for t in range(self.num_timesteps):
                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # get denoise row: unguided samples conditioned on text + control.
            samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                     batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid

        if kwargs['split'] == 'train':
            if unconditional_guidance_scale > 1.0:
                uc_cross = self.get_unconditional_conditioning(N)
                # Control stays ON in the unconditional branch; only the text
                # prompt is dropped for classifier-free guidance.
                uc_cat = c_cat  # torch.zeros_like(c_cat)

                uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
                samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                 batch_size=N, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc_full,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        else:
            if unconditional_guidance_scale > 1.0:
                # uc_cross = self.get_unconditional_conditioning(N)
                # uc_cat = c_cat # torch.zeros_like(c_cat)

                # Eval: replicate the FIRST control image / prompt num_samples
                # times to draw several samples for one conditioning.
                c_cat = torch.stack([c_cat[0] for _ in range(num_samples)], dim=0).clone()

                cond = {"c_concat": [c_cat], "c_crossattn": [self.get_learned_conditioning([batch['txt'][0]] * num_samples)]}
                uc_full = {"c_concat": [c_cat], "c_crossattn": [self.get_learned_conditioning([''] * num_samples)]}

                samples_cfg, _ = self.sample_log(cond=cond,
                                                 batch_size=num_samples, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc_full,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        return log

    @torch.no_grad()
    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
        """Run DDIM sampling at 1/8 of the control image's spatial resolution."""
        ddim_sampler = DDIMSampler(self)
        b, c, h, w = cond["c_concat"][0].shape
        shape = (self.channels, h // 8, w // 8)
        samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
        return samples, intermediates

    def configure_optimizers(self):
        """AdamW over the control model plus any trainable diffusion params.

        NOTE(review): `self.sd_locked` is not set in this class — presumably
        assigned externally (training script) before optimizers are built.
        Also note set_requires_grad(..., True) runs AFTER the param list is
        collected, so the requires_grad filter sees the pre-existing flags.
        """
        lr = self.learning_rate
        params = list(self.control_model.parameters())
        names = []
        for name, param in self.model.diffusion_model.named_parameters():
            if param.requires_grad:
                params.append(param)
                names.append(name)

        # params += self.unet_lora_params
        if not self.sd_locked:
            params += list(self.model.diffusion_model.output_blocks.parameters())
            params += list(self.model.diffusion_model.out.parameters())
        opt = torch.optim.AdamW(params, lr=lr)

        set_requires_grad(self.model.diffusion_model, True)

        num_params = count_parameters(params)
        print()
        print()
        print(f"Total number of trainable parameters: {num_params} M!")
        print()
        print()
        return opt

    def low_vram_shift(self, is_diffusing):
        """Swap model halves between GPU and CPU depending on the phase."""
        if is_diffusing:
            self.model = self.model.cuda()
            self.control_model = self.control_model.cuda()
            self.first_stage_model = self.first_stage_model.cpu()
            self.cond_stage_model = self.cond_stage_model.cpu()
        else:
            self.model = self.model.cpu()
            self.control_model = self.control_model.cpu()
            self.first_stage_model = self.first_stage_model.cuda()
            self.cond_stage_model = self.cond_stage_model.cuda()
generation/control/oldm/model.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+
4
+ from omegaconf import OmegaConf
5
+ from ldm.util import instantiate_from_config
6
+
7
+
8
def get_state_dict(d):
    """Unwrap a checkpoint dict: return d['state_dict'] when present, else d."""
    if 'state_dict' in d:
        return d['state_dict']
    return d
10
+
11
+
12
def load_state_dict(ckpt_path, location='cpu'):
    """Load a checkpoint (.safetensors or torch pickle) as a flat state dict."""
    extension = os.path.splitext(ckpt_path)[1]
    if extension.lower() == ".safetensors":
        import safetensors.torch  # optional dependency, imported lazily
        state_dict = safetensors.torch.load_file(ckpt_path, device=location)
    else:
        loaded = torch.load(ckpt_path, map_location=torch.device(location))
        state_dict = get_state_dict(loaded)
    # Unwrap once more in case of a nested 'state_dict' entry.
    state_dict = get_state_dict(state_dict)
    print(f'Loaded state_dict from [{ckpt_path}]')
    return state_dict
22
+
23
+
24
def create_model(config_path):
    """Instantiate the model described by an OmegaConf YAML config, on CPU."""
    config = OmegaConf.load(config_path)
    # config.model holds the target class path and its constructor kwargs.
    model = instantiate_from_config(config.model).cpu()
    print(f'Loaded model config from [{config_path}]')
    return model
generation/control/oldm/oft_ldm.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import einops
2
+ import torch
3
+ import torch as th
4
+ import torch.nn as nn
5
+
6
+ import os
7
+ import sys
8
+
9
+ from ldm.modules.diffusionmodules.util import (
10
+ conv_nd,
11
+ linear,
12
+ zero_module,
13
+ timestep_embedding,
14
+ )
15
+
16
+ from einops import rearrange, repeat
17
+ from torchvision.utils import make_grid
18
+ from ldm.modules.attention import SpatialTransformer
19
+ from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Upsample, Downsample, AttentionBlock, normalization
20
+ from ldm.models.diffusion.ddpm import LatentDiffusion
21
+ from ldm.util import log_txt_as_img, exists, instantiate_from_config
22
+ from ldm.models.diffusion.ddim import DDIMSampler
23
+
24
+
25
def count_parameters(params):
    """Count trainable parameters in millions (1 decimal place).

    A tensor shaped (N, D, D) is treated as a batch of N skew-symmetric
    D x D matrices (the OFT parameterization), contributing only the
    D*(D-1)/2 free entries each; everything else counts all elements.
    """
    total = 0
    for p in params:
        if p.dim() == 3 and p.shape[1] == p.shape[2]:
            n_blocks, dim = p.shape[0], p.shape[1]
            total += n_blocks * dim * (dim - 1) // 2
        else:
            total += p.numel()
    return round(total / 1e6, 1)
36
+
37
def set_requires_grad(model, requires_grad=True):
    """Enable or disable gradient tracking on all parameters of *model*."""
    for parameter in model.parameters():
        parameter.requires_grad_(requires_grad)
40
+
41
class ControlledUnetModel(UNetModel):
    """UNet whose first input block output receives an additive control residual."""

    def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs):
        hs = []
        # Sinusoidal timestep features -> learned time embedding.
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        h = x.type(self.dtype)
        for module in self.input_blocks:
            if control is not None:
                h = module(h, emb, context)
                # Inject the control signal once, after the first input block,
                # then clear it so the remaining blocks run unmodified.
                h += control
                control = None
            else:
                h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        for module in self.output_blocks:
            # U-Net skip connection: concatenate the matching encoder feature.
            h = th.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)
        h = h.type(x.dtype)

        # NOTE(review): `only_mid_control` is accepted for API parity but
        # unused in this variant — confirm against the original ControlNet.
        return self.out(h)
63
+
64
+
65
class ControlNet(nn.Module):
    """Hint encoder producing an additive control residual for the UNet (OFT variant).

    Only the hint image is processed (through ``input_hint_block``); the
    UNet-style constructor arguments are validated and stored, but no
    encoder copy is built here.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        hint_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
    ):
        super().__init__()
        # Spatial transformer and cross-attention context must be enabled together.
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if type(context_dim) == ListConfig:
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.dims = dims
        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        # num_res_blocks: a single int applies to every level; otherwise a
        # per-level list matching channel_mult.
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks))))
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        # Conv stack that downsamples the hint image 8x (three stride-2 convs)
        # to the UNet's working resolution. The final conv is zero-initialised
        # so training starts with a no-op control signal.
        self.input_hint_block = TimestepEmbedSequential(
            conv_nd(dims, hint_channels, 16, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 16, 16, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 16, 32, 3, padding=1, stride=2),
            nn.SiLU(),
            conv_nd(dims, 32, 32, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 32, 96, 3, padding=1, stride=2),
            nn.SiLU(),
            conv_nd(dims, 96, 96, 3, padding=1),
            nn.SiLU(),
            conv_nd(dims, 96, 256, 3, padding=1, stride=2),
            nn.SiLU(),
            zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
        )

    def forward(self, x, hint, timesteps, context, **kwargs):
        """Encode `hint` into a control tensor.

        NOTE(review): `x` is unused here — kept for interface parity with the
        full ControlNet; confirm callers rely only on the hint encoding.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        guided_hint = self.input_hint_block(hint, emb, context)

        return guided_hint
185
+
186
+
187
class ControlLDM(LatentDiffusion):
    """Latent diffusion model conditioned on an extra spatial control image.

    A separate ``control_model`` (instantiated from ``control_stage_config``)
    encodes the control image stored under ``batch[control_key]``; the encoded
    features are fed to the UNet through its ``control`` argument in
    :meth:`apply_model`.
    """

    def __init__(self, control_stage_config, control_key, only_mid_control, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.control_model = instantiate_from_config(control_stage_config)
        self.control_key = control_key
        self.only_mid_control = only_mid_control
        # One scale per control tensor (12 UNet levels + middle block); all 1.0
        # here, and the per-scale multiply in apply_model is commented out.
        self.control_scales = [1.0] * 13

    @torch.no_grad()
    def get_input(self, batch, k, bs=None, *args, **kwargs):
        """Return latents plus a cond dict with text and control conditioning.

        NOTE(review): the ``k`` argument is ignored — the parent is always
        queried with ``self.first_stage_key``; confirm this is intentional.
        """
        x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs)
        control = batch[self.control_key]
        if bs is not None:
            control = control[:bs]
        control = control.to(self.device)
        # Image layout HWC -> torch layout CHW.
        control = einops.rearrange(control, 'b h w c -> b c h w')
        control = control.to(memory_format=torch.contiguous_format).float()
        return x, dict(c_crossattn=[c], c_concat=[control])

    def apply_model(self, x_noisy, t, cond, *args, **kwargs):
        """Predict noise for ``x_noisy``; inject control features when present."""
        assert isinstance(cond, dict)
        diffusion_model = self.model.diffusion_model

        cond_txt = torch.cat(cond['c_crossattn'], 1)

        if cond['c_concat'] is None:
            # No control image: run the plain (uncontrolled) UNet path.
            eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=None, only_mid_control=self.only_mid_control)
        else:
            control = self.control_model(x=x_noisy, hint=torch.cat(cond['c_concat'], 1), timesteps=t, context=cond_txt)
            # control = [c * scale for c, scale in zip(control, self.control_scales)]
            eps = diffusion_model(x=x_noisy, timesteps=t, context=cond_txt, control=control, only_mid_control=self.only_mid_control)

        return eps

    @torch.no_grad()
    def get_unconditional_conditioning(self, N):
        # Empty-prompt embeddings used for classifier-free guidance.
        return self.get_learned_conditioning([""] * N)

    @torch.no_grad()
    def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None,
                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
                   plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None,
                   use_ema_scope=True, num_samples=1,
                   **kwargs):
        """Build a dict of visualisation tensors (reconstruction, control,
        diffusion/denoise rows, CFG samples) for the logger.

        NOTE(review): reads ``kwargs['split']`` unconditionally — a missing key
        raises KeyError; confirm all callers pass ``split``.
        """
        use_ddim = ddim_steps is not None

        log = dict()
        z, c = self.get_input(batch, self.first_stage_key, bs=N)
        c_cat, c = c["c_concat"][0][:N], c["c_crossattn"][0][:N]
        N = min(z.shape[0], N)
        n_row = min(z.shape[0], n_row)
        log["reconstruction"] = self.decode_first_stage(z)
        # Control image rescaled from [0, 1] to [-1, 1] for display.
        log["control"] = c_cat * 2.0 - 1.0
        log["conditioning"] = log_txt_as_img((512, 512), batch[self.cond_stage_key], size=16)

        if plot_diffusion_rows:
            # Forward-process row: progressively noisier versions of z_start.
            diffusion_row = list()
            z_start = z[:n_row]
            for t in range(self.num_timesteps):
                if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
                    t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # Unguided samples plus (optionally) the denoising trajectory.
            samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                     batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid

        if kwargs['split'] == 'train':
            if unconditional_guidance_scale > 1.0:
                # CFG with the empty prompt; the control image is kept for the
                # unconditional branch as well (zeroing it is commented out).
                uc_cross = self.get_unconditional_conditioning(N)
                uc_cat = c_cat  # torch.zeros_like(c_cat)

                uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
                samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                 batch_size=N, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc_full,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        else:
            if unconditional_guidance_scale > 1.0:
                # uc_cross = self.get_unconditional_conditioning(N)
                # uc_cat = c_cat  # torch.zeros_like(c_cat)

                # Non-train split: replicate the first control image
                # num_samples times and sample the first caption vs. the
                # empty prompt.
                c_cat = torch.stack([c_cat[0] for _ in range(num_samples)], dim=0).clone()

                cond = {"c_concat": [c_cat], "c_crossattn": [self.get_learned_conditioning([batch['txt'][0]] * num_samples)]}
                uc_full = {"c_concat": [c_cat], "c_crossattn": [self.get_learned_conditioning([''] * num_samples)]}

                samples_cfg, _ = self.sample_log(cond=cond,  # cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                 batch_size=num_samples, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc_full,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        return log

    @torch.no_grad()
    def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
        """Run DDIM sampling; the latent shape is the control's spatial size / 8."""
        ddim_sampler = DDIMSampler(self)
        b, c, h, w = cond["c_concat"][0].shape
        shape = (self.channels, h // 8, w // 8)
        samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, **kwargs)
        return samples, intermediates

    def configure_optimizers(self):
        """Create AdamW over the control model plus any trainable UNet params.

        NOTE(review): ``self.sd_locked`` is never assigned in ``__init__`` —
        presumably set externally before training starts; confirm, otherwise
        this raises AttributeError.
        """
        lr = self.learning_rate
        params = list(self.control_model.parameters())

        # Also pick up UNet parameters already marked trainable (e.g. adapters).
        names = []
        for name, param in self.model.diffusion_model.named_parameters():
            if param.requires_grad:
                params.append(param)
                names.append(name)
                # print(name, param.shape)

        # params += self.unet_lora_params
        if not self.sd_locked:
            # Unlock the decoder half of the UNet as well.
            params += list(self.model.diffusion_model.output_blocks.parameters())
            params += list(self.model.diffusion_model.out.parameters())
        opt = torch.optim.AdamW(params, lr=lr)

        # NOTE(review): this flips the whole UNet to requires_grad=True *after*
        # the optimizer was built from the previously-trainable subset.
        set_requires_grad(self.model.diffusion_model, True)

        num_params = count_parameters(params)
        print()
        print()
        print(f"Total number of trainable parameters: {num_params} M!")
        print()
        print()

        return opt

    def low_vram_shift(self, is_diffusing):
        """Swap model halves between GPU and CPU to fit in limited VRAM."""
        if is_diffusing:
            # Diffusion step: UNet + control net on GPU, autoencoder/text off.
            self.model = self.model.cuda()
            self.control_model = self.control_model.cuda()
            self.first_stage_model = self.first_stage_model.cpu()
            self.cond_stage_model = self.cond_stage_model.cpu()
        else:
            self.model = self.model.cpu()
            self.control_model = self.control_model.cpu()
            self.first_stage_model = self.first_stage_model.cuda()
            self.cond_stage_model = self.cond_stage_model.cuda()
generation/subject/download_dreambooth.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
#!/bin/bash

# Fetch the official DreamBooth evaluation dataset (per-subject reference
# images) from Google's repository into ./dreambooth.
echo -e "\nDownloading dreambooth dataset..."
git clone https://github.com/google/dreambooth.git
generation/subject/evaluate.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import argparse
17
+ import hashlib
18
+ import logging
19
+ import math
20
+ import os
21
+ import warnings
22
+ from pathlib import Path
23
+
24
+ from functools import reduce
25
+ import numpy as np
26
+ import torch
27
+ import torch.nn.functional as F
28
+ import torch.utils.checkpoint
29
+ import transformers
30
+ from packaging import version
31
+ from PIL import Image
32
+ from torch.utils.data import Dataset, DataLoader
33
+ from torchvision import transforms
34
+ from tqdm.auto import tqdm
35
+ from transformers import AutoTokenizer, PretrainedConfig, ViTFeatureExtractor, ViTModel
36
+
37
+ import lpips
38
+ import json
39
+ from PIL import Image
40
+ import requests
41
+ from transformers import AutoProcessor, AutoTokenizer, CLIPModel
42
+ import torchvision.transforms.functional as TF
43
+ from torch.nn.functional import cosine_similarity
44
+ from torchvision.transforms import Compose, ToTensor, Normalize, Resize, ToPILImage
45
+ import re
46
+
47
def get_prompt(subject_name, prompt_idx):
    """Return the DreamBooth evaluation prompt for a subject and prompt index.

    Args:
        subject_name: one of the 30 DreamBooth subject folder names.
        prompt_idx: index (int or numeric string) into the 25 prompt templates.

    Returns:
        The prompt string with the subject's class token substituted in
        ("qwe" is the rare identifier token used during fine-tuning).
    """
    subject_names = [
        "backpack", "backpack_dog", "bear_plushie", "berry_bowl", "can",
        "candle", "cat", "cat2", "clock", "colorful_sneaker",
        "dog", "dog2", "dog3", "dog5", "dog6",
        "dog7", "dog8", "duck_toy", "fancy_boot", "grey_sloth_plushie",
        "monster_toy", "pink_sunglasses", "poop_emoji", "rc_car", "red_cartoon",
        "robot_toy", "shiny_sneaker", "teapot", "vase", "wolf_plushie",
    ]

    class_tokens = [
        "backpack", "backpack", "stuffed animal", "bowl", "can",
        "candle", "cat", "cat", "clock", "sneaker",
        "dog", "dog", "dog", "dog", "dog",
        "dog", "dog", "toy", "boot", "stuffed animal",
        "toy", "glasses", "toy", "toy", "cartoon",
        "toy", "sneaker", "teapot", "vase", "stuffed animal",
    ]

    # Map each subject to its coarse class token.
    class_token = dict(zip(subject_names, class_tokens))[subject_name]

    # The 25 official DreamBooth evaluation prompt templates.
    templates = [
        "a qwe {0} in the jungle",
        "a qwe {0} in the snow",
        "a qwe {0} on the beach",
        "a qwe {0} on a cobblestone street",
        "a qwe {0} on top of pink fabric",
        "a qwe {0} on top of a wooden floor",
        "a qwe {0} with a city in the background",
        "a qwe {0} with a mountain in the background",
        "a qwe {0} with a blue house in the background",
        "a qwe {0} on top of a purple rug in a forest",
        "a qwe {0} wearing a red hat",
        "a qwe {0} wearing a santa hat",
        "a qwe {0} wearing a rainbow scarf",
        "a qwe {0} wearing a black top hat and a monocle",
        "a qwe {0} in a chef outfit",
        "a qwe {0} in a firefighter outfit",
        "a qwe {0} in a police outfit",
        "a qwe {0} wearing pink glasses",
        "a qwe {0} wearing a yellow shirt",
        "a qwe {0} in a purple wizard outfit",
        "a red qwe {0}",
        "a purple qwe {0}",
        "a shiny qwe {0}",
        "a wet qwe {0}",
        "a cube shaped qwe {0}",
    ]

    return templates[int(prompt_idx)].format(class_token)
98
+
99
+
100
+
101
class PromptDatasetCLIP(Dataset):
    """Generated images of one subject/epoch paired with their text prompt.

    Used for the CLIP-T metric: every .png in ``data_dir_B/<epoch>`` is paired
    with the single evaluation prompt derived from ``subject_name``
    ('<subject>-<prompt_idx>').
    """

    def __init__(self, subject_name, data_dir_B, tokenizer, processor, epoch=None):
        self.data_dir_B = data_dir_B

        subject_name, prompt_idx = subject_name.split('-')

        epoch_dir = os.path.join(self.data_dir_B, str(epoch))
        self.image_lst = [
            os.path.join(epoch_dir, fname)
            for fname in os.listdir(epoch_dir)
            if fname.endswith(".png")
        ]
        # Every image in the epoch folder shares the same prompt.
        self.prompt_lst = [get_prompt(subject_name, prompt_idx)] * len(self.image_lst)

        self.tokenizer = tokenizer
        self.processor = processor
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def __len__(self):
        return len(self.image_lst)

    def __getitem__(self, idx):
        image = Image.open(self.image_lst[idx])
        prompt = self.prompt_lst[idx]

        # An all-zero (black) image marks a failed generation; callers must
        # skip the (None, None) sentinel.
        if all(lo == hi == 0 for lo, hi in image.getextrema()):
            return None, None

        prompt_inputs = self.tokenizer([prompt], padding=True, return_tensors="pt")
        image_inputs = self.processor(images=image, return_tensors="pt")
        return image_inputs, prompt_inputs
131
+
132
+
133
class PairwiseImageDatasetCLIP(Dataset):
    """Cartesian product of (reference, generated) images, CLIP-preprocessed.

    Reference images come from ``data_dir_A/<subject>`` (.jpg); generated
    images from ``data_dir_B/<epoch>`` (.png). Used for the CLIP-I metric.
    """

    def __init__(self, subject_name, data_dir_A, data_dir_B, processor, epoch):
        self.data_dir_A = data_dir_A
        self.data_dir_B = data_dir_B

        subject_name, prompt_idx = subject_name.split('-')

        self.data_dir_A = os.path.join(self.data_dir_A, subject_name)
        self.image_files_A = [
            os.path.join(self.data_dir_A, fname)
            for fname in os.listdir(self.data_dir_A)
            if fname.endswith(".jpg")
        ]

        epoch_dir = os.path.join(self.data_dir_B, str(epoch))
        self.image_files_B = [
            os.path.join(epoch_dir, fname)
            for fname in os.listdir(epoch_dir)
            if fname.endswith(".png")
        ]

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = processor

    @staticmethod
    def _is_blank(image):
        # An all-zero (black) image signals a failed generation.
        return all(lo == hi == 0 for lo, hi in image.getextrema())

    def __len__(self):
        return len(self.image_files_A) * len(self.image_files_B)

    def __getitem__(self, index):
        # Flattened index over the full A x B cross product.
        index_A, index_B = divmod(index, len(self.image_files_B))

        image_A = Image.open(self.image_files_A[index_A])  # .convert("RGB")
        image_B = Image.open(self.image_files_B[index_B])  # .convert("RGB")

        if self._is_blank(image_A) or self._is_blank(image_B):
            return None, None

        inputs_A = self.processor(images=image_A, return_tensors="pt")
        inputs_B = self.processor(images=image_B, return_tensors="pt")
        return inputs_A, inputs_B
168
+
169
+
170
class PairwiseImageDatasetDINO(Dataset):
    """Cartesian product of (reference, generated) images for the DINO metric.

    Identical pairing logic to the CLIP variant, but images are prepared with
    a ViT feature extractor instead of the CLIP processor.
    """

    def __init__(self, subject_name, data_dir_A, data_dir_B, feature_extractor, epoch):
        self.data_dir_A = data_dir_A
        self.data_dir_B = data_dir_B

        subject_name, prompt_idx = subject_name.split('-')

        self.data_dir_A = os.path.join(self.data_dir_A, subject_name)
        self.image_files_A = [
            os.path.join(self.data_dir_A, fname)
            for fname in os.listdir(self.data_dir_A)
            if fname.endswith(".jpg")
        ]

        epoch_dir = os.path.join(self.data_dir_B, str(epoch))
        self.image_files_B = [
            os.path.join(epoch_dir, fname)
            for fname in os.listdir(epoch_dir)
            if fname.endswith(".png")
        ]

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.feature_extractor = feature_extractor

    @staticmethod
    def _is_blank(image):
        # An all-zero (black) image signals a failed generation.
        return all(lo == hi == 0 for lo, hi in image.getextrema())

    def __len__(self):
        return len(self.image_files_A) * len(self.image_files_B)

    def __getitem__(self, index):
        # Flattened index over the full A x B cross product.
        index_A, index_B = divmod(index, len(self.image_files_B))

        image_A = Image.open(self.image_files_A[index_A])  # .convert("RGB")
        image_B = Image.open(self.image_files_B[index_B])  # .convert("RGB")

        if self._is_blank(image_A) or self._is_blank(image_B):
            return None, None

        inputs_A = self.feature_extractor(images=image_A, return_tensors="pt")
        inputs_B = self.feature_extractor(images=image_B, return_tensors="pt")
        return inputs_A, inputs_B
205
+
206
class PairwiseImageDatasetLPIPS(Dataset):
    """Cartesian product of (reference, generated) images as LPIPS tensors.

    Both images are resized to 512x512 and normalised to [-1, 1], which is
    the input range the LPIPS network expects.
    """

    def __init__(self, subject_name, data_dir_A, data_dir_B, epoch):
        self.data_dir_A = data_dir_A
        self.data_dir_B = data_dir_B

        subject_name, prompt_idx = subject_name.split('-')

        self.data_dir_A = os.path.join(self.data_dir_A, subject_name)
        self.image_files_A = [
            os.path.join(self.data_dir_A, fname)
            for fname in os.listdir(self.data_dir_A)
            if fname.endswith(".jpg")
        ]

        epoch_dir = os.path.join(self.data_dir_B, str(epoch))
        self.image_files_B = [
            os.path.join(epoch_dir, fname)
            for fname in os.listdir(epoch_dir)
            if fname.endswith(".png")
        ]

        # Resize + scale to [-1, 1] as required by LPIPS.
        self.transform = Compose([
            Resize((512, 512)),
            ToTensor(),
            Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def _is_blank(image):
        # An all-zero (black) image signals a failed generation.
        return all(lo == hi == 0 for lo, hi in image.getextrema())

    def __len__(self):
        return len(self.image_files_A) * len(self.image_files_B)

    def __getitem__(self, index):
        # Flattened index over the full A x B cross product.
        index_A, index_B = divmod(index, len(self.image_files_B))

        image_A = Image.open(self.image_files_A[index_A])  # .convert("RGB")
        image_B = Image.open(self.image_files_B[index_B])  # .convert("RGB")

        if self._is_blank(image_A) or self._is_blank(image_B):
            return None, None

        if self.transform:
            image_A = self.transform(image_A)
            image_B = self.transform(image_B)

        return image_A, image_B
246
+
247
+
248
def clip_text(subject_name, image_dir):
    """Per-epoch mean CLIP text-image cosine similarity (CLIP-T).

    Args:
        subject_name: '<subject>-<prompt_idx>' identifier.
        image_dir: directory containing one sub-folder of generated images
            per epoch (folder names are integer epoch numbers).

    Returns:
        List of per-epoch mean similarities; 0 when an epoch had no valid
        (non-blank) images.

    Fix vs. original: the ``DataLoader`` that was constructed and never used
    has been removed (the loop indexes the dataset directly), along with
    dead commented-out code.
    """
    criterion = 'clip_text'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
    # Text and image preprocessors for the same CLIP checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

    epochs = sorted(int(epoch) for epoch in os.listdir(image_dir))
    best_mean_similarity = 0
    mean_similarity_list = []
    for epoch in epochs:
        similarity = []
        dataset = PromptDatasetCLIP(subject_name, image_dir, tokenizer, processor, epoch)
        for i in range(len(dataset)):
            image_inputs, prompt_inputs = dataset[i]
            # (None, None) marks a blank/failed generation — skip it.
            if image_inputs is not None and prompt_inputs is not None:
                image_inputs['pixel_values'] = image_inputs['pixel_values'].to(device)
                prompt_inputs['input_ids'] = prompt_inputs['input_ids'].to(device)
                prompt_inputs['attention_mask'] = prompt_inputs['attention_mask'].to(device)
                image_features = model.get_image_features(**image_inputs)
                text_features = model.get_text_features(**prompt_inputs)

                sim = cosine_similarity(image_features, text_features)
                similarity.append(sim.item())

        if similarity:
            mean_similarity = torch.tensor(similarity).mean().item()
            mean_similarity_list.append(mean_similarity)
            best_mean_similarity = max(best_mean_similarity, mean_similarity)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {mean_similarity}({best_mean_similarity})')
        else:
            mean_similarity_list.append(0)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {0}({best_mean_similarity})')

    return mean_similarity_list
292
+
293
+
294
def clip_image(subject_name, image_dir, dreambooth_dir='dreambooth/dataset'):
    """Per-epoch mean CLIP image-image similarity (CLIP-I).

    Args:
        subject_name: '<subject>-<prompt_idx>' identifier.
        image_dir: directory with one sub-folder of generated images per epoch.
        dreambooth_dir: root of the reference DreamBooth dataset.

    Returns:
        List of per-epoch mean similarities; 0 when an epoch had no valid pairs.

    Fix vs. original: ``logit_scale = model.logit_scale.exp()`` was computed
    on every pair but never used (the multiplication was commented out) — the
    dead computation has been removed.
    """
    criterion = 'clip_image'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
    processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

    epochs = sorted(int(epoch) for epoch in os.listdir(image_dir))
    best_mean_similarity = 0
    mean_similarity_list = []
    for epoch in epochs:
        similarity = []
        dataset = PairwiseImageDatasetCLIP(subject_name, dreambooth_dir, image_dir, processor, epoch)

        for i in range(len(dataset)):
            inputs_A, inputs_B = dataset[i]
            # (None, None) marks a blank/failed generation — skip the pair.
            if inputs_A is not None and inputs_B is not None:
                inputs_A['pixel_values'] = inputs_A['pixel_values'].to(device)
                inputs_B['pixel_values'] = inputs_B['pixel_values'].to(device)

                image_A_features = model.get_image_features(**inputs_A)
                image_B_features = model.get_image_features(**inputs_B)

                # L2-normalise so the dot product below is a cosine similarity.
                image_A_features = image_A_features / image_A_features.norm(p=2, dim=-1, keepdim=True)
                image_B_features = image_B_features / image_B_features.norm(p=2, dim=-1, keepdim=True)

                sim = torch.matmul(image_A_features, image_B_features.t())
                similarity.append(sim.item())

        if similarity:
            mean_similarity = torch.tensor(similarity).mean().item()
            best_mean_similarity = max(best_mean_similarity, mean_similarity)
            mean_similarity_list.append(mean_similarity)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {mean_similarity}({best_mean_similarity})')
        else:
            mean_similarity_list.append(0)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {0}({best_mean_similarity})')

    return mean_similarity_list
335
+
336
+
337
def dino(subject_name, image_dir, dreambooth_dir='dreambooth/dataset'):
    """Per-epoch mean DINO [CLS]-embedding similarity between real and generated images.

    Args:
        subject_name: '<subject>-<prompt_idx>' identifier.
        image_dir: directory with one sub-folder of generated images per epoch.
        dreambooth_dir: root of the reference DreamBooth dataset.

    Returns:
        List of per-epoch mean similarities; 0 when an epoch had no valid pairs.

    Fix vs. original: an epoch with no valid pairs previously produced
    ``torch.tensor([]).mean()`` == nan; it now records 0, matching the
    behaviour of clip_text / clip_image.
    """
    criterion = 'dino'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ViTModel.from_pretrained('facebook/dino-vits16').to(device)
    feature_extractor = ViTFeatureExtractor.from_pretrained('facebook/dino-vits16')

    epochs = sorted(int(epoch) for epoch in os.listdir(image_dir))
    best_mean_similarity = 0
    mean_similarity_list = []
    for epoch in epochs:
        similarity = []
        dataset = PairwiseImageDatasetDINO(subject_name, dreambooth_dir, image_dir, feature_extractor, epoch)

        for i in range(len(dataset)):
            inputs_A, inputs_B = dataset[i]
            # (None, None) marks a blank/failed generation — skip the pair.
            if inputs_A is not None and inputs_B is not None:
                inputs_A['pixel_values'] = inputs_A['pixel_values'].to(device)
                inputs_B['pixel_values'] = inputs_B['pixel_values'].to(device)

                # [CLS] token of the final layer as the global image descriptor.
                outputs_A = model(**inputs_A)
                image_A_features = outputs_A.last_hidden_state[:, 0, :]

                outputs_B = model(**inputs_B)
                image_B_features = outputs_B.last_hidden_state[:, 0, :]

                # L2-normalise so the dot product below is a cosine similarity.
                image_A_features = image_A_features / image_A_features.norm(p=2, dim=-1, keepdim=True)
                image_B_features = image_B_features / image_B_features.norm(p=2, dim=-1, keepdim=True)

                sim = torch.matmul(image_A_features, image_B_features.t())
                similarity.append(sim.item())

        if similarity:
            mean_similarity = torch.tensor(similarity).mean().item()
            best_mean_similarity = max(best_mean_similarity, mean_similarity)
            mean_similarity_list.append(mean_similarity)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {mean_similarity}({best_mean_similarity})')
        else:
            # No valid pairs this epoch: record 0 instead of nan.
            mean_similarity_list.append(0)
            print(f'epoch: {epoch}, criterion: {criterion}, mean_similarity: {0}({best_mean_similarity})')

    return mean_similarity_list
376
+
377
+
378
def lpips_image(subject_name, image_dir, dreambooth_dir='dreambooth/dataset'):
    """Per-epoch mean LPIPS (VGG) distance between real and generated images.

    Args:
        subject_name: '<subject>-<prompt_idx>' identifier.
        image_dir: directory with one sub-folder of generated images per epoch.
        dreambooth_dir: root of the reference DreamBooth dataset.

    Returns:
        List of per-epoch mean LPIPS distances; 0 when an epoch had no valid pairs.

    NOTE(review): LPIPS is a *distance* (lower = more similar) but it is
    aggregated with max() into ``best_mean_similarity``, mirroring the other
    metrics — confirm downstream consumers interpret it correctly.

    Fix vs. original: an epoch with no valid pairs previously yielded nan
    from ``torch.tensor([]).mean()``; it now records 0 for consistency with
    clip_text / clip_image.
    """
    criterion = 'lpips_image'
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Set up the LPIPS model (net='vgg' is the VGG-based model from the paper).
    loss_fn = lpips.LPIPS(net='vgg').to(device)

    # Some epochs may be missing or incomplete, so enumerate what is on disk.
    epochs = sorted(int(epoch) for epoch in os.listdir(image_dir))
    mean_similarity_list = []
    best_mean_similarity = 0
    for epoch in epochs:
        similarity = []
        dataset = PairwiseImageDatasetLPIPS(subject_name, dreambooth_dir, image_dir, epoch)

        for i in range(len(dataset)):
            image_A, image_B = dataset[i]
            # (None, None) marks a blank/failed generation — skip the pair.
            if image_A is not None and image_B is not None:
                image_A = image_A.to(device)
                image_B = image_B.to(device)

                # Perceptual distance between the two images.
                distance = loss_fn(image_A, image_B)
                similarity.append(distance.item())

        if similarity:
            mean_similarity = torch.tensor(similarity).mean().item()
            best_mean_similarity = max(best_mean_similarity, mean_similarity)
            mean_similarity_list.append(mean_similarity)
            print(f'epoch: {epoch}, criterion: LPIPS distance, mean_similarity: {mean_similarity}({best_mean_similarity})')
        else:
            # No valid pairs this epoch: record 0 instead of nan.
            mean_similarity_list.append(0)
            print(f'epoch: {epoch}, criterion: LPIPS distance, mean_similarity: {0}({best_mean_similarity})')

    return mean_similarity_list
410
+
411
if __name__ == "__main__":
    image_dir = 'log_hra/lr_1e-4_r_8/'

    # One sub-directory per evaluated subject, named '<subject>-<prompt_idx>'.
    subject_dirs, subject_names = [], []
    for name in os.listdir(image_dir):
        if os.path.isdir(os.path.join(image_dir, name)):
            subject_dirs.append(os.path.join(image_dir, name))
            subject_names.append(name)

    results_path = os.path.join(image_dir, 'true_results.json')
    # JSON-lines file; each line is
    # {'backpack-0': {'DINO': [x, ...], 'CLIP-I': [x, ...], 'CLIP-T': [x, ...], 'LPIPS': [x, ...]}}

    # Load already-computed results so finished subjects are skipped on re-run.
    # (Replaces the original manual __iter__/next/StopIteration loop with
    # idiomatic line iteration; blank lines are tolerated.)
    results_dict = dict()
    if os.path.exists(results_path):
        with open(results_path, 'r') as f:
            for line in f:
                if line.strip():
                    results_dict.update(json.loads(line))
        print("finish extraction.")

    for subject_name, subject_dir in zip(subject_names, subject_dirs):
        if subject_name in results_dict:
            continue

        print(f'evaluating {subject_dir}')
        dino_sim = dino(subject_name, subject_dir)
        clip_i_sim = clip_image(subject_name, subject_dir)
        clip_t_sim = clip_text(subject_name, subject_dir)
        lpips_sim = lpips_image(subject_name, subject_dir)

        subject_result = {'DINO': dino_sim, 'CLIP-I': clip_i_sim, 'CLIP-T': clip_t_sim, 'LPIPS': lpips_sim}
        print(subject_result)

        # Append this subject's result as one JSON line.
        with open(results_path, 'a') as f:
            json_string = json.dumps({subject_name: subject_result})
            f.write(json_string + "\n")
generation/subject/get_result.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import math
17
+ import os
18
+
19
+ from functools import reduce
20
+ import numpy as np
21
+
22
+ import json
23
+
24
+
25
if __name__ == "__main__":
    image_dir = 'log_hra/lr_1e-4_r_8/'

    results_path = os.path.join(image_dir, 'true_results.json')
    # JSON-lines file; each line is
    # {'backpack-0': {'DINO': [x, ...], 'CLIP-I': [x, ...], 'CLIP-T': [x, ...], 'LPIPS': [x, ...]}}

    # Load per-subject metric curves. (Replaces the original manual
    # __iter__/next/StopIteration loop; blank lines are tolerated.)
    results_dict = dict()
    if os.path.exists(results_path):
        with open(results_path, 'r') as f:
            for line in f:
                if line.strip():
                    results_dict.update(json.loads(line))
        print("finish extraction.")

    total_result = np.zeros(4)
    metric_name_list = ['DINO', 'CLIP-I', 'CLIP-T', 'LPIPS']
    for subject_name, subject_results in results_dict.items():

        # For each subject, pick the epoch maximising the sum of the
        # range-normalised metric curves, then accumulate its raw metrics.
        metric_results_percent = None
        for metric_name, metric_results in subject_results.items():
            metric_results = [0 if np.isnan(r) else r for r in metric_results]
            value_range = max(metric_results) - min(metric_results)
            if value_range == 0:
                # Flat curve contributes nothing to epoch selection
                # (the original divided by zero here, yielding inf/nan).
                metric_results_norm = np.zeros(len(metric_results))
            else:
                metric_results_norm = np.array(metric_results) / value_range
            if metric_results_percent is None:
                metric_results_percent = metric_results_norm
            else:
                metric_results_percent += metric_results_norm

        subject_results_max_idx = np.argmax(metric_results_percent)
        for idx, metric_name in enumerate(metric_name_list):
            total_result[idx] += subject_results[metric_name][subject_results_max_idx]
    total_result /= len(results_dict)
    print(f'DINO: {total_result[0]}, CLIP-I: {total_result[1]}, CLIP-T: {total_result[2]}, LPIPS: {total_result[3]}')
62
+
generation/subject/oft_utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .mhe import MHE_db, MHE_OFT, MHE_LoRA
2
+
generation/subject/oft_utils/attention_processor.py ADDED
@@ -0,0 +1,1036 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Callable, Optional, Union
15
+ import math
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+ from torch.autograd import Function
20
+
21
+ from diffusers.utils import deprecate, logging
22
+ from diffusers.utils.import_utils import is_xformers_available
23
+
24
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
+
26
+
27
+ if is_xformers_available():
28
+ import xformers
29
+ import xformers.ops
30
+ else:
31
+ xformers = None
32
+
33
class Attention(nn.Module):
    r"""
    A cross attention layer.
    Parameters:
        query_dim (`int`): The number of channels in the query.
        cross_attention_dim (`int`, *optional*):
            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
        heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
        dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        bias (`bool`, *optional*, defaults to False):
            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
    """

    def __init__(
        self,
        query_dim: int,
        cross_attention_dim: Optional[int] = None,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        bias=False,
        upcast_attention: bool = False,
        upcast_softmax: bool = False,
        cross_attention_norm: Optional[str] = None,
        cross_attention_norm_num_groups: int = 32,
        added_kv_proj_dim: Optional[int] = None,
        norm_num_groups: Optional[int] = None,
        out_bias: bool = True,
        scale_qk: bool = True,
        only_cross_attention: bool = False,
        processor: Optional["AttnProcessor"] = None,
    ):
        super().__init__()
        inner_dim = dim_head * heads
        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
        self.upcast_attention = upcast_attention
        self.upcast_softmax = upcast_softmax

        # 1/sqrt(d) attention scaling; disabled entirely when scale_qk=False.
        self.scale = dim_head**-0.5 if scale_qk else 1.0

        self.heads = heads
        # for slice_size > 0 the attention score computation
        # is split across the batch axis to save memory
        # You can set slice_size with `set_attention_slice`
        self.sliceable_head_dim = heads

        self.added_kv_proj_dim = added_kv_proj_dim
        self.only_cross_attention = only_cross_attention

        if self.added_kv_proj_dim is None and self.only_cross_attention:
            raise ValueError(
                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
            )

        if norm_num_groups is not None:
            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
        else:
            self.group_norm = None

        if cross_attention_norm is None:
            self.norm_cross = None
        elif cross_attention_norm == "layer_norm":
            self.norm_cross = nn.LayerNorm(cross_attention_dim)
        elif cross_attention_norm == "group_norm":
            if self.added_kv_proj_dim is not None:
                # The given `encoder_hidden_states` are initially of shape
                # (batch_size, seq_len, added_kv_proj_dim) before being projected
                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
                # before the projection, so we need to use `added_kv_proj_dim` as
                # the number of channels for the group norm.
                norm_cross_num_channels = added_kv_proj_dim
            else:
                norm_cross_num_channels = cross_attention_dim

            self.norm_cross = nn.GroupNorm(
                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
            )
        else:
            raise ValueError(
                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
            )

        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)

        if not self.only_cross_attention:
            # only relevant for the `AddedKVProcessor` classes
            self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
            self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
        else:
            self.to_k = None
            self.to_v = None

        if self.added_kv_proj_dim is not None:
            self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
            self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)

        # to_out[0] is the output projection, to_out[1] is dropout.
        self.to_out = nn.ModuleList([])
        self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias))
        self.to_out.append(nn.Dropout(dropout))

        # set attention processor
        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
        if processor is None:
            processor = (
                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and scale_qk else AttnProcessor()
            )
        self.set_processor(processor)

    def set_use_memory_efficient_attention_xformers(
        self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
    ):
        """Toggle xformers memory-efficient attention, preserving any trained
        LoRA processor weights by transferring them to the matching
        xformers/non-xformers LoRA processor variant."""
        is_lora = hasattr(self, "processor") and isinstance(
            self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor)
        )

        if use_memory_efficient_attention_xformers:
            if self.added_kv_proj_dim is not None:
                # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP
                # which uses this type of cross attention ONLY because the attention mask of format
                # [0, ..., -10.000, ..., 0, ...,] is not supported
                raise NotImplementedError(
                    "Memory efficient attention with `xformers` is currently not supported when"
                    " `self.added_kv_proj_dim` is defined."
                )
            elif not is_xformers_available():
                raise ModuleNotFoundError(
                    (
                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
                        " xformers"
                    ),
                    name="xformers",
                )
            elif not torch.cuda.is_available():
                raise ValueError(
                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
                    " only available for GPU "
                )
            else:
                try:
                    # Make sure we can run the memory efficient attention
                    _ = xformers.ops.memory_efficient_attention(
                        torch.randn((1, 2, 40), device="cuda"),
                        torch.randn((1, 2, 40), device="cuda"),
                        torch.randn((1, 2, 40), device="cuda"),
                    )
                except Exception as e:
                    raise e

            if is_lora:
                processor = LoRAXFormersAttnProcessor(
                    hidden_size=self.processor.hidden_size,
                    cross_attention_dim=self.processor.cross_attention_dim,
                    rank=self.processor.rank,
                    attention_op=attention_op,
                )
                processor.load_state_dict(self.processor.state_dict())
                processor.to(self.processor.to_q_lora.up.weight.device)
            else:
                processor = XFormersAttnProcessor(attention_op=attention_op)
        else:
            if is_lora:
                processor = LoRAAttnProcessor(
                    hidden_size=self.processor.hidden_size,
                    cross_attention_dim=self.processor.cross_attention_dim,
                    rank=self.processor.rank,
                )
                processor.load_state_dict(self.processor.state_dict())
                processor.to(self.processor.to_q_lora.up.weight.device)
            else:
                processor = AttnProcessor()

        self.set_processor(processor)

    def set_attention_slice(self, slice_size):
        """Select a (sliced) processor so attention is computed `slice_size`
        heads at a time to reduce peak memory; `None` disables slicing."""
        if slice_size is not None and slice_size > self.sliceable_head_dim:
            raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")

        if slice_size is not None and self.added_kv_proj_dim is not None:
            processor = SlicedAttnAddedKVProcessor(slice_size)
        elif slice_size is not None:
            processor = SlicedAttnProcessor(slice_size)
        elif self.added_kv_proj_dim is not None:
            processor = AttnAddedKVProcessor()
        else:
            processor = AttnProcessor()

        self.set_processor(processor)

    def set_processor(self, processor: "AttnProcessor"):
        """Install `processor` as this layer's attention implementation,
        unregistering a previous nn.Module-based processor if necessary."""
        # if current processor is in `self._modules` and if passed `processor` is not, we need to
        # pop `processor` from `self._modules`
        if (
            hasattr(self, "processor")
            and isinstance(self.processor, torch.nn.Module)
            and not isinstance(processor, torch.nn.Module)
        ):
            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
            self._modules.pop("processor")

        self.processor = processor

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
        """Delegate the attention computation to the installed processor."""
        # The `Attention` class can call different attention processors / attention functions
        # here we simply pass along all tensors to the selected processor class
        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
        return self.processor(
            self,
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )

    def batch_to_head_dim(self, tensor):
        """Inverse of `head_to_batch_dim`: (batch*heads, seq, dim_head) ->
        (batch, seq, heads*dim_head)."""
        head_size = self.heads
        batch_size, seq_len, dim = tensor.shape
        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
        return tensor

    def head_to_batch_dim(self, tensor, out_dim=3):
        """Split the channel dim into heads: (batch, seq, heads*dim_head) ->
        (batch*heads, seq, dim_head) when out_dim=3, or
        (batch, heads, seq, dim_head) when out_dim=4."""
        head_size = self.heads
        batch_size, seq_len, dim = tensor.shape
        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
        tensor = tensor.permute(0, 2, 1, 3)

        if out_dim == 3:
            tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)

        return tensor

    def get_attention_scores(self, query, key, attention_mask=None):
        """Return softmax(QK^T * scale + mask) in the query's original dtype,
        optionally upcasting the matmul and/or softmax to float32."""
        dtype = query.dtype
        if self.upcast_attention:
            query = query.float()
            key = key.float()

        # baddbmm fuses the additive mask (beta=1) with the scaled QK^T product.
        if attention_mask is None:
            baddbmm_input = torch.empty(
                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
            )
            beta = 0
        else:
            baddbmm_input = attention_mask
            beta = 1

        attention_scores = torch.baddbmm(
            baddbmm_input,
            query,
            key.transpose(-1, -2),
            beta=beta,
            alpha=self.scale,
        )

        if self.upcast_softmax:
            attention_scores = attention_scores.float()

        attention_probs = attention_scores.softmax(dim=-1)
        attention_probs = attention_probs.to(dtype)

        return attention_probs

    def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
        """Pad `attention_mask` to `target_length` along its last axis and
        expand it across heads so it can be added to attention scores.
        Returns None unchanged."""
        if batch_size is None:
            deprecate(
                "batch_size=None",
                "0.0.15",
                (
                    "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
                    " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
                    " `prepare_attention_mask` when preparing the attention_mask."
                ),
            )
            batch_size = 1

        head_size = self.heads
        if attention_mask is None:
            return attention_mask

        if attention_mask.shape[-1] != target_length:
            if attention_mask.device.type == "mps":
                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
                # Instead, we can manually construct the padding tensor.
                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat([attention_mask, padding], dim=2)
            else:
                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

        if out_dim == 3:
            if attention_mask.shape[0] < batch_size * head_size:
                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
        elif out_dim == 4:
            attention_mask = attention_mask.unsqueeze(1)
            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

        return attention_mask

    def norm_encoder_hidden_states(self, encoder_hidden_states):
        """Apply the configured cross-attention normalization (LayerNorm or
        GroupNorm) to `encoder_hidden_states`."""
        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"

        if isinstance(self.norm_cross, nn.LayerNorm):
            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
        elif isinstance(self.norm_cross, nn.GroupNorm):
            # Group norm norms along the channels dimension and expects
            # input to be in the shape of (N, C, *). In this case, we want
            # to norm along the hidden dimension, so we need to move
            # (batch_size, sequence_length, hidden_size) ->
            # (batch_size, hidden_size, sequence_length)
            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
        else:
            assert False

        return encoder_hidden_states
352
+
353
+
354
class AttnProcessor:
    r"""Default attention processor: plain scaled dot-product attention using
    the layer's frozen q/k/v/out projections, no adapters."""

    def __call__(
        self,
        attn: Attention,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
    ):
        # Mask length follows the key/value sequence (context) length.
        if encoder_hidden_states is None:
            batch_size, sequence_length, _ = hidden_states.shape
        else:
            batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
        query = attn.to_q(hidden_states)

        # Self-attention when no context is given; otherwise optionally
        # normalize the cross-attention context.
        if encoder_hidden_states is None:
            context = hidden_states
        elif attn.norm_cross:
            context = attn.norm_encoder_hidden_states(encoder_hidden_states)
        else:
            context = encoder_hidden_states

        key = attn.to_k(context)
        value = attn.to_v(context)

        query, key, value = (attn.head_to_batch_dim(t) for t in (query, key, value))

        probs = attn.get_attention_scores(query, key, attention_mask)
        attended = attn.batch_to_head_dim(torch.bmm(probs, value))

        # Output projection followed by dropout.
        attended = attn.to_out[0](attended)
        attended = attn.to_out[1](attended)

        return attended
+ return hidden_states
390
+
391
+
392
class HRALinearLayer(nn.Module):
    """Householder reflection adaptation (HRA) wrapper for a frozen nn.Linear.

    The frozen weight W of the wrapped layer is right-multiplied by a chain of
    `r` Householder reflections (I - 2 u u^T), each built from one normalized
    column of the trainable parameter `hra_u`.

    Args:
        in_features: input dimension of the wrapped linear layer.
        out_features: output dimension of the wrapped linear layer.
        bias: unused; kept for signature compatibility.
        r: number of Householder reflections; must be even, because the
            columns are initialized in duplicated pairs so that the initial
            reflection product is the identity (the adapter starts as a no-op).
        apply_GS: if True, Gram-Schmidt-orthonormalize the reflection vectors
            before building a single combined reflection.
            NOTE(review): with the duplicated-pair init, Gram-Schmidt makes the
            second vector of each pair degenerate (zero after projection) —
            confirm apply_GS=True is only used with a different init.
    """

    def __init__(self, in_features, out_features, bias=False, r=8, apply_GS=False):
        super(HRALinearLayer, self).__init__()

        self.in_features = in_features
        self.out_features = out_features

        # Stored as buffers so the dimensions travel with the state_dict.
        self.register_buffer('cross_attention_dim', torch.tensor(in_features))
        self.register_buffer('hidden_size', torch.tensor(out_features))

        self.r = r
        self.apply_GS = apply_GS

        # r/2 random directions, each duplicated: two identical reflections
        # cancel, so the initial adapted weight equals W exactly.
        half_u = torch.zeros(in_features, r // 2)
        nn.init.kaiming_uniform_(half_u, a=math.sqrt(5))
        self.hra_u = nn.Parameter(torch.repeat_interleave(half_u, 2, dim=1), requires_grad=True)

    def forward(self, attn, x):
        """Apply the wrapped linear layer `attn` with the HRA-adapted weight.

        Args:
            attn: the frozen nn.Linear whose weight/bias are adapted.
            x: input tensor with last dimension `in_features`.

        Returns:
            `x @ (W @ H_1 @ ... @ H_r)^T + b`, where each H_i is a Householder
            reflection.
        """
        orig_weight = attn.weight.data
        # Hoisted: the original rebuilt this identity matrix inside the loop
        # on every one of the r iterations.
        identity = torch.eye(self.in_features, device=x.device)

        if self.apply_GS:
            # Orthonormalize the columns of hra_u via Gram-Schmidt, then apply
            # all r reflections at once: I - 2 U U^T with orthonormal U.
            basis = [(self.hra_u[:, 0] / self.hra_u[:, 0].norm()).view(-1, 1)]
            for i in range(1, self.r):
                ui = self.hra_u[:, i].view(-1, 1)
                for j in range(i):
                    ui = ui - (basis[j].t() @ ui) * basis[j]
                basis.append((ui / ui.norm()).view(-1, 1))
            basis = torch.cat(basis, dim=1)
            new_weight = orig_weight @ (identity - 2 * basis @ basis.t())
        else:
            # Chain the r reflections one by one.
            # NOTE(review): assumes no column of hra_u has zero norm.
            new_weight = orig_weight
            hra_u_norm = self.hra_u / self.hra_u.norm(dim=0)
            for i in range(self.r):
                ui = hra_u_norm[:, i].view(-1, 1)
                new_weight = torch.mm(new_weight, identity - 2 * ui @ ui.t())

        return nn.functional.linear(input=x, weight=new_weight, bias=attn.bias)
447
+
448
class HRAAttnProcessor(nn.Module):
    """Attention processor that routes the q/k/v/out projections of a frozen
    `Attention` layer through trainable `HRALinearLayer` adapters."""

    def __init__(self, hidden_size, cross_attention_dim=None, r=8, apply_GS=False):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.r = r

        kv_dim = cross_attention_dim or hidden_size
        self.to_q_hra = HRALinearLayer(hidden_size, hidden_size, r=r, apply_GS=apply_GS)
        self.to_k_hra = HRALinearLayer(kv_dim, hidden_size, r=r, apply_GS=apply_GS)
        self.to_v_hra = HRALinearLayer(kv_dim, hidden_size, r=r, apply_GS=apply_GS)
        self.to_out_hra = HRALinearLayer(hidden_size, hidden_size, r=r, apply_GS=apply_GS)

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
        # `scale` is accepted for interface parity with LoRA processors but unused.
        if encoder_hidden_states is None:
            batch_size, sequence_length, _ = hidden_states.shape
        else:
            batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        query = attn.head_to_batch_dim(self.to_q_hra(attn.to_q, hidden_states))

        if encoder_hidden_states is None:
            context = hidden_states
        elif attn.norm_cross:
            context = attn.norm_encoder_hidden_states(encoder_hidden_states)
        else:
            context = encoder_hidden_states

        key = attn.head_to_batch_dim(self.to_k_hra(attn.to_k, context))
        value = attn.head_to_batch_dim(self.to_v_hra(attn.to_v, context))

        probs = attn.get_attention_scores(query, key, attention_mask)
        attended = attn.batch_to_head_dim(torch.bmm(probs, value))

        # HRA-adapted output projection, then the layer's dropout.
        attended = self.to_out_hra(attn.to_out[0], attended)
        return attn.to_out[1](attended)
496
+
497
+
498
def project(R, eps):
    """Clamp the COFT parameter `R` into a Frobenius ball of radius `eps`.

    Despite the variable naming in the original, the ball is centered at the
    ZERO matrix: `R` is the skew-symmetric parameter (initialized at zero)
    that the Cayley transform later maps to a rotation, so constraining it
    near zero keeps the rotation near the identity.
    """
    origin = torch.zeros((R.size(0), R.size(0)), dtype=R.dtype, device=R.device)
    offset = R - origin
    distance = torch.norm(offset)
    if distance <= eps:
        return R
    return origin + eps * (offset / distance)
506
+
507
def project_batch(R, eps=1e-5):
    """Batched `project`: clamp each block of `R` (shape: blocks x n x n)
    into a Frobenius ball around the zero matrix.

    The radius is scaled by 1/sqrt(num_blocks) so the constraint tightens
    with the number of smaller block matrices.
    """
    # scaling factor for each of the smaller block matrix
    eps = eps * 1 / torch.sqrt(torch.tensor(R.shape[0]))
    origin = torch.zeros((R.size(1), R.size(1)), device=R.device, dtype=R.dtype).unsqueeze(0).expand_as(R)
    offset = R - origin
    distances = torch.norm(offset, dim=(1, 2), keepdim=True)
    inside = (distances <= eps).bool()
    return torch.where(inside, R, origin + eps * (offset / distances))
516
+
517
+
518
class OFTLinearLayer(nn.Module):
    """Orthogonal finetuning (OFT) wrapper for a frozen nn.Linear.

    A trainable skew-symmetric parameter `R` (initialized at zero) is mapped
    through the Cayley transform to a block-diagonal orthogonal matrix that
    left-multiplies the frozen weight. With `is_coft=True`, `R` is projected
    in-place (under no_grad) into an eps-ball around zero before every
    forward, implementing the constrained variant COFT.
    """

    def __init__(self, in_features, out_features, bias=False, block_share=False, eps=6e-5, r=4, is_coft=False):
        # bias: unused; kept for signature compatibility.
        # block_share: share one rotation block across all r blocks.
        # r: number of diagonal blocks (in_features must be divisible by r).
        super(OFTLinearLayer, self).__init__()

        # Define the reduction rate:
        self.r = r

        # Check whether to use the constrained variant COFT
        self.is_coft = is_coft

        assert in_features % self.r == 0, "in_features must be divisible by r"

        # Get the number of available GPUs
        # self.num_gpus = torch.cuda.device_count()
        # Set the device IDs for distributed training
        # self.device_ids = list(range(self.num_gpus))

        self.in_features=in_features
        self.out_features=out_features

        # Buffers so the dimensions travel with the state_dict.
        self.register_buffer('cross_attention_dim', torch.tensor(in_features))
        self.register_buffer('hidden_size', torch.tensor(out_features))

        # Define the fixed Linear layer: v
        # self.OFT = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)

        #self.filt_shape = [in_features, in_features]
        self.fix_filt_shape = [in_features, out_features]

        self.block_share = block_share
        # Define the trainable matrix parameter: R
        if self.block_share:
            # Initialized as an identity matrix
            # (R=0 maps to the identity rotation under the Cayley transform).
            self.R_shape = [in_features // self.r, in_features // self.r]
            self.R = nn.Parameter(torch.zeros(self.R_shape[0], self.R_shape[0]), requires_grad=True)

            # COFT radius scales with the block area.
            self.eps = eps * self.R_shape[0] * self.R_shape[0]
        else:
            # Initialized as an identity matrix
            self.R_shape = [self.r, in_features // self.r, in_features // self.r]
            R = torch.zeros(self.R_shape[1], self.R_shape[1])
            R = torch.stack([R] * self.r)
            self.R = nn.Parameter(R, requires_grad=True)
            self.eps = eps * self.R_shape[1] * self.R_shape[1]

        # Debug leftover; only used by the commented-out tracing code below.
        self.tmp = None

    def forward(self, attn, x):
        """Apply the wrapped linear layer `attn` with an orthogonally rotated
        weight to `x`.

        Args:
            attn: the frozen nn.Linear being adapted.
            x: input tensor with last dimension `in_features`.
        """
        orig_dtype = x.dtype
        dtype = self.R.dtype

        if self.block_share:
            if self.is_coft:
                # In-place projection: keeps the optimizer state attached to R.
                with torch.no_grad():
                    self.R.copy_(project(self.R, eps=self.eps))
            orth_rotate = self.cayley(self.R)
        else:
            if self.is_coft:
                with torch.no_grad():
                    self.R.copy_(project_batch(self.R, eps=self.eps))
            # Without this cayley_batch step, self.R would never be updated.
            orth_rotate = self.cayley_batch(self.R)

        # print('self.tmp[:5, :5]')
        # print(self.tmp[:5, :5])
        # if self.tmp is not None:
        #     print('self.R[0, :5, :5] - self.tmp[0, :5, :5]')
        #     print(self.R[0, :5, :5] - self.tmp[0, :5, :5])
        # self.tmp = self.R.clone()

        # Block-diagonal parametrization
        block_diagonal_matrix = self.block_diagonal(orth_rotate)

        # fix filter: rotate the frozen weight (transposed so the rotation
        # acts on the input dimension).
        fix_filt = attn.weight.data
        fix_filt = torch.transpose(fix_filt, 0, 1)
        filt = torch.mm(block_diagonal_matrix, fix_filt.to(dtype))
        filt = torch.transpose(filt, 0, 1)

        # Apply the trainable identity matrix
        bias_term = attn.bias.data if attn.bias is not None else None
        if bias_term is not None:
            bias_term = bias_term.to(orig_dtype)

        out = nn.functional.linear(input=x.to(orig_dtype), weight=filt.to(orig_dtype), bias=bias_term)
        # out = nn.functional.linear(input=x, weight=fix_filt.transpose(0, 1), bias=bias_term)

        return out

    def cayley(self, data):
        """Cayley transform: map a matrix (skew-symmetrized first) to an
        orthogonal matrix Q = (I - A)(I + A)^-1."""
        r, c = list(data.shape)
        # Ensure the input matrix is skew-symmetric
        skew = 0.5 * (data - data.t())
        I = torch.eye(r, device=data.device)
        # Perform the Cayley parametrization
        Q = torch.mm(I - skew, torch.inverse(I + skew))

        return Q

    def cayley_batch(self, data):
        """Batched Cayley transform over a stack of square matrices."""
        b, r, c = data.shape
        # Ensure the input matrix is skew-symmetric
        skew = 0.5 * (data - data.transpose(1, 2))
        # I = torch.eye(r, device=data.device).unsqueeze(0).repeat(b, 1, 1)
        I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c)

        # Perform the Cayley parametrization
        Q = torch.bmm(I - skew, torch.inverse(I + skew))

        return Q

    def block_diagonal(self, R):
        """Assemble the full rotation as a block-diagonal matrix from either a
        single shared block (2-D R) or a stack of r blocks (3-D R)."""
        if len(R.shape) == 2:
            # Create a list of R repeated block_count times
            blocks = [R] * self.r
        else:
            # Create a list of R slices along the third dimension
            blocks = [R[i, ...] for i in range(self.r)]

        # Use torch.block_diag to create the block diagonal matrix
        A = torch.block_diag(*blocks)

        return A

    def is_orthogonal(self, R, eps=1e-5):
        """Debug helper: check R^T R is within eps of the identity."""
        with torch.no_grad():
            RtR = torch.matmul(R.t(), R)
            diff = torch.abs(RtR - torch.eye(R.shape[1], dtype=R.dtype, device=R.device))
            return torch.all(diff < eps)

    def is_identity_matrix(self, tensor):
        """Debug helper: exact (element-wise equal) identity-matrix check."""
        if not torch.is_tensor(tensor):
            raise TypeError("Input must be a PyTorch tensor.")
        if tensor.ndim != 2 or tensor.shape[0] != tensor.shape[1]:
            return False
        identity = torch.eye(tensor.shape[0], device=tensor.device)
        return torch.all(torch.eq(tensor, identity))
655
+
656
+
657
class OFTAttnProcessor(nn.Module):
    """Attention processor that routes the q/k/v/out projections of a frozen
    `Attention` layer through trainable `OFTLinearLayer` adapters (orthogonal
    finetuning; `is_coft=True` enables the constrained variant)."""

    def __init__(self, hidden_size, cross_attention_dim=None, eps=2e-5, r=4, is_coft=False):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim
        self.r = r
        self.is_coft = is_coft

        kv_dim = cross_attention_dim or hidden_size
        self.to_q_oft = OFTLinearLayer(hidden_size, hidden_size, eps=eps, r=r, is_coft=is_coft)
        self.to_k_oft = OFTLinearLayer(kv_dim, hidden_size, eps=eps, r=r, is_coft=is_coft)
        self.to_v_oft = OFTLinearLayer(kv_dim, hidden_size, eps=eps, r=r, is_coft=is_coft)
        self.to_out_oft = OFTLinearLayer(hidden_size, hidden_size, eps=eps, r=r, is_coft=is_coft)

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0):
        # `scale` is accepted for interface parity with LoRA processors but unused.
        if encoder_hidden_states is None:
            batch_size, sequence_length, _ = hidden_states.shape
        else:
            batch_size, sequence_length, _ = encoder_hidden_states.shape
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        query = attn.head_to_batch_dim(self.to_q_oft(attn.to_q, hidden_states))

        if encoder_hidden_states is None:
            context = hidden_states
        elif attn.norm_cross:
            context = attn.norm_encoder_hidden_states(encoder_hidden_states)
        else:
            context = encoder_hidden_states

        key = attn.head_to_batch_dim(self.to_k_oft(attn.to_k, context))
        value = attn.head_to_batch_dim(self.to_v_oft(attn.to_v, context))

        probs = attn.get_attention_scores(query, key, attention_mask)
        attended = attn.batch_to_head_dim(torch.bmm(probs, value))

        # OFT-adapted output projection, then the layer's dropout.
        attended = self.to_out_oft(attn.to_out[0], attended)
        return attn.to_out[1](attended)
706
+
707
+
708
class AttnAddedKVProcessor:
    """Processor for attention layers with extra key/value projections
    (`add_k_proj`/`add_v_proj`), as used by UnCLIP-style cross attention.
    Takes spatial input (batch, channels, *spatial) and adds a residual."""

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        residual = hidden_states
        # Flatten spatial dims: (B, C, ...) -> (B, seq, C).
        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
        batch_size, sequence_length, _ = hidden_states.shape

        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.head_to_batch_dim(attn.to_q(hidden_states))

        extra_key = attn.head_to_batch_dim(attn.add_k_proj(encoder_hidden_states))
        extra_value = attn.head_to_batch_dim(attn.add_v_proj(encoder_hidden_states))

        if attn.only_cross_attention:
            key = extra_key
            value = extra_value
        else:
            # Self-attention k/v are concatenated after the projected context.
            self_key = attn.head_to_batch_dim(attn.to_k(hidden_states))
            self_value = attn.head_to_batch_dim(attn.to_v(hidden_states))
            key = torch.cat([extra_key, self_key], dim=1)
            value = torch.cat([extra_value, self_value], dim=1)

        probs = attn.get_attention_scores(query, key, attention_mask)
        attended = attn.batch_to_head_dim(torch.bmm(probs, value))

        # Output projection followed by dropout.
        attended = attn.to_out[0](attended)
        attended = attn.to_out[1](attended)

        # Restore the spatial layout and add the residual connection.
        attended = attended.transpose(-1, -2).reshape(residual.shape)
        return attended + residual
+ return hidden_states
755
+
756
+
757
class AttnAddedKVProcessor2_0:
    """PyTorch-2.0 variant of :class:`AttnAddedKVProcessor` that runs the
    attention itself through ``F.scaled_dot_product_attention``.

    Tensors are kept 4-D (batch, heads, seq, head_dim) via ``out_dim=4`` so
    they can be fed to SDPA directly.
    """

    def __init__(self):
        # SDPA only exists in PyTorch >= 2.0; fail fast otherwise.
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        # Keep the original channel-first tensor for the residual add at the end.
        residual = hidden_states
        # (B, C, *spatial) -> (B, seq, C)
        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
        batch_size, sequence_length, _ = hidden_states.shape

        # out_dim=4: mask shaped for SDPA's (batch, heads, src, tgt) layout.
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4)

        # Self-attention when no encoder states are given; otherwise optionally
        # normalize the cross-attention context.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        # Group norm operates channel-first, hence the transpose round-trip.
        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        query = attn.head_to_batch_dim(query, out_dim=4)

        # Extra key/value projections computed from the (encoder) context.
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4)
        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4)

        if not attn.only_cross_attention:
            # Concatenate self K/V after the added-projection K/V; dim=2 is the
            # sequence axis in the 4-D (batch, heads, seq, head_dim) layout.
            key = attn.to_k(hidden_states)
            value = attn.to_v(hidden_states)
            key = attn.head_to_batch_dim(key, out_dim=4)
            value = attn.head_to_batch_dim(value, out_dim=4)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
        else:
            key = encoder_hidden_states_key_proj
            value = encoder_hidden_states_value_proj

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )
        # Back to (B, seq, C); residual.shape[1] is the channel count C.
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1])

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # Restore the original (channel-first) shape and apply the residual.
        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
        hidden_states = hidden_states + residual

        return hidden_states
813
+
814
+
815
class XFormersAttnProcessor:
    """Attention processor that delegates the attention computation to
    xformers' memory-efficient kernel."""

    def __init__(self, attention_op: Optional[Callable] = None):
        # Explicit xformers operator; None lets xformers choose one itself.
        self.attention_op = attention_op

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        # Mask dimensions follow the attention context (the encoder states
        # when cross-attending, the hidden states otherwise).
        context = hidden_states if encoder_hidden_states is None else encoder_hidden_states
        n_batch, n_tokens, _ = context.shape

        attention_mask = attn.prepare_attention_mask(attention_mask, n_tokens, n_batch)

        q = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        k = attn.to_k(encoder_hidden_states)
        v = attn.to_v(encoder_hidden_states)

        # xformers expects contiguous (batch*heads, seq, head_dim) tensors.
        q = attn.head_to_batch_dim(q).contiguous()
        k = attn.head_to_batch_dim(k).contiguous()
        v = attn.head_to_batch_dim(v).contiguous()

        out = xformers.ops.memory_efficient_attention(
            q, k, v, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
        )
        out = out.to(q.dtype)
        out = attn.batch_to_head_dim(out)

        # Output projection, then dropout.
        out = attn.to_out[0](out)
        out = attn.to_out[1](out)
        return out
851
+
852
+
853
class AttnProcessor2_0:
    """Attention processor backed by PyTorch 2.0's
    ``torch.nn.functional.scaled_dot_product_attention``."""

    def __init__(self):
        # SDPA only exists in PyTorch >= 2.0; fail fast otherwise.
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        # Mask dimensions follow the attention context (the encoder states
        # when cross-attending, the hidden states otherwise).
        context = hidden_states if encoder_hidden_states is None else encoder_hidden_states
        n_batch, n_tokens, _ = context.shape
        inner_dim = hidden_states.shape[-1]

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, n_tokens, n_batch)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(n_batch, attn.heads, -1, attention_mask.shape[-1])

        q = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        k = attn.to_k(encoder_hidden_states)
        v = attn.to_v(encoder_hidden_states)

        # Split heads: (B, seq, inner) -> (B, heads, seq, head_dim).
        head_dim = inner_dim // attn.heads
        q = q.view(n_batch, -1, attn.heads, head_dim).transpose(1, 2)
        k = k.view(n_batch, -1, attn.heads, head_dim).transpose(1, 2)
        v = v.view(n_batch, -1, attn.heads, head_dim).transpose(1, 2)

        # the output of sdp = (batch, num_heads, seq_len, head_dim)
        # TODO: add support for attn.scale when we move to Torch 2.1
        out = F.scaled_dot_product_attention(
            q, k, v, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        # Merge heads back: (B, heads, seq, head_dim) -> (B, seq, inner).
        out = out.transpose(1, 2).reshape(n_batch, -1, attn.heads * head_dim)
        out = out.to(q.dtype)

        # Output projection, then dropout.
        out = attn.to_out[0](out)
        out = attn.to_out[1](out)
        return out
899
+
900
+
901
class SlicedAttnProcessor:
    """Attention processor that computes attention in slices along the
    (batch * heads) dimension to bound peak memory usage."""

    def __init__(self, slice_size):
        # Number of (batch*heads) rows handled per attention slice.
        self.slice_size = slice_size

    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
        # Mask dimensions follow the attention context (encoder states if present).
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        query = attn.to_q(hidden_states)
        dim = query.shape[-1]
        query = attn.head_to_batch_dim(query)

        # Self-attention when no encoder states are given; otherwise optionally
        # normalize the cross-attention context.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        # Pre-allocated output buffer, filled slice by slice below.
        batch_size_attention, query_tokens, _ = query.shape
        hidden_states = torch.zeros(
            (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
        )

        # NOTE(review): a trailing remainder (batch_size_attention not divisible
        # by slice_size) would be left as zeros — presumably callers pick a
        # divisor; confirm.
        for i in range(batch_size_attention // self.slice_size):
            start_idx = i * self.slice_size
            end_idx = (i + 1) * self.slice_size

            query_slice = query[start_idx:end_idx]
            key_slice = key[start_idx:end_idx]
            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None

            attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)

            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])

            hidden_states[start_idx:end_idx] = attn_slice

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        return hidden_states
952
+
953
+
954
class SlicedAttnAddedKVProcessor:
    """Sliced-attention variant of :class:`AttnAddedKVProcessor`: extra
    ``add_k_proj``/``add_v_proj`` projections plus slice-wise attention to
    bound peak memory."""

    def __init__(self, slice_size):
        # Number of (batch*heads) rows handled per attention slice.
        self.slice_size = slice_size

    def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None):
        # Keep the original channel-first tensor for the residual add at the end.
        residual = hidden_states
        # (B, C, *spatial) -> (B, seq, C)
        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)

        batch_size, sequence_length, _ = hidden_states.shape

        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        # Self-attention when no encoder states are given; otherwise optionally
        # normalize the cross-attention context.
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        # Group norm operates channel-first, hence the transpose round-trip.
        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)
        dim = query.shape[-1]
        query = attn.head_to_batch_dim(query)

        # Extra key/value projections computed from the (encoder) context.
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)

        if not attn.only_cross_attention:
            # Concatenate self K/V after the added-projection K/V along the
            # sequence dimension.
            key = attn.to_k(hidden_states)
            value = attn.to_v(hidden_states)
            key = attn.head_to_batch_dim(key)
            value = attn.head_to_batch_dim(value)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
        else:
            key = encoder_hidden_states_key_proj
            value = encoder_hidden_states_value_proj

        # Pre-allocated output buffer, filled slice by slice below.
        batch_size_attention, query_tokens, _ = query.shape
        hidden_states = torch.zeros(
            (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype
        )

        # NOTE(review): a trailing remainder (batch_size_attention not divisible
        # by slice_size) would be left as zeros — presumably callers pick a
        # divisor; confirm.
        for i in range(batch_size_attention // self.slice_size):
            start_idx = i * self.slice_size
            end_idx = (i + 1) * self.slice_size

            query_slice = query[start_idx:end_idx]
            key_slice = key[start_idx:end_idx]
            attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None

            attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)

            attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx])

            hidden_states[start_idx:end_idx] = attn_slice

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # Restore the original (channel-first) shape and apply the residual.
        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
        hidden_states = hidden_states + residual

        return hidden_states
1024
+
1025
+
1026
# Type alias covering every attention-processor variant defined in this module
# (including the OFT/HRA fine-tuning processors); used wherever any processor
# implementation is accepted.
AttentionProcessor = Union[
    AttnProcessor,
    AttnProcessor2_0,
    XFormersAttnProcessor,
    SlicedAttnProcessor,
    AttnAddedKVProcessor,
    SlicedAttnAddedKVProcessor,
    AttnAddedKVProcessor2_0,
    OFTAttnProcessor,
    HRAAttnProcessor
]
generation/subject/oft_utils/mhe.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import math
6
+
7
+
8
+ import copy
9
+ import numpy as np
10
+
11
class MHE_LoRA(nn.Module):
    """Minimum hyperspherical energy (MHE) of a LoRA fine-tuned model.

    On construction, every LoRA-adapted attention projection weight
    (``to_q``/``to_k``/``to_v``/``to_out.0``) has its low-rank update
    ``up @ down`` merged into a detached snapshot of the base weight, and the
    LoRA factor tensors are removed from the snapshot. ``calculate_mhe`` then
    scores the merged (effective) weights.

    NOTE(review): the merge and the energy computation move tensors through
    ``.cuda()``, so a CUDA device is required once a LoRA factor or a 2-D/4-D
    weight is encountered — confirm before running on CPU-only hosts.
    """

    def __init__(self, model):
        super(MHE_LoRA, self).__init__()
        # Frozen deep copy of the model (kept for reference; the energy is
        # computed from the extracted state-dict snapshot below).
        self.model = self.copy_without_grad(model)

        self.extracted_params = {}
        keys_to_delete = []

        # Detached snapshot of every tensor in the state dict.
        for name, tensor in model.state_dict().items():
            self.extracted_params[name] = tensor.detach().clone()

        # Merge LoRA factors into their base attention projection weights.
        for name in self.extracted_params:
            if 'attn' in name and 'processor' not in name:
                if 'weight' in name:
                    if 'to_q' in name:
                        lora_down = name.replace('to_q', 'processor.to_q_lora.down')
                        lora_up = name.replace('to_q', 'processor.to_q_lora.up')
                    elif 'to_k' in name:
                        lora_down = name.replace('to_k', 'processor.to_k_lora.down')
                        lora_up = name.replace('to_k', 'processor.to_k_lora.up')
                    elif 'to_v' in name:
                        lora_down = name.replace('to_v', 'processor.to_v_lora.down')
                        lora_up = name.replace('to_v', 'processor.to_v_lora.up')
                    elif 'to_out' in name:
                        lora_down = name.replace('to_out.0', 'processor.to_out_lora.down')
                        lora_up = name.replace('to_out.0', 'processor.to_out_lora.up')
                    else:
                        # BUGFIX: an attention weight with no LoRA factors
                        # (e.g. add_k_proj/add_v_proj). The previous `pass`
                        # fell through and reused lora_up/lora_down left over
                        # from an earlier iteration (or raised NameError on
                        # the first such weight). Skip it instead.
                        continue
                    with torch.no_grad():
                        # W <- W + up @ down. Assumes the base weight lives on
                        # the same device as the .cuda() product — TODO confirm.
                        self.extracted_params[name] += self.extracted_params[lora_up].cuda() @ self.extracted_params[lora_down].cuda()
                    keys_to_delete.append(lora_up)
                    keys_to_delete.append(lora_down)

        # Drop the merged LoRA factors so only effective weights remain.
        for key in keys_to_delete:
            del self.extracted_params[key]

    def copy_without_grad(self, model):
        """Deep-copy `model` with every parameter detached and frozen."""
        copied_model = copy.deepcopy(model)
        for param in copied_model.parameters():
            param.requires_grad = False
            param.detach_()
        return copied_model

    @staticmethod
    def mhe_loss(filt):
        """Half-space MHE energy of a single weight tensor.

        2-D weights are treated as (out, in) matrices; 4-D conv filters are
        flattened to (out, in*kh*kw) first. Each filter column and its antipode
        are normalized, and the mean pairwise inverse chordal distance over all
        distinct pairs is returned (lower = filters more uniformly spread).

        NOTE(review): builds an intermediate on `.cuda()`; requires a GPU.
        """
        if len(filt.shape) == 2:
            n_filt, _ = filt.shape
            # Columns become filters; append antipodal copies (half-space MHE).
            filt = torch.transpose(filt, 0, 1)
            filt_neg = filt * (-1)
            filt = torch.cat((filt, filt_neg), dim=1)
            n_filt *= 2

            # Normalize by column norms (epsilon keeps the division finite).
            filt_norm = torch.sqrt(torch.sum(filt * filt, dim=0, keepdim=True) + 1e-4)
            norm_mat = torch.matmul(filt_norm.t(), filt_norm)
            inner_pro = torch.matmul(filt.t(), filt)
            inner_pro /= norm_mat

            # 1/dist energies; the added diagonal keeps self-terms finite, and
            # the tril subtraction counts each unordered pair once.
            cross_terms = (2.0 - 2.0 * inner_pro + torch.diag(torch.tensor([1.0] * n_filt)).cuda())
            final = torch.pow(cross_terms, torch.ones_like(cross_terms) * (-0.5))
            final -= torch.tril(final)
            cnt = n_filt * (n_filt - 1) / 2.0
            MHE_loss = 1 * torch.sum(final) / cnt

        else:
            # Conv filters: flatten each output filter before the same
            # computation as the 2-D branch.
            n_filt, _, _, _ = filt.shape
            filt = filt.reshape(n_filt, -1)
            filt = torch.transpose(filt, 0, 1)
            filt_neg = filt * -1
            filt = torch.cat((filt, filt_neg), dim=1)
            n_filt *= 2

            filt_norm = torch.sqrt(torch.sum(filt * filt, dim=0, keepdim=True) + 1e-4)
            norm_mat = torch.matmul(filt_norm.t(), filt_norm)
            inner_pro = torch.matmul(filt.t(), filt)
            inner_pro /= norm_mat

            cross_terms = (2.0 - 2.0 * inner_pro + torch.diag(torch.tensor([1.0] * n_filt)).cuda())
            final = torch.pow(cross_terms, torch.ones_like(cross_terms) * (-0.5))
            final -= torch.tril(final)
            cnt = n_filt * (n_filt - 1) / 2.0
            MHE_loss = 1 * torch.sum(final) / cnt

        return MHE_loss

    def calculate_mhe(self):
        """Sum of MHE energies over all 2-D (linear) and 4-D (conv) weights."""
        mhe_loss = []
        with torch.no_grad():
            for name in self.extracted_params:
                weight = self.extracted_params[name]
                # linear layer or conv layer
                if len(weight.shape) == 2 or len(weight.shape) == 4:
                    loss = self.mhe_loss(weight)
                    mhe_loss.append(loss.cpu().detach().item())
        mhe_loss = np.array(mhe_loss)
        return mhe_loss.sum()
109
+
110
+
111
def project(R, eps):
    """Project a single square matrix onto the Frobenius ball of radius `eps`.

    The ball is centred at the zero matrix (named ``I`` in the original OFT
    code): the skew parameter of an identity rotation. If `R` is already
    inside the ball it is returned unchanged; otherwise it is rescaled onto
    the boundary.
    """
    center = torch.zeros((R.size(0), R.size(0)), dtype=R.dtype, device=R.device)
    offset = R - center
    dist = torch.norm(offset)
    if dist <= eps:
        return R
    return center + eps * (offset / dist)
119
+
120
def project_batch(R, eps=1e-5):
    """Batched variant of `project`.

    Each (r, r) block of the (b, r, r) stack `R` is projected onto a
    Frobenius ball centred at zero whose radius is `eps` shrunk by
    1/sqrt(b), so the stacked constraint matches the single-matrix case.
    """
    # scaling factor for each of the smaller block matrix
    radius = eps * 1 / torch.sqrt(torch.tensor(R.shape[0]))
    center = torch.zeros((R.size(1), R.size(1)), device=R.device, dtype=R.dtype).unsqueeze(0).expand_as(R)
    offset = R - center
    dist = torch.norm(R - center, dim=(1, 2), keepdim=True)
    keep = (dist <= radius).bool()
    return torch.where(keep, R, center + radius * (offset / dist))
129
+
130
+
131
class MHE_OFT(nn.Module):
    """Minimum hyperspherical energy (MHE) of an OFT fine-tuned model.

    On construction, every OFT-adapted attention projection weight
    (``to_q``/``to_k``/``to_v``/``to_out.0``) is multiplied by the
    block-diagonal orthogonal rotation obtained from the learned skew
    parameter ``R`` via Cayley parametrization; the ``R`` tensors are then
    removed from the snapshot so ``calculate_mhe`` scores the effective
    (rotated) weights.

    NOTE(review): ``R`` is moved to ``.cuda()`` and the energy computation
    also allocates on CUDA, so a GPU is required once any OFT factor or
    2-D/4-D weight is encountered — confirm before running on CPU-only hosts.
    """

    def __init__(self, model, eps=6e-5, r=4):
        super(MHE_OFT, self).__init__()

        # Number of diagonal blocks used when a single shared (non-batched)
        # rotation R is tiled into the full block-diagonal matrix.
        self.r = r

        self.extracted_params = {}
        keys_to_delete = []

        # Detached snapshot of every tensor in the state dict.
        for name, tensor in model.state_dict().items():
            self.extracted_params[name] = tensor.detach().clone()

        # Apply each OFT rotation to its base attention projection weight.
        for name in self.extracted_params:
            if 'attn' in name and 'processor' not in name:
                if 'weight' in name:
                    if 'to_q' in name:
                        oft_R = name.replace('to_q.weight', 'processor.to_q_oft.R')
                    elif 'to_k' in name:
                        oft_R = name.replace('to_k.weight', 'processor.to_k_oft.R')
                    elif 'to_v' in name:
                        oft_R = name.replace('to_v.weight', 'processor.to_v_oft.R')
                    elif 'to_out' in name:
                        oft_R = name.replace('to_out.0.weight', 'processor.to_out_oft.R')
                    else:
                        # BUGFIX: an attention weight with no OFT factor
                        # (e.g. add_k_proj/add_v_proj). The previous `pass`
                        # fell through and reused oft_R left over from an
                        # earlier iteration (or raised NameError on the first
                        # such weight). Skip it instead.
                        continue

                    R = self.extracted_params[oft_R].cuda()

                    with torch.no_grad():
                        if len(R.shape) == 2:
                            # Single shared block: clamp R into the eps-ball,
                            # then turn it into an orthogonal matrix.
                            self.eps = eps * R.shape[0] * R.shape[0]
                            R.copy_(project(R, eps=self.eps))
                            orth_rotate = self.cayley(R)
                        else:
                            # One block per batch entry.
                            self.eps = eps * R.shape[1] * R.shape[0]
                            R.copy_(project_batch(R, eps=self.eps))
                            orth_rotate = self.cayley_batch(R)

                    # W <- W @ blockdiag(Q). Assumes the base weight lives on
                    # the same device as the rotation — TODO confirm.
                    self.extracted_params[name] = self.extracted_params[name] @ self.block_diagonal(orth_rotate)
                    keys_to_delete.append(oft_R)

        # Drop the consumed OFT factors so only effective weights remain.
        for key in keys_to_delete:
            del self.extracted_params[key]

    def is_orthogonal(self, R, eps=1e-5):
        """True iff R^T R is the identity to within `eps` (elementwise)."""
        # NOTE: this method was previously defined twice with identical
        # bodies; the duplicate has been removed.
        with torch.no_grad():
            RtR = torch.matmul(R.t(), R)
            diff = torch.abs(RtR - torch.eye(R.shape[1], dtype=R.dtype, device=R.device))
            return torch.all(diff < eps)

    def block_diagonal(self, R):
        """Build the full block-diagonal rotation from R.

        A 2-D R is tiled `self.r` times; a 3-D R contributes one block per
        leading-dimension slice.
        """
        if len(R.shape) == 2:
            # Create a list of R repeated block_count times
            blocks = [R] * self.r
        else:
            # Create a list of R slices along the third dimension
            blocks = [R[i, ...] for i in range(R.shape[0])]

        # Use torch.block_diag to create the block diagonal matrix
        A = torch.block_diag(*blocks)

        return A

    def copy_without_grad(self, model):
        """Deep-copy `model` with every parameter detached and frozen."""
        copied_model = copy.deepcopy(model)
        for param in copied_model.parameters():
            param.requires_grad = False
            param.detach_()
        return copied_model

    def cayley(self, data):
        """Cayley map: skew-symmetrize `data`, return (I + S)(I - S)^-1 (orthogonal)."""
        r, c = list(data.shape)
        # Ensure the input matrix is skew-symmetric
        skew = 0.5 * (data - data.t())
        I = torch.eye(r, device=data.device)
        # Perform the Cayley parametrization
        Q = torch.mm(I + skew, torch.inverse(I - skew))
        return Q

    def cayley_batch(self, data):
        """Batched Cayley map over a (b, r, r) stack of matrices."""
        b, r, c = data.shape
        # Ensure the input matrix is skew-symmetric
        skew = 0.5 * (data - data.transpose(1, 2))
        # I = torch.eye(r, device=data.device).unsqueeze(0).repeat(b, 1, 1)
        I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c)

        # Perform the Cayley parametrization
        Q = torch.bmm(I + skew, torch.inverse(I - skew))

        return Q

    @staticmethod
    def mhe_loss(filt):
        """Half-space MHE energy of a single weight tensor.

        2-D weights are treated as (out, in) matrices; 4-D conv filters are
        flattened to (out, in*kh*kw) first. Each filter column and its antipode
        are normalized, and the mean pairwise inverse chordal distance over all
        distinct pairs is returned (lower = filters more uniformly spread).

        NOTE(review): builds an intermediate on `.cuda()`; requires a GPU.
        """
        if len(filt.shape) == 2:
            n_filt, _ = filt.shape
            # Columns become filters; append antipodal copies (half-space MHE).
            filt = torch.transpose(filt, 0, 1)
            filt_neg = filt * (-1)
            filt = torch.cat((filt, filt_neg), dim=1)
            n_filt *= 2

            # Normalize by column norms (epsilon keeps the division finite).
            filt_norm = torch.sqrt(torch.sum(filt * filt, dim=0, keepdim=True) + 1e-4)
            norm_mat = torch.matmul(filt_norm.t(), filt_norm)
            inner_pro = torch.matmul(filt.t(), filt)
            inner_pro /= norm_mat

            # 1/dist energies; the added diagonal keeps self-terms finite, and
            # the tril subtraction counts each unordered pair once.
            cross_terms = (2.0 - 2.0 * inner_pro + torch.diag(torch.tensor([1.0] * n_filt)).cuda())
            final = torch.pow(cross_terms, torch.ones_like(cross_terms) * (-0.5))
            final -= torch.tril(final)
            cnt = n_filt * (n_filt - 1) / 2.0
            MHE_loss = 1 * torch.sum(final) / cnt

        else:
            # Conv filters: flatten each output filter before the same
            # computation as the 2-D branch.
            n_filt, _, _, _ = filt.shape
            filt = filt.reshape(n_filt, -1)
            filt = torch.transpose(filt, 0, 1)
            filt_neg = filt * -1
            filt = torch.cat((filt, filt_neg), dim=1)
            n_filt *= 2

            filt_norm = torch.sqrt(torch.sum(filt * filt, dim=0, keepdim=True) + 1e-4)
            norm_mat = torch.matmul(filt_norm.t(), filt_norm)
            inner_pro = torch.matmul(filt.t(), filt)
            inner_pro /= norm_mat

            cross_terms = (2.0 - 2.0 * inner_pro + torch.diag(torch.tensor([1.0] * n_filt)).cuda())
            final = torch.pow(cross_terms, torch.ones_like(cross_terms) * (-0.5))
            final -= torch.tril(final)
            cnt = n_filt * (n_filt - 1) / 2.0
            MHE_loss = 1 * torch.sum(final) / cnt

        return MHE_loss

    def calculate_mhe(self):
        """Sum of MHE energies over all 2-D (linear) and 4-D (conv) weights."""
        mhe_loss = []
        with torch.no_grad():
            for name in self.extracted_params:
                weight = self.extracted_params[name]
                # linear layer or conv layer
                if len(weight.shape) == 2 or len(weight.shape) == 4:
                    loss = self.mhe_loss(weight)
                    mhe_loss.append(loss.cpu().detach().item())
        mhe_loss = np.array(mhe_loss)
        return mhe_loss.sum()

    def is_identity_matrix(self, tensor):
        """True iff `tensor` is a square 2-D tensor exactly equal to I."""
        if not torch.is_tensor(tensor):
            raise TypeError("Input must be a PyTorch tensor.")
        if tensor.ndim != 2 or tensor.shape[0] != tensor.shape[1]:
            return False
        identity = torch.eye(tensor.shape[0], device=tensor.device)
        return torch.all(torch.eq(tensor, identity))
292
+
293
+
294
+
295
class MHE_db:
    """Minimum hyperspherical energy (MHE) statistics for a plain (fully
    fine-tuned) model snapshot — no adapter factors to merge."""

    def __init__(self, model):
        # Detached copy of every tensor in the model's state dict.
        self.extracted_params = {
            name: tensor.detach().clone() for name, tensor in model.state_dict().items()
        }

    @staticmethod
    def mhe_loss(filt):
        """Half-space MHE energy of one weight matrix / conv filter bank.

        NOTE(review): allocates an intermediate via `.cuda()`, so a CUDA
        device is required whenever this is reached.
        """
        if len(filt.shape) == 2:
            n_filt = filt.shape[0]
        else:
            # Conv filters: flatten each output filter to a row first.
            n_filt, _, _, _ = filt.shape
            filt = filt.reshape(n_filt, -1)

        # Columns become filters; append the antipodal copies (half-space MHE).
        filt = torch.transpose(filt, 0, 1)
        filt = torch.cat((filt, filt * (-1)), dim=1)
        n_filt *= 2

        # Cosine-similarity matrix (epsilon keeps the norms finite).
        filt_norm = torch.sqrt(torch.sum(filt * filt, dim=0, keepdim=True) + 1e-4)
        norm_mat = torch.matmul(filt_norm.t(), filt_norm)
        inner_pro = torch.matmul(filt.t(), filt)
        inner_pro /= norm_mat

        # Pairwise 1/dist energies; the added diagonal keeps self-terms
        # finite, and the tril subtraction counts each unordered pair once.
        cross_terms = 2.0 - 2.0 * inner_pro + torch.diag(torch.tensor([1.0] * n_filt)).cuda()
        energy = torch.pow(cross_terms, torch.ones_like(cross_terms) * (-0.5))
        energy -= torch.tril(energy)
        pair_count = n_filt * (n_filt - 1) / 2.0
        return 1 * torch.sum(energy) / pair_count

    def calculate_mhe(self):
        """Sum of MHE energies over all 2-D (linear) and 4-D (conv) weights."""
        per_layer = []
        with torch.no_grad():
            for weight in self.extracted_params.values():
                if weight.dim() in (2, 4):
                    per_layer.append(self.mhe_loss(weight).cpu().detach().item())
        return np.array(per_layer).sum()
generation/subject/train_dreambooth_hra.py ADDED
@@ -0,0 +1,1123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+
16
+ import argparse
17
+ import hashlib
18
+ import logging
19
+ import math
20
+ import os
21
+ import warnings
22
+ from pathlib import Path
23
+
24
+ import numpy as np
25
+ import torch
26
+ import torch.nn.functional as F
27
+ import torch.utils.checkpoint
28
+ import transformers
29
+ from accelerate import Accelerator
30
+ from accelerate.logging import get_logger
31
+ from accelerate.utils import ProjectConfiguration, set_seed
32
+ from huggingface_hub import create_repo, upload_folder
33
+ from packaging import version
34
+ from PIL import Image
35
+ from torch.utils.data import Dataset
36
+ from torchvision import transforms
37
+ from tqdm.auto import tqdm
38
+ from transformers import AutoTokenizer, PretrainedConfig
39
+
40
+ import diffusers
41
+ from diffusers import (
42
+ AutoencoderKL,
43
+ DDPMScheduler,
44
+ DiffusionPipeline,
45
+ DPMSolverMultistepScheduler,
46
+ UNet2DConditionModel,
47
+ )
48
+ from diffusers.loaders import AttnProcsLayers
49
+ from oft_utils.attention_processor import HRAAttnProcessor
50
+ from diffusers.optimization import get_scheduler
51
+ from diffusers.utils import check_min_version, is_wandb_available
52
+ from diffusers.utils.import_utils import is_xformers_available
53
+ from oft_utils.mhe import MHE_OFT as MHE
54
+
55
+
56
+ # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
57
+ check_min_version("0.16.0.dev0")
58
+
59
+ logger = get_logger(__name__)
60
+
61
+
62
def save_model_card(repo_id: str, images=None, base_model: str = "", prompt: str = "", repo_folder=None):
    """Write a Hub model card (README.md) plus sample images into `repo_folder`.

    Args:
        repo_id: Hub repository id rendered into the card title.
        images: Optional list of PIL-style images (each must support
            ``.save(path)``); saved as ``image_<i>.png`` and linked in the card.
        base_model: Identifier of the base model the weights adapt.
        prompt: Instance prompt the weights were trained on.
        repo_folder: Destination directory (must already exist).

    BUGFIX: the defaults were ``base_model=str, prompt=str`` — binding the
    *type object* ``str``, so omitting them rendered ``<class 'str'>`` into
    the card. They now default to empty strings. ``images=None`` also crashed
    ``enumerate(None)``; it is now treated as an empty list.
    """
    img_str = ""
    for i, image in enumerate(images or []):
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

    # YAML front matter consumed by the Hugging Face Hub.
    yaml = f"""
---
license: creativeml-openrail-m
base_model: {base_model}
instance_prompt: {prompt}
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- oft
inference: true
---
"""
    model_card = f"""
# OFT DreamBooth - {repo_id}

These are OFT adaption weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/). You can find some example images in the following. \n
{img_str}
"""
    with open(os.path.join(repo_folder, "README.md"), "w") as f:
        f.write(yaml + model_card)
90
+
91
+
92
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
    """Resolve the text-encoder class used by a pretrained pipeline.

    Reads the ``text_encoder`` sub-config of the pipeline and maps its
    declared architecture name to the corresponding model class, importing
    it lazily. Raises ValueError for unsupported architectures.
    """
    config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    architecture = config.architectures[0]

    if architecture == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel

    if architecture == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation

        return RobertaSeriesModelWithTransformation

    raise ValueError(f"{architecture} is not supported.")
110
+
111
+
112
def parse_args(input_args=None):
    """Parse command-line arguments for HRA DreamBooth training.

    Args:
        input_args: Optional explicit argv list; when None, sys.argv is used.

    Returns:
        The parsed argparse.Namespace, with `local_rank` reconciled against the
        LOCAL_RANK environment variable and prior-preservation options validated.

    Raises:
        ValueError: if prior preservation is enabled but class data dir/prompt
        is missing.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default='runwayml/stable-diffusion-v1-5',
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default='../data/dreambooth/backpack',
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default='data/class_data/backpack',
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default='a photo of qwe backpack',
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default='a photo of backpack',
        help="The prompt to specify images in the same class as provided instance images.",
    )
    parser.add_argument(
        "--validation_prompt",
        type=str,
        default='a qwe backpack in the jungle',
        help="A prompt that is used during validation to verify that the model is learning.",
    )
    parser.add_argument(
        "--test_prompt",
        type=str,
        default=None,
        help="A prompt that is used during validation to verify that the model is keeps class prior.",
    )
    parser.add_argument(
        "--num_validation_images",
        type=int,
        default=4,
        help="Number of images that should be generated during validation with `validation_prompt`.",
    )
    parser.add_argument(
        "--validation_epochs",
        type=int,
        default=1,
        help=(
            "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
            " `args.validation_prompt` multiple times: `args.num_validation_images`."
        ),
    )
    # NOTE(review): default=True combined with action="store_true" means this
    # flag is effectively always True and cannot be disabled from the CLI; the
    # "without prior preservation" warning branch below is unreachable. Confirm
    # whether an opt-out (e.g. BooleanOptionalAction) was intended.
    parser.add_argument(
        "--with_prior_preservation",
        default=True,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument(
        "--prior_loss_weight",
        type=float,
        default=1.0,
        help="The weight of prior preservation loss."
    )
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=200,
        help=(
            "Minimal class images for prior preservation loss. If there are not enough images already present in"
            " class_data_dir, additional images will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="log_hra/backpack-0",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="A seed for reproducible training."
    )
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop",
        default=False,
        action="store_true",
        help=(
            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
            " cropped. The images will be resized to the resolution first before cropping."
        ),
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader."
    )
    parser.add_argument(
        "--sample_batch_size",
        type=int,
        default=4,
        help="Batch size (per device) for sampling images.",
    )
    parser.add_argument(
        "--num_train_epochs",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=2005,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--checkpointing_steps",
        type=int,
        default=5000,
        help=(
            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
            " training using `--resume_from_checkpoint`."
        ),
    )
    parser.add_argument(
        "--checkpoints_total_limit",
        type=int,
        default=None,
        help=(
            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
            " for more docs"
        ),
    )
    parser.add_argument(
        "--resume_from_checkpoint",
        type=str,
        default=None,
        help=(
            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
        ),
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        default=False,
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=6e-05,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps",
        type=int,
        default=0,
        help="Number of steps for the warmup in the lr scheduler."
    )
    parser.add_argument(
        "--lr_num_cycles",
        type=int,
        default=1,
        help="Number of hard resets of the lr in cosine_with_restarts scheduler.",
    )
    parser.add_argument(
        "--lr_power",
        type=float,
        default=1.0,
        help="Power factor of the polynomial scheduler.",
    )
    parser.add_argument(
        "--dataloader_num_workers",
        type=int,
        default=0,
        help=(
            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
        ),
    )
    parser.add_argument(
        "--use_8bit_adam",
        action="store_true",
        help="Whether or not to use 8-bit Adam from bitsandbytes."
    )
    # Optimizer hyperparameters (standard AdamW defaults).
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
    parser.add_argument(
        "--hub_model_id",
        type=str,
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="wandb",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--prior_generation_precision",
        type=str,
        default=None,
        choices=["no", "fp32", "fp16", "bf16"],
        help=(
            "Choose prior generation precision between fp32, fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.and an Nvidia Ampere GPU. Default to fp16 if a GPU is available else fp32."
        ),
    )
    # NOTE(review): local_rank defaults to 6 here; distributed launchers
    # conventionally use -1 ("not distributed"). The LOCAL_RANK env override
    # below only fires when the env var is set, so confirm this default.
    parser.add_argument("--local_rank", type=int, default=6, help="For distributed training: local_rank")
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
    parser.add_argument(
        "--name",
        type=str,
        default='backpack-0',
        help=(
            "The name of the current experiment run, consists of [data]-[prompt]"
        ),
    )
    parser.add_argument(
        "--hra_r",
        type=int,
        default=8,
        help=(
            "The rank of HRA across different layers. It is best to set 'r' to an even number; otherwise, the default initialization method will not work."
        ),
    )
    parser.add_argument(
        "--hra_apply_GS",
        action='store_true',
        default=False,
        help=(
            "Whether to apply Gram-Schmidt orthogonalization."
        ),
    )
    if input_args is not None:
        args = parser.parse_args(input_args)
    else:
        args = parser.parse_args()

    # Let a launcher-provided LOCAL_RANK env var win over the CLI value.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    # Validate the prior-preservation option combination.
    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")
    else:
        # logger is not available yet
        if args.class_data_dir is not None:
            warnings.warn("You need not use --class_data_dir without --with_prior_preservation.")
        if args.class_prompt is not None:
            warnings.warn("You need not use --class_prompt without --with_prior_preservation.")

    return args
+
461
+
462
class DreamBoothDataset(Dataset):
    """
    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
    It pre-processes the images and the tokenizes prompts.
    """

    def __init__(
        self,
        instance_data_root,
        instance_prompt,
        tokenizer,
        class_data_root=None,
        class_prompt=None,
        class_num=None,
        size=512,
        center_crop=False,
    ):
        self.size = size
        self.center_crop = center_crop
        self.tokenizer = tokenizer

        self.instance_data_root = Path(instance_data_root)
        if not self.instance_data_root.exists():
            raise ValueError("Instance images root doesn't exists.")

        self.instance_images_path = list(Path(instance_data_root).iterdir())
        self.num_instance_images = len(self.instance_images_path)
        self.instance_prompt = instance_prompt
        self._length = self.num_instance_images

        if class_data_root is None:
            self.class_data_root = None
        else:
            # Prior-preservation images: cap at class_num when given, and
            # stretch the epoch length to cover whichever set is larger.
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
            self.class_images_path = list(self.class_data_root.iterdir())
            available = len(self.class_images_path)
            self.num_class_images = available if class_num is None else min(available, class_num)
            self._length = max(self.num_class_images, self.num_instance_images)
            self.class_prompt = class_prompt

        # Resize -> crop -> tensor -> normalize to [-1, 1].
        crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
        self.image_transforms = transforms.Compose(
            [
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                crop,
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

    def __len__(self):
        return self._length

    def _tokenize(self, prompt):
        # Pad/truncate the prompt to the tokenizer's max length; returns ids.
        return self.tokenizer(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

    def _load_rgb(self, path):
        # Open an image and force 3-channel RGB.
        img = Image.open(path)
        return img if img.mode == "RGB" else img.convert("RGB")

    def __getitem__(self, index):
        # Index wraps around each image set, so instance and class images can
        # have different counts while sharing one dataset length.
        example = {}
        instance = self._load_rgb(self.instance_images_path[index % self.num_instance_images])
        example["instance_images"] = self.image_transforms(instance)
        example["instance_prompt_ids"] = self._tokenize(self.instance_prompt)

        if self.class_data_root:
            klass = self._load_rgb(self.class_images_path[index % self.num_class_images])
            example["class_images"] = self.image_transforms(klass)
            example["class_prompt_ids"] = self._tokenize(self.class_prompt)

        return example
+
546
+
547
def collate_fn(examples, with_prior_preservation=False):
    """Stack a list of dataset examples into a single training batch.

    With prior preservation enabled, the class examples are appended after the
    instance examples so both losses can be computed in one forward pass.
    """
    input_ids = [e["instance_prompt_ids"] for e in examples]
    pixel_values = [e["instance_images"] for e in examples]

    if with_prior_preservation:
        input_ids.extend(e["class_prompt_ids"] for e in examples)
        pixel_values.extend(e["class_images"] for e in examples)

    stacked_pixels = torch.stack(pixel_values)
    stacked_pixels = stacked_pixels.to(memory_format=torch.contiguous_format).float()

    return {
        "input_ids": torch.cat(input_ids, dim=0),
        "pixel_values": stacked_pixels,
    }
+
568
+
569
class PromptDataset(Dataset):
    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        # Every item repeats the same prompt; the index lets each generated
        # image be uniquely named downstream.
        return {"prompt": self.prompt, "index": index}
+
585
+
586
+ def main(args):
587
+ logging_dir = Path(args.output_dir, args.logging_dir)
588
+
589
+ accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) # total_limit=args.checkpoints_total_limit)
590
+
591
+ wandb_init = {
592
+ "wandb": {
593
+ "name": args.name,
594
+ # "project": args.project,
595
+ }
596
+ }
597
+
598
+ accelerator = Accelerator(
599
+ gradient_accumulation_steps=args.gradient_accumulation_steps,
600
+ mixed_precision=args.mixed_precision,
601
+ log_with=args.report_to,
602
+ project_config=accelerator_project_config,
603
+ )
604
+
605
+ if args.report_to == "wandb":
606
+ if not is_wandb_available():
607
+ raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
608
+ import wandb
609
+
610
+ # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
611
+ # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
612
+ # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
613
+ # Make one log on every process with the configuration for debugging.
614
+ logging.basicConfig(
615
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
616
+ datefmt="%m/%d/%Y %H:%M:%S",
617
+ level=logging.INFO,
618
+ )
619
+ logger.info(accelerator.state, main_process_only=False)
620
+ if accelerator.is_local_main_process:
621
+ transformers.utils.logging.set_verbosity_warning()
622
+ diffusers.utils.logging.set_verbosity_info()
623
+ else:
624
+ transformers.utils.logging.set_verbosity_error()
625
+ diffusers.utils.logging.set_verbosity_error()
626
+
627
+ # If passed along, set the training seed now.
628
+ if args.seed is not None:
629
+ set_seed(args.seed)
630
+
631
+ # Generate class images if prior preservation is enabled.
632
+ if args.with_prior_preservation:
633
+ class_images_dir = Path(args.class_data_dir)
634
+ if not class_images_dir.exists():
635
+ class_images_dir.mkdir(parents=True)
636
+ cur_class_images = len(list(class_images_dir.iterdir()))
637
+
638
+ if cur_class_images < args.num_class_images:
639
+ torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
640
+ if args.prior_generation_precision == "fp32":
641
+ torch_dtype = torch.float32
642
+ elif args.prior_generation_precision == "fp16":
643
+ torch_dtype = torch.float16
644
+ elif args.prior_generation_precision == "bf16":
645
+ torch_dtype = torch.bfloat16
646
+ pipeline = DiffusionPipeline.from_pretrained(
647
+ args.pretrained_model_name_or_path,
648
+ torch_dtype=torch_dtype,
649
+ safety_checker=None,
650
+ revision=args.revision,
651
+ )
652
+ pipeline.set_progress_bar_config(disable=True)
653
+
654
+ num_new_images = args.num_class_images - cur_class_images
655
+ logger.info(f"Number of class images to sample: {num_new_images}.")
656
+
657
+ sample_dataset = PromptDataset(args.class_prompt, num_new_images)
658
+ sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)
659
+
660
+ sample_dataloader = accelerator.prepare(sample_dataloader)
661
+ pipeline.to(accelerator.device)
662
+
663
+ for example in tqdm(
664
+ sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
665
+ ):
666
+ images = pipeline(example["prompt"]).images
667
+
668
+ for i, image in enumerate(images):
669
+ hash_image = hashlib.sha1(image.tobytes()).hexdigest()
670
+ image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
671
+ image.save(image_filename)
672
+
673
+ del pipeline
674
+ if torch.cuda.is_available():
675
+ torch.cuda.empty_cache()
676
+
677
+ # Handle the repository creation
678
+ if accelerator.is_main_process:
679
+ if args.output_dir is not None:
680
+ os.makedirs(args.output_dir, exist_ok=True)
681
+
682
+ if args.push_to_hub:
683
+ repo_id = create_repo(
684
+ repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
685
+ ).repo_id
686
+
687
+ # Load the tokenizer
688
+ if args.tokenizer_name:
689
+ tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False)
690
+ elif args.pretrained_model_name_or_path:
691
+ tokenizer = AutoTokenizer.from_pretrained(
692
+ args.pretrained_model_name_or_path,
693
+ subfolder="tokenizer",
694
+ revision=args.revision,
695
+ use_fast=False,
696
+ )
697
+
698
+ # import correct text encoder class
699
+ text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
700
+
701
+ # Load scheduler and models
702
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
703
+ text_encoder = text_encoder_cls.from_pretrained(
704
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
705
+ )
706
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
707
+ unet = UNet2DConditionModel.from_pretrained(
708
+ args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
709
+ )
710
+
711
+ # We only train the additional adapter OFT layers
712
+ vae.requires_grad_(False)
713
+ text_encoder.requires_grad_(False)
714
+ unet.requires_grad_(False)
715
+
716
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
717
+ # as these models are only used for inference, keeping weights in full precision is not required.
718
+ weight_dtype = torch.float32
719
+ if accelerator.mixed_precision == "fp16":
720
+ weight_dtype = torch.float16
721
+ elif accelerator.mixed_precision == "bf16":
722
+ weight_dtype = torch.bfloat16
723
+
724
+ # Move unet, vae and text_encoder to device and cast to weight_dtype
725
+ unet.to(accelerator.device, dtype=weight_dtype)
726
+ vae.to(accelerator.device, dtype=weight_dtype)
727
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
728
+
729
+ if args.enable_xformers_memory_efficient_attention:
730
+ if is_xformers_available():
731
+ import xformers
732
+
733
+ xformers_version = version.parse(xformers.__version__)
734
+ if xformers_version == version.parse("0.0.16"):
735
+ logger.warn(
736
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
737
+ )
738
+ unet.enable_xformers_memory_efficient_attention()
739
+ else:
740
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
741
+
742
+ # now we will add new COT weights to the attention layers
743
+ # It's important to realize here how many attention weights will be added and of which sizes
744
+ # The sizes of the attention layers consist only of two different variables:
745
+ # 1) - the "hidden_size", which is increased according to `unet.config.block_out_channels`.
746
+ # 2) - the "cross attention size", which is set to `unet.config.cross_attention_dim`.
747
+
748
+ # Let's first see how many attention processors we will have to set.
749
+ # For Stable Diffusion, it should be equal to:
750
+ # - down blocks (2x attention layers) * (2x transformer layers) * (3x down blocks) = 12
751
+ # - mid blocks (2x attention layers) * (1x transformer layers) * (1x mid blocks) = 2
752
+ # - up blocks (2x attention layers) * (3x transformer layers) * (3x down blocks) = 18
753
+ # => 32 layers
754
+
755
+ # Set correct oft layers
756
+ oft_attn_procs = {}
757
+ for name in unet.attn_processors.keys():
758
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
759
+ if name.startswith("mid_block"):
760
+ hidden_size = unet.config.block_out_channels[-1]
761
+ elif name.startswith("up_blocks"):
762
+ block_id = int(name[len("up_blocks.")])
763
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
764
+ elif name.startswith("down_blocks"):
765
+ block_id = int(name[len("down_blocks.")])
766
+ hidden_size = unet.config.block_out_channels[block_id]
767
+
768
+ oft_attn_procs[name] = HRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, r=args.hra_r, apply_GS=args.hra_apply_GS)
769
+
770
+ unet.set_attn_processor(oft_attn_procs)
771
+ print(f'Total parameters requiring grad: {sum([p.numel() for p in unet.parameters() if p.requires_grad == True])}')
772
+
773
+ oft_layers = AttnProcsLayers(unet.attn_processors)
774
+
775
+ accelerator.register_for_checkpointing(oft_layers)
776
+
777
+ # Enable TF32 for faster training on Ampere GPUs,
778
+ # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
779
+ if args.allow_tf32:
780
+ torch.backends.cuda.matmul.allow_tf32 = True
781
+
782
+ if args.scale_lr:
783
+ args.learning_rate = (
784
+ args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
785
+ )
786
+
787
+ # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
788
+ if args.use_8bit_adam:
789
+ try:
790
+ import bitsandbytes as bnb
791
+ except ImportError:
792
+ raise ImportError(
793
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
794
+ )
795
+
796
+ optimizer_class = bnb.optim.AdamW8bit
797
+ else:
798
+ optimizer_class = torch.optim.AdamW
799
+
800
+ # Optimizer creation
801
+ optimizer = optimizer_class(
802
+ oft_layers.parameters(),
803
+ lr=args.learning_rate,
804
+ betas=(args.adam_beta1, args.adam_beta2),
805
+ weight_decay=args.adam_weight_decay,
806
+ eps=args.adam_epsilon,
807
+ )
808
+
809
+ # Dataset and DataLoaders creation:
810
+ train_dataset = DreamBoothDataset(
811
+ instance_data_root=args.instance_data_dir,
812
+ instance_prompt=args.instance_prompt,
813
+ class_data_root=args.class_data_dir if args.with_prior_preservation else None,
814
+ class_prompt=args.class_prompt,
815
+ class_num=args.num_class_images,
816
+ tokenizer=tokenizer,
817
+ size=args.resolution,
818
+ center_crop=args.center_crop,
819
+ )
820
+
821
+ train_dataloader = torch.utils.data.DataLoader(
822
+ train_dataset,
823
+ batch_size=args.train_batch_size,
824
+ shuffle=True,
825
+ collate_fn=lambda examples: collate_fn(examples, args.with_prior_preservation),
826
+ num_workers=args.dataloader_num_workers,
827
+ )
828
+
829
+ # Scheduler and math around the number of training steps.
830
+ overrode_max_train_steps = False
831
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
832
+ if args.max_train_steps is None:
833
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
834
+ overrode_max_train_steps = True
835
+
836
+ lr_scheduler = get_scheduler(
837
+ args.lr_scheduler,
838
+ optimizer=optimizer,
839
+ num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
840
+ num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
841
+ num_cycles=args.lr_num_cycles,
842
+ power=args.lr_power,
843
+ )
844
+
845
+ # Prepare everything with our `accelerator`.
846
+ oft_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
847
+ oft_layers, optimizer, train_dataloader, lr_scheduler
848
+ )
849
+
850
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
851
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
852
+ if overrode_max_train_steps:
853
+ args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
854
+ # Afterwards we recalculate our number of training epochs
855
+ args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
856
+
857
+ # We need to initialize the trackers we use, and also store our configuration.
858
+ # The trackers initializes automatically on the main process.
859
+ if accelerator.is_main_process:
860
+ accelerator.init_trackers("dreambooth-oft", config=vars(args), init_kwargs=wandb_init)
861
+
862
+ # Train!
863
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
864
+
865
+ logger.info("***** Running training *****")
866
+ logger.info(f" Num examples = {len(train_dataset)}")
867
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
868
+ logger.info(f" Num Epochs = {args.num_train_epochs}")
869
+ logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
870
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
871
+ logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
872
+ logger.info(f" Total optimization steps = {args.max_train_steps}")
873
+ global_step = 0
874
+ first_epoch = 0
875
+
876
+ # Potentially load in the weights and states from a previous save
877
+ if args.resume_from_checkpoint:
878
+ if args.resume_from_checkpoint != "latest":
879
+ path = os.path.basename(args.resume_from_checkpoint)
880
+ else:
881
+ # Get the mos recent checkpoint
882
+ dirs = os.listdir(args.output_dir)
883
+ dirs = [d for d in dirs if d.startswith("checkpoint")]
884
+ dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
885
+ path = dirs[-1] if len(dirs) > 0 else None
886
+
887
+ if path is None:
888
+ accelerator.print(
889
+ f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
890
+ )
891
+ args.resume_from_checkpoint = None
892
+ else:
893
+ accelerator.print(f"Resuming from checkpoint {path}")
894
+ accelerator.load_state(os.path.join(args.output_dir, path))
895
+ global_step = int(path.split("-")[1])
896
+
897
+ resume_global_step = global_step * args.gradient_accumulation_steps
898
+ first_epoch = global_step // num_update_steps_per_epoch
899
+ resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps)
900
+
901
+ # Only show the progress bar once on each machine.
902
+ progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
903
+ progress_bar.set_description("Steps")
904
+
905
+ # calculate the hyperspherical energy fine-tuning
906
+ # mhe = MHE(unet, eps=args.eps, r=args.r)
907
+ # mhe_loss = mhe.calculate_mhe()
908
+ # accelerator.log({"mhe_loss": mhe_loss}, step=0)
909
+ accelerator.log({"hra_r": args.hra_r}, step=0)
910
+ accelerator.log({"hra_apply_GS": args.hra_apply_GS}, step=0)
911
+ # accelerator.log({"COFT": 1 if args.coft else 0}, step=0)
912
+
913
+ for epoch in range(first_epoch, args.num_train_epochs):
914
+ unet.train()
915
+ for step, batch in enumerate(train_dataloader):
916
+ # Skip steps until we reach the resumed step
917
+ if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
918
+ if step % args.gradient_accumulation_steps == 0:
919
+ progress_bar.update(1)
920
+ continue
921
+
922
+ with accelerator.accumulate(unet):
923
+ # Convert images to latent space
924
+ latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample()
925
+ latents = latents * vae.config.scaling_factor
926
+
927
+ # Sample noise that we'll add to the latents
928
+ noise = torch.randn_like(latents)
929
+ bsz = latents.shape[0]
930
+ # Sample a random timestep for each image
931
+ timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
932
+ timesteps = timesteps.long()
933
+
934
+ # Add noise to the latents according to the noise magnitude at each timestep
935
+ # (this is the forward diffusion process)
936
+ noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
937
+
938
+ # Get the text embedding for conditioning
939
+ encoder_hidden_states = text_encoder(batch["input_ids"])[0]
940
+
941
+ # Predict the noise residual
942
+ model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
943
+
944
+ # Get the target for loss depending on the prediction type
945
+ if noise_scheduler.config.prediction_type == "epsilon":
946
+ target = noise
947
+ elif noise_scheduler.config.prediction_type == "v_prediction":
948
+ target = noise_scheduler.get_velocity(latents, noise, timesteps)
949
+ else:
950
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
951
+
952
+ if args.with_prior_preservation:
953
+ # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
954
+ model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
955
+ target, target_prior = torch.chunk(target, 2, dim=0)
956
+
957
+ # Compute instance loss
958
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
959
+
960
+ # Compute prior loss
961
+ prior_loss = F.mse_loss(model_pred_prior.float(), target_prior.float(), reduction="mean")
962
+
963
+ # Add the prior loss to the instance loss.
964
+ loss = loss + args.prior_loss_weight * prior_loss
965
+ else:
966
+ loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
967
+
968
+ # --------------------------------------------------------
969
+ # orthogonality regularizer
970
+ # for name, param in unet.named_parameters():
971
+ # if 'hra_u' in name:
972
+ # device = param.device
973
+ # hra_u_norm = param / (param.norm(dim=0))
974
+ # orth_loss = torch.norm(torch.eye(8, device=device) - hra_u_norm.t() @ hra_u_norm)
975
+ # loss = loss + 1e-5 * orth_loss
976
+ # --------------------------------------------------------
977
+
978
+ accelerator.backward(loss)
979
+ if accelerator.sync_gradients:
980
+ params_to_clip = oft_layers.parameters()
981
+ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
982
+ optimizer.step()
983
+ lr_scheduler.step()
984
+ optimizer.zero_grad()
985
+
986
+ # Checks if the accelerator has performed an optimization step behind the scenes
987
+ if accelerator.sync_gradients:
988
+ progress_bar.update(1)
989
+ global_step += 1
990
+
991
+ if global_step % args.checkpointing_steps == 0:
992
+ if accelerator.is_main_process:
993
+ save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
994
+ accelerator.save_state(save_path)
995
+ logger.info(f"Saved state to {save_path}")
996
+
997
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
998
+ progress_bar.set_postfix(**logs)
999
+ accelerator.log(logs, step=global_step)
1000
+
1001
+ if global_step >= args.max_train_steps:
1002
+ break
1003
+
1004
+ if accelerator.is_main_process:
1005
+ if args.validation_prompt is not None and epoch % args.validation_epochs == 0: # and epoch > 1:
1006
+ logger.info(
1007
+ f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
1008
+ f" {args.validation_prompt}."
1009
+ )
1010
+
1011
+ # mhe = MHE(unet, eps=args.eps, r=args.r)
1012
+ # mhe_loss = mhe.calculate_mhe()
1013
+ # accelerator.log({"mhe_loss": mhe_loss}, step=global_step)
1014
+
1015
+ # create pipeline
1016
+ pipeline = DiffusionPipeline.from_pretrained(
1017
+ args.pretrained_model_name_or_path,
1018
+ unet=accelerator.unwrap_model(unet),
1019
+ text_encoder=accelerator.unwrap_model(text_encoder),
1020
+ revision=args.revision,
1021
+ torch_dtype=weight_dtype,
1022
+ )
1023
+ pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
1024
+ pipeline = pipeline.to(accelerator.device)
1025
+ pipeline.set_progress_bar_config(disable=True)
1026
+
1027
+ # run inference
1028
+ generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)
1029
+ images = [
1030
+ pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
1031
+ for _ in range(args.num_validation_images)
1032
+ ]
1033
+
1034
+ for tracker in accelerator.trackers:
1035
+ if tracker.name == "tensorboard":
1036
+ np_images = np.stack([np.asarray(img) for img in images])
1037
+ tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC")
1038
+ if tracker.name == "wandb":
1039
+ tracker.log(
1040
+ {
1041
+ "validation": [
1042
+ wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
1043
+ for i, image in enumerate(images)
1044
+ ]
1045
+ }
1046
+ )
1047
+
1048
+ # Create the output directory if it doesn't exist
1049
+ tmp_dir = os.path.join(args.output_dir, str(epoch))
1050
+ if not os.path.exists(tmp_dir):
1051
+ os.makedirs(tmp_dir)
1052
+
1053
+ for i, image in enumerate(images):
1054
+ np_image = np.array(image)
1055
+ pil_image = Image.fromarray(np_image)
1056
+ pil_image.save(os.path.join(args.output_dir, str(epoch), f"image_{i}.png"))
1057
+
1058
+ del pipeline
1059
+ torch.cuda.empty_cache()
1060
+
1061
+
1062
+ # Save the oft layers
1063
+ accelerator.wait_for_everyone()
1064
+ # if accelerator.is_main_process:
1065
+ # unet = unet.to(torch.float32)
1066
+ # unet.save_attn_procs(args.output_dir)
1067
+
1068
+ # # Final inference
1069
+ # # Load previous pipeline
1070
+ # pipeline = DiffusionPipeline.from_pretrained(
1071
+ # args.pretrained_model_name_or_path, revision=args.revision, torch_dtype=weight_dtype
1072
+ # )
1073
+ # pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
1074
+ # pipeline = pipeline.to(accelerator.device)
1075
+
1076
+ # # load attention processors
1077
+ # pipeline.unet.load_attn_procs(args.output_dir)
1078
+
1079
+ # # run inference
1080
+ # if args.validation_prompt and args.num_validation_images > 0:
1081
+ # generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
1082
+ # images = [
1083
+ # pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0]
1084
+ # for _ in range(args.num_validation_images)
1085
+ # ]
1086
+
1087
+ # for tracker in accelerator.trackers:
1088
+ # if tracker.name == "tensorboard":
1089
+ # np_images = np.stack([np.asarray(img) for img in images])
1090
+ # tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC")
1091
+ # if tracker.name == "wandb":
1092
+ # tracker.log(
1093
+ # {
1094
+ # "test": [
1095
+ # wandb.Image(image, caption=f"{i}: {args.validation_prompt}")
1096
+ # for i, image in enumerate(images)
1097
+ # ]
1098
+ # }
1099
+ # )
1100
+
1101
+ # if args.push_to_hub:
1102
+ # save_model_card(
1103
+ # repo_id,
1104
+ # images=images,
1105
+ # base_model=args.pretrained_model_name_or_path,
1106
+ # prompt=args.instance_prompt,
1107
+ # repo_folder=args.output_dir,
1108
+ # )
1109
+ # upload_folder(
1110
+ # repo_id=repo_id,
1111
+ # folder_path=args.output_dir,
1112
+ # commit_message="End of training",
1113
+ # ignore_patterns=["step_*", "epoch_*"],
1114
+ # )
1115
+
1116
+ accelerator.end_training()
1117
+
1118
+
1119
if __name__ == "__main__":
    args = parse_args()
    # Echo the parsed configuration so every run's settings are captured in the log.
    for key, value in vars(args).items():
        print(f"{key}: {value}")
    main(args)
generation/subject/train_dreambooth_hra.sh ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch one DreamBooth-HRA fine-tuning run for a single (subject, prompt) pair.
#
# Usage: bash train_dreambooth_hra.sh <prompt_idx> <class_idx>
#   prompt_idx — index (0-24) into the prompt lists selected below
#   class_idx  — index (0-29) into subject_names / class_tokens
#
# NOTE(review): no shebang line — invoke explicitly with `bash`.

prompt_idx=$1
class_idx=$2
# Fixed hyper-parameters for this sweep: learning rate and HRA rank.
lr=1e-4
hra_r=8

export MODEL_NAME="runwayml/stable-diffusion-v1-5"

# Define the unique_token, class_tokens, and subject_names.
# `unique_token` is the rare identifier token bound to the subject (DreamBooth-style).
unique_token="qwe"
# The 30 DreamBooth dataset subjects; index i pairs with class_tokens[i] below.
subject_names=(
    "backpack" "backpack_dog" "bear_plushie" "berry_bowl" "can"
    "candle" "cat" "cat2" "clock" "colorful_sneaker"
    "dog" "dog2" "dog3" "dog5" "dog6"
    "dog7" "dog8" "duck_toy" "fancy_boot" "grey_sloth_plushie"
    "monster_toy" "pink_sunglasses" "poop_emoji" "rc_car" "red_cartoon"
    "robot_toy" "shiny_sneaker" "teapot" "vase" "wolf_plushie"
)

# Coarse class noun for each subject (same indexing as subject_names).
class_tokens=(
    "backpack" "backpack" "stuffed animal" "bowl" "can"
    "candle" "cat" "cat" "clock" "sneaker"
    "dog" "dog" "dog" "dog" "dog"
    "dog" "dog" "toy" "boot" "stuffed animal"
    "toy" "glasses" "toy" "toy" "cartoon"
    "toy" "sneaker" "teapot" "vase" "stuffed animal"
)

echo "prompt_idx: $prompt_idx, class_idx: $class_idx"

class_token=${class_tokens[$class_idx]}
selected_subject=${subject_names[$class_idx]}

# The indices matched here appear to be the inanimate-object subjects: they get
# scene/placement prompts, while the remaining subjects (cats/dogs/cartoon) get
# outfit/accessory prompts in the else-branch. TODO(review): confirm the split.
if [[ $class_idx =~ ^(0|1|2|3|4|5|8|9|17|18|19|20|21|22|23|24|25|26|27|28|29)$ ]]; then
    # Prompts containing the unique token — used as the validation prompt during training.
    prompt_list=(
        "a ${unique_token} ${class_token} in the jungle"
        "a ${unique_token} ${class_token} in the snow"
        "a ${unique_token} ${class_token} on the beach"
        "a ${unique_token} ${class_token} on a cobblestone street"
        "a ${unique_token} ${class_token} on top of pink fabric"
        "a ${unique_token} ${class_token} on top of a wooden floor"
        "a ${unique_token} ${class_token} with a city in the background"
        "a ${unique_token} ${class_token} with a mountain in the background"
        "a ${unique_token} ${class_token} with a blue house in the background"
        "a ${unique_token} ${class_token} on top of a purple rug in a forest"
        "a ${unique_token} ${class_token} with a wheat field in the background"
        "a ${unique_token} ${class_token} with a tree and autumn leaves in the background"
        "a ${unique_token} ${class_token} with the Eiffel Tower in the background"
        "a ${unique_token} ${class_token} floating on top of water"
        "a ${unique_token} ${class_token} floating in an ocean of milk"
        "a ${unique_token} ${class_token} on top of green grass with sunflowers around it"
        "a ${unique_token} ${class_token} on top of a mirror"
        "a ${unique_token} ${class_token} on top of the sidewalk in a crowded street"
        "a ${unique_token} ${class_token} on top of a dirt road"
        "a ${unique_token} ${class_token} on top of a white rug"
        "a red ${unique_token} ${class_token}"
        "a purple ${unique_token} ${class_token}"
        "a shiny ${unique_token} ${class_token}"
        "a wet ${unique_token} ${class_token}"
        "a cube shaped ${unique_token} ${class_token}"
    )

    # Same prompts without the unique token (class-only baseline / test prompts).
    prompt_test_list=(
        "a ${class_token} in the jungle"
        "a ${class_token} in the snow"
        "a ${class_token} on the beach"
        "a ${class_token} on a cobblestone street"
        "a ${class_token} on top of pink fabric"
        "a ${class_token} on top of a wooden floor"
        "a ${class_token} with a city in the background"
        "a ${class_token} with a mountain in the background"
        "a ${class_token} with a blue house in the background"
        "a ${class_token} on top of a purple rug in a forest"
        "a ${class_token} with a wheat field in the background"
        "a ${class_token} with a tree and autumn leaves in the background"
        "a ${class_token} with the Eiffel Tower in the background"
        "a ${class_token} floating on top of water"
        "a ${class_token} floating in an ocean of milk"
        "a ${class_token} on top of green grass with sunflowers around it"
        "a ${class_token} on top of a mirror"
        "a ${class_token} on top of the sidewalk in a crowded street"
        "a ${class_token} on top of a dirt road"
        "a ${class_token} on top of a white rug"
        "a red ${class_token}"
        "a purple ${class_token}"
        "a shiny ${class_token}"
        "a wet ${class_token}"
        "a cube shaped ${class_token}"
    )

else
    # Live-subject / character prompts: first ten are shared scene prompts, the
    # rest are outfits and accessories that only make sense on animals/characters.
    prompt_list=(
        "a ${unique_token} ${class_token} in the jungle"
        "a ${unique_token} ${class_token} in the snow"
        "a ${unique_token} ${class_token} on the beach"
        "a ${unique_token} ${class_token} on a cobblestone street"
        "a ${unique_token} ${class_token} on top of pink fabric"
        "a ${unique_token} ${class_token} on top of a wooden floor"
        "a ${unique_token} ${class_token} with a city in the background"
        "a ${unique_token} ${class_token} with a mountain in the background"
        "a ${unique_token} ${class_token} with a blue house in the background"
        "a ${unique_token} ${class_token} on top of a purple rug in a forest"
        "a ${unique_token} ${class_token} wearing a red hat"
        "a ${unique_token} ${class_token} wearing a santa hat"
        "a ${unique_token} ${class_token} wearing a rainbow scarf"
        "a ${unique_token} ${class_token} wearing a black top hat and a monocle"
        "a ${unique_token} ${class_token} in a chef outfit"
        "a ${unique_token} ${class_token} in a firefighter outfit"
        "a ${unique_token} ${class_token} in a police outfit"
        "a ${unique_token} ${class_token} wearing pink glasses"
        "a ${unique_token} ${class_token} wearing a yellow shirt"
        "a ${unique_token} ${class_token} in a purple wizard outfit"
        "a red ${unique_token} ${class_token}"
        "a purple ${unique_token} ${class_token}"
        "a shiny ${unique_token} ${class_token}"
        "a wet ${unique_token} ${class_token}"
        "a cube shaped ${unique_token} ${class_token}"
    )

    prompt_test_list=(
        "a ${class_token} in the jungle"
        "a ${class_token} in the snow"
        "a ${class_token} on the beach"
        "a ${class_token} on a cobblestone street"
        "a ${class_token} on top of pink fabric"
        "a ${class_token} on top of a wooden floor"
        "a ${class_token} with a city in the background"
        "a ${class_token} with a mountain in the background"
        "a ${class_token} with a blue house in the background"
        "a ${class_token} on top of a purple rug in a forest"
        "a ${class_token} wearing a red hat"
        "a ${class_token} wearing a santa hat"
        "a ${class_token} wearing a rainbow scarf"
        "a ${class_token} wearing a black top hat and a monocle"
        "a ${class_token} in a chef outfit"
        "a ${class_token} in a firefighter outfit"
        "a ${class_token} in a police outfit"
        "a ${class_token} wearing pink glasses"
        "a ${class_token} wearing a yellow shirt"
        "a ${class_token} in a purple wizard outfit"
        "a red ${class_token}"
        "a purple ${class_token}"
        "a shiny ${class_token}"
        "a wet ${class_token}"
        "a cube shaped ${class_token}"
    )
fi


# Select this run's prompts and derive output/run naming from subject + prompt index.
validation_prompt=${prompt_list[$prompt_idx]}
test_prompt=${prompt_test_list[$prompt_idx]}
name="${selected_subject}-${prompt_idx}"
instance_prompt="a photo of ${unique_token} ${class_token}"
class_prompt="a photo of ${class_token}"

export OUTPUT_DIR="log_hra/lr_${lr}_r_${hra_r}/${name}"
export INSTANCE_DIR="dreambooth/dataset/${selected_subject}"
export CLASS_DIR="class_data/${class_token}"

# Warn if the output directory already exists ("该目录已存在" = "this directory
# already exists"). NOTE(review): the script only warns — it still proceeds and
# may overwrite previous results.
if [ -d "$OUTPUT_DIR" ]; then
    echo "该目录已存在：$OUTPUT_DIR"
fi

# Kick off training with prior preservation (class images regularize the subject).
accelerate launch train_dreambooth_hra.py \
    --pretrained_model_name_or_path=$MODEL_NAME \
    --instance_data_dir=$INSTANCE_DIR \
    --class_data_dir="$CLASS_DIR" \
    --output_dir="$OUTPUT_DIR" \
    --instance_prompt="$instance_prompt" \
    --with_prior_preservation --prior_loss_weight=1.0 \
    --class_prompt="$class_prompt" \
    --resolution=512 \
    --train_batch_size=1 \
    --gradient_accumulation_steps=1 \
    --checkpointing_steps=5000 \
    --learning_rate=$lr \
    --report_to="wandb" \
    --lr_scheduler="constant" \
    --lr_warmup_steps=0 \
    --max_train_steps=2005 \
    --validation_prompt="$validation_prompt" \
    --validation_epochs=1 \
    --seed="0" \
    --name="$name" \
    --num_class_images=200 \
    --hra_r=$hra_r
llama/data/MATH_test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llama/data/gsm8k_test.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
llama/data/oft/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023-present the HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .config import OFTConfig
16
+ from .layer import Conv2d, Linear, OFTLayer
17
+ from .model import OFTModel
18
+
19
+
20
+ __all__ = ["OFTConfig", "OFTModel", "Conv2d", "Linear", "OFTLayer"]
llama/data/oft/config.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023-present the HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import List, Optional, Union
17
+
18
+ from peft.tuners.lycoris_utils import LycorisConfig
19
+ from peft.utils import PeftType
20
+
21
+
22
@dataclass
class OFTConfig(LycorisConfig):
    """
    This is the configuration class to store the configuration of a [`OFTModel`].

    Args:
        r (`int`): OFT rank.
        module_dropout (`int`): The dropout probability for disabling OFT modules during training.
        target_modules (`Optional[Union[List[str], str]]`):
            The names of the modules to apply the adapter to. If this is specified, only the modules with the specified
            names will be replaced. When passing a string, a regex match will be performed. When passing a list of
            strings, either an exact match will be performed or it is checked if the name of the module ends with any
            of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding
            the output layer. If this is not specified, modules will be chosen according to the model architecture. If
            the architecture is not known, an error will be raised -- in this case, you should specify the target
            modules manually.
        init_weights (`bool`):
            Whether to perform initialization of OFT weights.
        layers_to_transform (`Union[List[int], int]`):
            The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices
            that are specified in this list. If a single integer is passed, it will apply the transformations on the
            layer at this index.
        layers_pattern (`str`):
            The layer pattern name, used only if `layers_to_transform` is different from `None`.
        rank_pattern (`dict`):
            The mapping from layer names or regexp expression to ranks which are different from the default rank
            specified by `r`.
        modules_to_save (`List[str]`):
            List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint.
        coft (`bool`):
            Whether to use the constrained variant of OFT or not, off by default.
        eps (`float`):
            The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True.
        block_share (`bool`):
            Whether to share the OFT parameters between blocks or not. This is `False` by default.
    """

    r: int = field(default=8, metadata={"help": "OFT rank"})
    module_dropout: float = field(
        default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"}
    )
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with OFT."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
            "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer."
        },
    )
    init_weights: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to initialize the weights of the OFT layers with their default initialization. Don't change "
                "this setting, except if you know exactly what you're doing."
            ),
        },
    )
    layers_to_transform: Optional[Union[List[int], int]] = field(
        default=None,
        metadata={
            "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index."
        },
    )
    layers_pattern: Optional[str] = field(
        default=None,
        metadata={
            "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern."
        },
    )
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )
    coft: bool = field(
        default=False,
        metadata={"help": "Whether to use the constrained variant of OFT or not."},
    )
    eps: float = field(
        default=6e-5,
        metadata={
            "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True."
        },
    )
    block_share: bool = field(
        default=False,
        metadata={"help": "Whether to share the OFT parameters between blocks or not."},
    )

    def __post_init__(self):
        # NOTE(review): unlike most PEFT configs, `super().__post_init__()` is not
        # called here — confirm the parent classes have no required post-init logic.
        self.peft_type = PeftType.OFT
        # Normalize a list of module names into a set for O(1) membership tests;
        # a plain string is kept as-is (treated as a regex by the module matcher).
        self.target_modules = (
            set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules
        )
llama/data/oft/layer.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023-present the HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ import warnings
17
+ from typing import Any, List, Optional, Set, Tuple
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from peft.tuners.lycoris_utils import LycorisLayer, check_adapters_to_merge
23
+
24
+
25
+ class OFTLayer(nn.Module, LycorisLayer):
26
+ # All names of layers that may contain adapter weights
27
+ adapter_layer_names = ("oft_r",)
28
+ # other_param_names is defined on parent class
29
+
30
    def __init__(self, base_layer: nn.Module):
        """Wrap ``base_layer`` with OFT bookkeeping; no adapter is created yet.

        Adapter parameters are added later via ``update_layer``.
        """
        super().__init__()
        LycorisLayer.__init__(self, base_layer)

        # OFT info — per-adapter state, keyed by adapter name.
        self.oft_r = nn.ParameterDict({})  # trainable rotation-block parameters
        self.coft = {}         # whether the constrained (COFT) variant is active
        self.eps = {}          # COFT projection radius (scaled in update_layer)
        self.block_share = {}  # whether one block is shared across all diagonal slots
39
+
40
+ @property
41
+ def _available_adapters(self) -> Set[str]:
42
+ return {*self.oft_r}
43
+
44
+ def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool):
45
+ if block_share:
46
+ self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r)))
47
+ else:
48
+ self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r)))
49
+
50
    def reset_adapter_parameters(self, adapter_name: str):
        # Zero init => Cayley(0) is the identity rotation, so the adapter starts as a no-op.
        nn.init.zeros_(self.oft_r[adapter_name])
52
+
53
    def reset_adapter_parameters_random(self, adapter_name: str):
        # Random (Kaiming-uniform) init: the adapter starts with a non-identity
        # rotation, i.e. it immediately perturbs the base layer's output.
        nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5))
55
+
56
    def update_layer(
        self,
        adapter_name: str,
        r: int,
        module_dropout: float,
        init_weights: bool,
        coft: bool = False,
        eps: float = 6e-5,
        block_share: bool = False,
        **kwargs,
    ) -> None:
        """Internal function to create oft adapter

        Args:
            adapter_name (`str`): Name for the adapter to add.
            r (`int`): Rank for the added adapter.
            module_dropout (`float`): The dropout probability for disabling adapter during training.
            init_weights (`bool`): Whether to initialize weights.
            coft (`bool`): Whether to use the constrained variant of OFT or not.
            eps (`float`):
                The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True.
            block_share (`bool`): Whether to share the OFT parameters between blocks or not.

        Raises:
            ValueError: If `r` is not a positive integer.
            TypeError: If the base layer is neither `nn.Linear` nor `nn.Conv2d`.
        """
        if r <= 0:
            raise ValueError(f"`r` should be a positive integer value but the value passed is {r}")

        # Record the per-adapter hyper-parameters.
        self.r[adapter_name] = r
        self.module_dropout[adapter_name] = module_dropout
        self.coft[adapter_name] = coft
        self.block_share[adapter_name] = block_share

        # Determine shape of OFT weights
        base_layer = self.get_base_layer()
        if isinstance(base_layer, nn.Linear):
            # nn.Linear weight is (out_features, in_features); shape[0] is the rotated dim.
            shape = tuple(base_layer.weight.shape)
        elif isinstance(base_layer, nn.Conv2d):
            # Flatten the conv kernel to a 2-D (out_channels, in_channels*kh*kw) view.
            shape = (
                base_layer.out_channels,
                base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
            )
        else:
            raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}")

        # COFT projection radius is scaled by the block area (block_size ** 2).
        self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r)

        # Create weights with provided shape
        self.create_adapter_parameters(adapter_name, r, shape, block_share)

        # Initialize weights
        if init_weights:
            # Zeros => identity rotation: the adapter starts as a no-op.
            self.reset_adapter_parameters(adapter_name)
        else:
            self.reset_adapter_parameters_random(adapter_name)

        # Move new weights to device
        weight = getattr(self.get_base_layer(), "weight", None)
        if weight is not None:
            # the layer is already completely initialized, this is an update
            if weight.dtype.is_floating_point or weight.dtype.is_complex:
                self.to(weight.device, dtype=weight.dtype)
            else:
                # e.g. integer/quantized base weights: keep the adapter's own float dtype
                self.to(weight.device)
        self.set_adapter(self.active_adapters)
119
+
120
    def unscale_layer(self, scale=None) -> None:
        # scale is not used: OFT applies an orthogonal rotation rather than a scaled
        # delta, so there is nothing to unscale. The method exists only to satisfy
        # the LycorisLayer interface.
        pass
123
+
124
    def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None:
        """
        Merge the active adapter weights into the base weights

        Args:
            safe_merge (`bool`, *optional*):
                If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs
                before merging the weights. This is useful if you want to check if the merge operation will produce
                NaNs. Defaults to `False`.
            adapter_names (`List[str]`, *optional*):
                The list of adapter names that should be merged. If `None`, all active adapters will be merged.
                Defaults to `None`.
        """
        adapter_names = check_adapters_to_merge(self, adapter_names)
        if not adapter_names:
            # no adapter to merge
            return

        for active_adapter in adapter_names:
            if active_adapter in self._available_adapters:
                base_layer = self.get_base_layer()

                # Bring the base weight into (in, out) orientation so the rotation can
                # be applied on the right: W_merged^T = W^T @ R.
                orig_weights = base_layer.weight.data
                if isinstance(base_layer, nn.Linear):
                    orig_weights = torch.transpose(orig_weights, 0, 1)
                elif isinstance(base_layer, nn.Conv2d):
                    # Flatten the 4-D conv kernel to 2-D before transposing.
                    orig_weights = orig_weights.view(
                        [
                            base_layer.out_channels,
                            base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
                        ]
                    )
                    orig_weights = torch.transpose(orig_weights, 0, 1)
                delta_weight = self.get_delta_weight(active_adapter)
                if orig_weights.shape[1] != delta_weight.shape[1]:
                    # when in channels is not divisible by r
                    # (the block-diagonal rotation is padded up to r * ceil(out/r);
                    # truncate it back down to the real output dimension)
                    delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]]
                new_weights = torch.mm(orig_weights, delta_weight)
                # Restore the base layer's native weight layout.
                if isinstance(base_layer, nn.Linear):
                    new_weights = torch.transpose(new_weights, 0, 1)
                elif isinstance(base_layer, nn.Conv2d):
                    new_weights = torch.transpose(new_weights, 0, 1)
                    new_weights = new_weights.view(
                        [
                            base_layer.out_channels,
                            base_layer.in_channels,
                            base_layer.kernel_size[0],
                            base_layer.kernel_size[1],
                        ]
                    )

                # NaN check happens before the assignment below, so the base weight is
                # left untouched when the merge would be broken.
                if safe_merge and not torch.isfinite(new_weights).all():
                    raise ValueError(
                        f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken"
                    )

                base_layer.weight.data = new_weights
                self.merged_adapters.append(active_adapter)
182
+
183
    def unmerge(self) -> None:
        """
        This method unmerges all merged adapter layers from the base weights.
        """
        if not self.merged:
            warnings.warn("Already unmerged. Nothing to do.")
            return
        # Pop adapters in reverse merge order and undo each rotation:
        # W_orig^T = W_merged^T @ R^{-1}.
        while len(self.merged_adapters) > 0:
            active_adapter = self.merged_adapters.pop()
            if active_adapter in self._available_adapters:
                base_layer = self.get_base_layer()
                new_weights = base_layer.weight.data
                if isinstance(base_layer, nn.Linear):
                    new_weights = torch.transpose(new_weights, 0, 1)
                elif isinstance(base_layer, nn.Conv2d):
                    # Flatten the 4-D conv kernel to 2-D before transposing.
                    new_weights = new_weights.view(
                        [
                            base_layer.out_channels,
                            base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1],
                        ]
                    )
                    new_weights = torch.transpose(new_weights, 0, 1)
                delta_weight = self.get_delta_weight(active_adapter)
                if new_weights.shape[1] != delta_weight.shape[1]:
                    # when in channels is not divisible by r
                    delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]]
                # NOTE(review): matrix inversion can be numerically lossy; repeated
                # merge/unmerge cycles may drift the base weights slightly.
                delta_inv = torch.inverse(delta_weight)
                orig_weights = torch.mm(new_weights, delta_inv)

                # Restore the base layer's native weight layout.
                if isinstance(base_layer, nn.Linear):
                    orig_weights = torch.transpose(orig_weights, 0, 1)
                elif isinstance(base_layer, nn.Conv2d):
                    orig_weights = torch.transpose(orig_weights, 0, 1)
                    orig_weights = orig_weights.reshape(
                        [
                            base_layer.out_channels,
                            base_layer.in_channels,
                            base_layer.kernel_size[0],
                            base_layer.kernel_size[1],
                        ]
                    )
                base_layer.weight.data = orig_weights
225
+
226
def get_delta_weight(self, adapter_name: str) -> torch.Tensor:
    """Build the block-diagonal orthogonal delta matrix for *adapter_name*."""
    opt_r = self.oft_r[adapter_name]

    if self.coft[adapter_name]:
        # Constrained OFT: pull the trainable blocks back into an eps-ball
        # before building the rotation (no gradient flows through this step).
        with torch.no_grad():
            opt_r.copy_(self._project_batch(opt_r, eps=self.eps[adapter_name]))

    rotations = self._cayley_batch(opt_r)
    return self._block_diagonal(rotations, self.r[adapter_name])
240
+
241
+ # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144
242
+ def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor:
243
+ b, r, c = data.shape
244
+ # Ensure the input matrix is skew-symmetric
245
+ skew = 0.5 * (data - data.transpose(1, 2))
246
+ I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) # noqa: E741
247
+
248
+ # Perform the Cayley parametrization
249
+ Q = torch.bmm(I - skew, torch.inverse(I + skew))
250
+
251
+ return Q
252
+
253
+ # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155
254
+ def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor:
255
+ if oft_r.shape[0] == 1:
256
+ # block share
257
+ blocks = [oft_r[0, ...] for i in range(rank)]
258
+ else:
259
+ blocks = [oft_r[i, ...] for i in range(rank)]
260
+
261
+ # Use torch.block_diag to create the block diagonal matrix
262
+ A = torch.block_diag(*blocks)
263
+
264
+ return A
265
+
266
+ # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52
267
+ def _project_batch(self, oft_r, eps=1e-5):
268
+ # scaling factor for each of the smaller block matrix
269
+ eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0]))
270
+ I = ( # noqa: E741
271
+ torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype)
272
+ .unsqueeze(0)
273
+ .expand_as(oft_r)
274
+ )
275
+ diff = oft_r - I
276
+ norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True)
277
+ mask = (norm_diff <= eps).bool()
278
+ out = torch.where(mask, oft_r, I + eps * (diff / norm_diff))
279
+ return out
280
+
281
def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Run the base layer, then apply the active OFT rotations to its output.

    The rotation must act on the *pre-bias* activations, so the base bias is
    subtracted before the adapters run and re-added afterwards.
    """
    previous_dtype = x.dtype

    if self.disable_adapters:
        if self.merged:
            # Adapters are disabled but still folded in: fold them back out.
            self.unmerge()
        result = self.base_layer(x, *args, **kwargs)
    elif self.merged:
        # Adapters already merged into the base weights; plain forward.
        result = self.base_layer(x, *args, **kwargs)
    else:
        result = self.base_layer(x, *args, **kwargs)
        if len(result.shape) == 4:
            # Conv output: move channels last so matmul acts on the channel dim.
            result = result.permute(0, 2, 3, 1)

        base_layer = self.get_base_layer()
        base_bias = base_layer.bias
        if base_bias is not None:
            # Bias should be added after OFT forward
            result = result - base_bias.data

        # Execute all the adapters
        for active_adapter in self.active_adapters:
            if active_adapter not in self._available_adapters:
                continue

            module_dropout = self.module_dropout[active_adapter]

            # Modify current execution weights
            # (module dropout: during training the whole adapter is randomly skipped)
            if (not self.training) or (self.training and torch.rand(1) > module_dropout):
                result = self._get_delta_activations(active_adapter, result, *args, **kwargs)

        if base_bias is not None:
            result = result + base_bias.data
        if len(result.shape) == 4:
            # Restore channels-first layout for conv outputs.
            result = result.permute(0, 3, 1, 2)

    result = result.to(previous_dtype)
    return result
319
+
320
+
321
class Linear(OFTLayer):
    """OFT implemented in Linear layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str = "default",
        r: int = 0,
        module_dropout: float = 0.0,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Register the adapter and mark it as the active one.
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Rotate the base-layer activations with the adapter's orthogonal matrix."""
        rotation = self.get_delta_weight(adapter_name)

        out_dim = self.get_base_layer().weight.data.shape[0]
        # Crop in case the output dimension is not divisible by r.
        rotation = rotation[:out_dim, :out_dim]

        # The base bias is handled in OFTLayer.forward, so it is not added here.
        return torch.matmul(input, rotation)

    def __repr__(self) -> str:
        return "oft." + super().__repr__()
354
+
355
+
356
class Conv2d(OFTLayer):
    """OFT implemented in Conv2d layer"""

    def __init__(
        self,
        base_layer: nn.Module,
        adapter_name: str = "default",
        r: int = 0,
        module_dropout: float = 0.0,
        init_weights: bool = True,
        **kwargs,
    ):
        super().__init__(base_layer)

        # Register the adapter and mark it as the active one.
        self._active_adapter = adapter_name
        self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs)

    def _get_delta_activations(
        self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any
    ) -> torch.Tensor:
        """Rotate the (channels-last) conv activations with the adapter's matrix."""
        rotation = self.get_delta_weight(adapter_name)

        out_dim = self.get_base_layer().weight.data.shape[0]
        # Crop in case the channel dimension is not divisible by r.
        rotation = rotation[:out_dim, :out_dim]

        # The base bias is handled in OFTLayer.forward, so it is not added here.
        return torch.matmul(input, rotation)

    def __repr__(self) -> str:
        return "oft." + super().__repr__()
llama/data/oft/model.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023-present the HuggingFace Inc. team.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import re
16
+ from typing import Dict, Type, Union
17
+
18
+ import torch
19
+ from torch import nn
20
+
21
+ from peft.tuners.lycoris_utils import LycorisConfig, LycorisTuner
22
+
23
+ from .layer import Conv2d, Linear, OFTLayer
24
+
25
+
26
class OFTModel(LycorisTuner):
    """
    Creates Orthogonal Finetuning model from a pretrained model. The method is described in
    https://arxiv.org/abs/2306.07280

    Args:
        model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached.
        config ([`OFTConfig`]): The configuration of the OFT model.
        adapter_name (`str`): The name of the adapter, defaults to `"default"`.

    Returns:
        `torch.nn.Module`: The OFT model.

    Example:
        ```py
        >>> from diffusers import StableDiffusionPipeline
        >>> from peft import OFTModel, OFTConfig

        >>> config_te = OFTConfig(
        ...     r=8,
        ...     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )
        >>> config_unet = OFTConfig(
        ...     r=8,
        ...     target_modules=[
        ...         "proj_in",
        ...         "proj_out",
        ...         "to_k",
        ...         "to_q",
        ...         "to_v",
        ...         "to_out.0",
        ...         "ff.net.0.proj",
        ...         "ff.net.2",
        ...     ],
        ...     module_dropout=0.0,
        ...     init_weights=True,
        ... )

        >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default")
        >>> model.unet = OFTModel(model.unet, config_unet, "default")
        ```

    **Attributes**:
        - **model** ([`~torch.nn.Module`]) -- The model to be adapted.
        - **peft_config** ([`OFTConfig`]): The configuration of the OFT model.
    """

    # Parameter-name prefix used by the Lycoris machinery to find OFT weights.
    prefix: str = "oft_"
    # Which adapter class wraps each supported base module type.
    layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = {
        torch.nn.Conv2d: Conv2d,
        torch.nn.Linear: Linear,
    }

    def _create_and_replace(
        self,
        config: LycorisConfig,
        adapter_name: str,
        target: Union[OFTLayer, nn.Module],
        target_name: str,
        parent: nn.Module,
        current_key: str,
    ) -> None:
        """
        A private method to create and replace the target module with the adapter module.
        """

        # Regexp matching - Find key which matches current target_name in patterns provided
        pattern_keys = list(config.rank_pattern.keys())
        target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name)

        kwargs = config.to_dict()
        # Per-module rank override from ``rank_pattern``, falling back to the global ``r``.
        kwargs["r"] = config.rank_pattern.get(target_name_key, config.r)

        if isinstance(target, OFTLayer):
            # Module is already wrapped: just register another adapter on it.
            target.update_layer(adapter_name, **kwargs)
        else:
            new_module = self._create_new_module(config, adapter_name, target, **kwargs)
            self._replace_module(parent, target_name, new_module, target)
llama/finetune_32.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import field, dataclass
3
+ from typing import Sequence, Literal, List, Dict, Tuple
4
+
5
+ import transformers
6
+ from transformers import Trainer
7
+ from transformers.modeling_utils import *
8
+ from transformers.trainer import _is_peft_model
9
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
10
+ from transformers.data.data_collator import DataCollator
11
+
12
+ from transformers.training_args import TrainingArguments
13
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
14
+ from transformers.trainer_callback import TrainerCallback
15
+ from transformers.trainer_utils import EvalPrediction
16
+ from torch.utils.data import Dataset, IterableDataset
17
+
18
+ from datasets import load_dataset
19
+ from peft import LoraConfig, get_peft_model, PeftModel, OFTConfig
20
+ from datetime import datetime
21
+
22
IGNORE_INDEX = -100  # label value ignored by the cross-entropy loss
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"
# Alpaca-style instruction prompt used for supervised fine-tuning.
PROMPT = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:"
)
32
+
33
+
34
class MyTrainer(Trainer):
    """``Trainer`` variant that caches OFT parameters and a regularizer weight.

    ``lamda`` scales an orthogonality regularizer over the ``oft_r`` parameters;
    the regularizer itself is currently disabled in :meth:`compute_loss` (see
    the NOTE there), so the loss matches the stock ``Trainer`` behavior.
    """

    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
        lamda: float = 1e-4
    ):
        print('optimizers', optimizers)
        # BUGFIX: compare against None with identity (`is None`), the correct
        # idiom, instead of equality.
        if optimizers is None:
            optimizers = (None, None)
        super().__init__(model=model, args=args, data_collator=data_collator, train_dataset=train_dataset,
                         eval_dataset=eval_dataset, processing_class=tokenizer, model_init=model_init,
                         compute_metrics=compute_metrics, callbacks=callbacks,
                         optimizers=optimizers, preprocess_logits_for_metrics=preprocess_logits_for_metrics)
        self.lamda = lamda

        # Cache the OFT rotation parameters once for the (optional) regularizer.
        self.oft_params: List[torch.nn.Parameter] = [
            p for n, p in self.model.named_parameters() if "oft_r" in n
        ]

    def compute_loss(self, model, inputs, return_outputs=False,
                     num_items_in_batch: Optional[torch.Tensor] = None,):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        if self.model_accepts_loss_kwargs:
            kwargs = {}
            if num_items_in_batch is not None:
                kwargs["num_items_in_batch"] = num_items_in_batch
            inputs = {**inputs, **kwargs}
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            unwrapped_model = unwrap_model(model)
            if _is_peft_model(unwrapped_model):
                model_name = unwrapped_model.base_model.model._get_name()
            else:
                model_name = unwrapped_model._get_name()
            # Causal-LM heads need the labels shifted by one position.
            if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        # NOTE: an orthogonality regularizer over ``self.oft_params`` scaled by
        # ``self.lamda`` was prototyped here and is intentionally disabled.

        return (loss, outputs) if return_outputs else loss

    def _move_model_to_device(self, model, device):
        # The model is already placed via ``device_map`` at load time, so the
        # default Trainer move is deliberately a no-op.
        pass
136
+
137
+
138
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """CLI arguments: HF ``TrainingArguments`` extended with model/data/OFT options.

    NOTE(review): this intentionally shadows ``transformers.TrainingArguments``
    in this module's namespace.
    """
    # Base model to fine-tune (HF hub id or local path).
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    # Existing PEFT adapter to resume from; None trains a fresh adapter.
    adapter_name_or_path: Optional[str] = field(default=None)
    data_path: str = field(default=None, metadata={"help": "Path to the training data."})
    dataset_split: str = field(
        default="train[:100000]", metadata={"help": "(`['train', 'test', 'eval']`):"}
    )
    # Two-element list: [instruction column, response column].
    dataset_field: List[str] = field(
        default=None, metadata={"help": "Fields of dataset input and output."}
    )
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(default=512, metadata={
        "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."}, )
    hrft_r: int = field(default=8, metadata={
        "help": "The rank of the adapter. When passing `None` and `adapter_name_or_path` is also `None`, full fine-tuning is used."})
    init_a: float = field(default=1e-4, metadata={"help": "The initial weights"})
    eps: float = field(default=1e-4, metadata={"help": "The control strength of COFT. The freedom of rotation."})
    lamda: float = field(default=1e-4, metadata={"help": "The control strength of regularity"})
    add_orth: str = field(default='none', metadata={"help": ""})
    init_weights: Literal[True, "pissa"] = field(
        default=True,
        metadata={
            "help": (
                "Passing True (default) results in the LoRA initialization."
                "Passing `pissa` results in PiSSA initialization."
            ),
        },
    )
167
+
168
+
169
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collects the state dict and dump to disk."""
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        # Move tensors to CPU before serialization to avoid GPU-memory spikes.
        cpu_state_dict = {k: v.cpu() for k, v in state_dict.items()}
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
176
+
177
+
178
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens > 0:
        # Initialize the freshly appended embedding rows with the mean of the
        # pre-existing embeddings instead of leaving them random.
        in_embed = model.get_input_embeddings().weight.data
        out_embed = model.get_output_embeddings().weight.data

        in_embed[-num_new_tokens:] = in_embed[:-num_new_tokens].mean(dim=0, keepdim=True)
        out_embed[-num_new_tokens:] = out_embed[:-num_new_tokens].mean(dim=0, keepdim=True)
199
+
200
+
201
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    encoded = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    ids = [enc.input_ids[0] for enc in encoded]
    # Sequence length excluding padding (used later to mask out prompt tokens).
    lengths = [enc.input_ids.ne(tokenizer.pad_token_id).sum().item() for enc in encoded]
    return dict(
        input_ids=ids,
        labels=ids,
        input_ids_lens=lengths,
        labels_lens=lengths,
    )
223
+
224
+
225
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+answer pairs and mask the prompt tokens out of the labels."""
    examples = [src + tgt for src, tgt in zip(sources, targets)]
    examples_tokenized = _tokenize_fn(examples, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)
    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    # Only the response tokens contribute to the training loss.
    for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:source_len] = IGNORE_INDEX
    return dict(input_ids=input_ids, labels=labels)
238
+
239
+
240
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids = [torch.tensor(inst["input_ids"]) for inst in instances]
        labels = [torch.tensor(inst["labels"]) for inst in instances]
        # Right-pad every sequence to the longest one in the batch.
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            # Mask out padding positions for attention.
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )
259
+
260
+
261
def train_tokenize_function(examples, tokenizer, query, response):
    """Map a raw dataset batch to tokenized (input_ids, labels) pairs."""
    sources = [PROMPT.format_map(dict(instruction=q)) for q in examples[query]]
    # Append EOS so the model learns where a response ends.
    targets = [f"{a}{tokenizer.eos_token}" for a in examples[response]]
    return preprocess(sources, targets, tokenizer)
266
+
267
+
268
def train():
    """Entry point: build the model/tokenizer, attach an OFT adapter, fine-tune and save."""
    parser = transformers.HfArgumentParser(TrainingArguments)
    script_args = parser.parse_args_into_dataclasses()[0]
    # print(script_args)
    model = transformers.AutoModelForCausalLM.from_pretrained(
        script_args.model_name_or_path,
        device_map={"": 0},  # device_map="auto",
    )
    if script_args.adapter_name_or_path is not None:
        # Resume from a previously trained adapter.
        print(f"Load {script_args.init_weights} from {script_args.adapter_name_or_path}: ", )
        model = PeftModel.from_pretrained(model, script_args.model_name_or_path,
                                          subfolder=script_args.adapter_name_or_path, is_trainable=True)
    elif script_args.hrft_r is not None:
        print(f"Initilized {script_args.init_weights} layers")

        # NOTE(review): despite the variable name, this configures OFT (not HRA).
        hra_config = OFTConfig(
            r= script_args.hrft_r,
            eps=script_args.eps,
            init_weights=script_args.init_weights,
            target_modules=["q_proj", "v_proj"],
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, hra_config)
    else:
        print("Full Parameter Fine-Tuning")

    print(model)
    model.print_trainable_parameters()
    # import time
    # print("Program starts")
    # time.sleep(300)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        script_args.model_name_or_path,
        model_max_length=script_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )
    if tokenizer.pad_token is None:
        # Add a PAD token and grow the embedding matrix accordingly.
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )

    if "llama" in script_args.model_name_or_path:
        # LLaMA tokenizers ship without these special tokens.
        tokenizer.add_special_tokens(
            {
                "eos_token": DEFAULT_EOS_TOKEN,
                "bos_token": DEFAULT_BOS_TOKEN,
                "unk_token": DEFAULT_UNK_TOKEN,
            }
        )
    # if tokenizer.pad_token is None:
    #     if tokenizer.unk_token_id is not None:
    #         tokenizer.pad_token_id = tokenizer.unk_token_id
    #         tokenizer.pad_token = tokenizer.unk_token
    #         print("Set PAD token to UNK token.")
    #     elif tokenizer.eos_token_id is not None:
    #         tokenizer.pad_token_id = tokenizer.eos_token_id
    #         tokenizer.pad_token = tokenizer.eos_token
    #         print("Set PAD token to EOS token.")

    # if model is not None:
    #     model.config.pad_token_id = tokenizer.pad_token_id
    #     if model.config.pad_token_id != tokenizer.pad_token_id:
    #         raise ValueError("Failed to sync pad_token_id between tokenizer and model config")

    raw_train_datasets = load_dataset("json", data_files=script_args.data_path, split=script_args.dataset_split, name="metamath_qa",)
    train_dataset = raw_train_datasets.map(
        train_tokenize_function,
        batched=True,
        batch_size=3000,
        num_proc=32,
        remove_columns=raw_train_datasets.column_names,
        load_from_cache_file=True,
        desc="Running tokenizer on train dataset",
        fn_kwargs={"tokenizer": tokenizer, "query": script_args.dataset_field[0],
                   "response": script_args.dataset_field[1]}
    )

    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    data_module = dict(train_dataset=train_dataset, data_collator=data_collator)
    trainer = MyTrainer(model=model, tokenizer=tokenizer, lamda=script_args.lamda, args=script_args, **data_module)
    # KV-cache is useless during training and conflicts with checkpointing.
    model.config.use_cache = False

    start_time = datetime.now()
    print('start time: ', start_time.strftime("%Y-%m-%d %H:%M:%S"))

    trainer.train()

    end_time = datetime.now()
    print('end time: ', end_time.strftime("%Y-%m-%d %H:%M:%S"), '| duration: ', end_time - start_time)

    # trainer.save_state()
    tokenizer.save_pretrained(os.path.join(script_args.output_dir, 'ft'))
    model.save_pretrained(os.path.join(script_args.output_dir, 'ft'))
364
+
365
+
366
# Script entry point.
if __name__ == "__main__":

    train()
llama/inference/MATH_inference.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import pdb
4
+ import jsonlines
5
+
6
+ import util
7
+ from vllm import LLM, SamplingParams
8
+ import sys
9
MAX_INT = sys.maxsize  # sentinel "no end index" for slicing
INVALID_ANS = "[invalid]"

# Samples whose completion lacked an answer marker (filled by process_results).
invalid_outputs = []
13
def remove_boxed(s):
    """Strip a LaTeX ``\\boxed{...}`` wrapper, returning the inner text.

    Returns ``None`` when *s* is ``None`` or not of the expected form.
    (Replaces the original's bare ``except`` with explicit checks; the
    observable behavior is unchanged.)
    """
    left = "\\boxed{"
    if s is None or not s.startswith(left) or not s.endswith("}"):
        return None
    return s[len(left):-1]
21
+
22
def process_results(doc, completion, answer):
    """Check whether *completion* answers *doc* correctly (via ``util.is_equiv``).

    Completions without a 'The answer is: ' marker are recorded in the
    module-level ``invalid_outputs`` list and count as wrong.
    """
    parts = completion.split('The answer is: ')
    if len(parts) <= 1:
        invalid_outputs.append({'question': doc, 'output': completion, 'answer': answer})
        return False
    candidate = parts[-1].split('.\n')[0].strip()
    # Drop a single trailing period, then re-strip.
    if candidate.endswith('.'):
        candidate = candidate[:-1]
    candidate = candidate.strip()
    return True if util.is_equiv(candidate, answer) else False
41
def batch_data(data_list, batch_size=1):
    """Chunk *data_list* into ``batch_size``-sized batches.

    The final batch absorbs the remainder, so it may be larger than
    ``batch_size``.
    """
    n = len(data_list) // batch_size
    batches = [data_list[i * batch_size:(i + 1) * batch_size] for i in range(n - 1)]
    # Everything left over goes into one final (possibly larger) batch.
    batches.append(data_list[(n - 1) * batch_size:])
    return batches
53
+
54
def test_hendrycks_math(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Evaluate *model* on the Hendrycks MATH test set with greedy vLLM decoding.

    Reads instruction/solution pairs from *data_path* (jsonl), generates one
    completion per problem and prints the exact-match accuracy.
    """
    hendrycks_math_ins = []
    hendrycks_math_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('promt =====', problem_prompt)
    with open(data_path, "r+", encoding="utf8") as f:
        for idx, item in enumerate(jsonlines.Reader(f)):
            hendrycks_math_ins.append(problem_prompt.format(instruction=item["instruction"]))
            # Ground truth is the last \boxed{...} value of the reference solution.
            hendrycks_math_answers.append(remove_boxed(util.last_boxed_only_string(item['output'])))

    print('total length ===', len(hendrycks_math_ins))
    hendrycks_math_ins = hendrycks_math_ins[start:end]
    hendrycks_math_answers = hendrycks_math_answers[start:end]
    print('lenght ====', len(hendrycks_math_ins))
    # NOTE: the original also built batches via batch_data() but never used
    # them (vLLM batches internally); that dead computation is removed.

    stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
    # temperature=0 -> greedy decoding for a reproducible evaluation.
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=2048, stop=stop_tokens)
    print('sampleing =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size)
    outputs = llm.generate(hendrycks_math_ins, sampling_params)
    res_completions = [output.outputs[0].text for output in outputs]

    results = []
    for prompt, completion, prompt_answer in zip(hendrycks_math_ins, res_completions, hendrycks_math_answers):
        results.append(process_results(prompt, completion, prompt_answer))

    acc = sum(results) / len(results)
    # BUGFIX: the original printed len(invalid_outputs) for BOTH counts.
    print('len invalid outputs ====', len(invalid_outputs), ', valid_outputs===', len(results) - len(invalid_outputs))
    # print('start===', start, ', end====',end)
    print('length====', len(results), ', acc====', acc)
93
+
94
def parse_args():
    """Parse the MATH-evaluation command-line arguments."""
    parser = argparse.ArgumentParser()
    # BUGFIX: the default used to be the int 0, which is not a valid model
    # path for a str-typed option; None makes the "not provided" case explicit.
    parser.add_argument("--model", type=str, default=None)  # model path
    parser.add_argument("--data_file", type=str, default='data/MATH_test.jsonl')  # data path
    parser.add_argument("--start", type=int, default=0)  # start index
    parser.add_argument("--end", type=int, default=MAX_INT)  # end index
    parser.add_argument("--batch_size", type=int, default=50)  # batch_size
    parser.add_argument("--tensor_parallel_size", type=int, default=1)  # tensor_parallel_size
    return parser.parse_args()
103
+
104
# Script entry point: run the MATH evaluation with the CLI-provided settings.
if __name__ == "__main__":
    args = parse_args()
    test_hendrycks_math(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
107
+
108
+
llama/inference/grader.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This logic is largely copied from the Hendrycks' MATH release (math_equivalence), and borrowed from:
3
+ - https://github.com/microsoft/ProphetNet/tree/master/CRITIC
4
+ """
5
+ import multiprocessing
6
+ from math import isclose
7
+ from typing import Union
8
+
9
+ from sympy import simplify, N
10
+ from sympy.parsing.sympy_parser import parse_expr
11
+ from sympy.parsing.latex import parse_latex
12
+
13
+
14
def is_digit(s):
    """Return True when *s* (with thousands commas stripped) parses as a float."""
    try:
        float(str(s).replace(",", ""))
    except ValueError:
        return False
    return True
20
+
21
def math_equal(prediction: Union[bool, float, str],
               reference: Union[float, str],
               include_percentage: bool = True,
               is_close: bool = True,
               timeout: bool = False,
               ) -> bool:
    """
    Exact match of math if and only if:
    1. numerical equal: both can convert to float and are equal
    2. symbolic equal: both can convert to sympy expression and are equal
    """
    try:  # 1. numerical equal
        if is_digit(prediction) and is_digit(reference):
            prediction = float(str(prediction).replace(",", ""))
            reference = float(str(reference).replace(",", ""))
            # number questions
            if include_percentage:
                # Also accept answers off by a factor of 100 (percent vs fraction).
                gt_result = [reference / 100, reference, reference * 100]
            else:
                gt_result = [reference]
            for item in gt_result:
                try:
                    if is_close:
                        if isclose(item, prediction, rel_tol=1e-4):
                            return True
                    else:
                        if item == prediction:
                            return True
                except Exception:
                    continue
            return False
    except:
        pass

    # Empty/None predictions fail, but genuine 0/False answers are kept.
    if not prediction and prediction not in [0, False]:
        return False

    # 2. symbolic equal
    reference = str(reference).strip()
    prediction = str(prediction).strip()

    ## deal with [], (), {}
    pred_str, ref_str = prediction, reference
    if (prediction.startswith("[") and prediction.endswith("]") and not reference.startswith("(")) or \
        (prediction.startswith("(") and prediction.endswith(")") and not reference.startswith("[")):
        pred_str = pred_str.strip("[]()")
        ref_str = ref_str.strip("[]()")
    for s in ['{', "}", "(", ")"]:
        ref_str = ref_str.replace(s, "")
        pred_str = pred_str.replace(s, "")
    if pred_str == ref_str:
        return True

    ## [a, b] vs. [c, d], return a==c and b==d
    if (prediction.startswith("[") and prediction.endswith("]")) and (reference.startswith("[") and reference.endswith("]")) or \
        (prediction.startswith("(") and prediction.endswith(")")) and (reference.startswith("(") and reference.endswith(")")):
        pred_parts = prediction[1:-1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts):
            # Element-wise recursive comparison of the tuple/list entries.
            if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
                return True

    # symbolic equal with sympy
    if timeout:
        # Guard sympy in a subprocess: simplify() can hang on pathological input.
        if call_with_timeout(symbolic_equal_process, prediction, reference):
            return True
    else:
        if symbolic_equal(prediction, reference):
            return True

    return False
+
93
+
94
def math_equal_process(param):
    """Pool adapter: compare the last two elements (prediction, reference) of *param*."""
    prediction, reference = param[-2], param[-1]
    return math_equal(prediction, reference)
96
+
97
+
98
def symbolic_equal(a, b):
    """Best-effort symbolic comparison of two expression strings via sympy."""

    def _to_expr(s):
        # Try LaTeX first, then plain sympy syntax; fall back to the raw string.
        for parser in (parse_latex, parse_expr):
            try:
                return parser(s)
            except:
                pass
        return s

    lhs = _to_expr(a)
    rhs = _to_expr(b)

    try:
        # Exact symbolic equality.
        if simplify(lhs - rhs) == 0:
            return True
    except:
        pass

    try:
        # Numeric fallback with a loose relative tolerance.
        if isclose(N(lhs), N(rhs), rel_tol=1e-3):
            return True
    except:
        pass
    return False
121
+
122
+
123
def symbolic_equal_process(a, b, output_queue):
    """Subprocess entry point: push the symbolic-comparison result onto *output_queue*."""
    output_queue.put(symbolic_equal(a, b))
126
+
127
+
128
def call_with_timeout(func, *args, timeout=1, **kwargs):
    """Run *func* in a subprocess with a wall-clock *timeout* (seconds).

    *func* must accept an extra trailing ``output_queue`` argument and put its
    result there. Returns ``False`` on timeout or when the worker exited
    without producing a result; otherwise returns the queued value.
    """
    from queue import Empty  # stdlib; local import keeps module deps unchanged

    output_queue = multiprocessing.Queue()
    process_args = args + (output_queue,)
    process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        process.join()
        return False

    try:
        # BUGFIX: the worker may have died without putting anything on the
        # queue (e.g. an uncaught exception); a blocking get() would hang
        # this process forever.
        return output_queue.get_nowait()
    except Empty:
        return False
141
+
llama/inference/gsm8k_inference.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import re
4
+ import jsonlines
5
+ from fraction import Fraction
6
+ from vllm import LLM, SamplingParams
7
+ import sys
8
+ from grader import math_equal
9
+ MAX_INT = sys.maxsize
10
+
11
def is_number(s):
    """Return True when *s* parses as a number.

    Accepts anything float() understands, plus single unicode numeric
    characters (e.g. fraction glyphs) via unicodedata.numeric.
    """
    try:
        float(s)
    except ValueError:
        pass
    else:
        return True
    try:
        import unicodedata
        unicodedata.numeric(s)
    except (TypeError, ValueError):
        return False
    return True
24
+
25
def extract_answer_number(completion):
    """Pull the final numeric answer out of a model completion.

    Looks after the last 'The answer is: ' marker, grabs the first
    number-like token (optionally signed, possibly containing one of
    ',', '.', '/'), and returns it rounded to the nearest int.
    Returns None when no answer can be extracted.
    """
    parts = completion.split('The answer is: ')
    if len(parts) <= 1:
        return None
    match = re.search(r'[\-+]?\d*[\.,/]?\d+', parts[-1].strip())
    if not match:
        return None
    token = match.group()
    if '/' in token:
        numerator = token.split('/')[0]
        denominator = token.split('/')[1]
        if not (is_number(denominator) == True and is_number(numerator) == True):
            return None
        if denominator == '0':
            # Avoid division by zero: fall back to the numerator alone.
            return round(float(numerator.replace(',', '')))
        frac = Fraction(token.replace(',', ''))
        return round(float(frac.numerator / frac.denominator))
    value = float(token.replace(',', ''))
    if value == float('inf'):
        return None
    return round(value)
52
+
53
def batch_data(data_list, batch_size=1):
    """Split *data_list* into batches of *batch_size*.

    The final batch absorbs any remainder, so it may be larger than
    *batch_size* (or smaller when the whole list is shorter than one
    batch).  Mirrors the chunking originally used for vLLM requests.
    """
    n = len(data_list) // batch_size
    # Cleanup: the local used to shadow the function name `batch_data`,
    # and the last slice used a module-level MAX_INT sentinel where an
    # open-ended slice is equivalent.
    batches = [
        data_list[i * batch_size:(i + 1) * batch_size] for i in range(n - 1)
    ]
    batches.append(data_list[(n - 1) * batch_size:])
    return batches
65
+
66
+
67
def gsm8k_test(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
    """Evaluate *model* on the GSM8K test set with greedy vLLM decoding.

    Reads jsonl records with 'question' and 'answer' fields, prompts the
    model with an Alpaca-style instruction template, extracts the numeric
    answer from each completion, and prints the accuracy.

    batch_size is kept for CLI compatibility; vLLM batches internally, so
    the manual pre-batching the original computed (and never used) has
    been removed.
    """
    gsm8k_ins = []
    gsm8k_answers = []
    problem_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
    )
    print('prompt =====', problem_prompt)
    # "r" instead of "r+": the file is only scanned, never written.
    with open(data_path, "r", encoding="utf8") as f:
        for item in jsonlines.Reader(f):
            gsm8k_ins.append(problem_prompt.format(instruction=item["question"]))
            # The gold answer follows the '#### ' marker, possibly with
            # thousands separators.
            temp_ans = item['answer'].split('#### ')[1]
            gsm8k_answers.append(int(temp_ans.replace(',', '')))

    gsm8k_ins = gsm8k_ins[start:end]
    gsm8k_answers = gsm8k_answers[start:end]
    print('length ====', len(gsm8k_ins))

    stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
    sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=1024, stop=stop_tokens)
    print('sampling =====', sampling_params)
    llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size)
    result = []

    outputs = llm.generate(gsm8k_ins, sampling_params)
    res_completions = [output.outputs[0].text for output in outputs]

    invalid_outputs = []
    for prompt, completion, prompt_answer in zip(gsm8k_ins, res_completions, gsm8k_answers):
        y_pred = extract_answer_number(completion)
        if y_pred is not None:
            result.append(float(y_pred) == float(prompt_answer) or math_equal(y_pred, prompt_answer))
        else:
            result.append(False)
            invalid_outputs.append({'question': prompt, 'output': completion, 'answer': prompt_answer})
    acc = sum(result) / len(result)
    # Bug fix: the 'valid_outputs' field used to re-print len(invalid_outputs).
    print('len invalid outputs ====', len(invalid_outputs),
          ', valid_outputs===', len(result) - len(invalid_outputs))
    print('gsm8k length====', len(result), ', gsm8k acc====', acc)
113
+
114
+
115
def parse_args():
    """Parse command-line options for the GSM8K evaluation script."""
    parser = argparse.ArgumentParser()
    # (flag, kwargs) specs: model path, jsonl test set, slice bounds,
    # request batch size, and GPU tensor-parallel degree.
    specs = [
        ("--model", dict(type=str)),
        ("--data_file", dict(type=str, default='data/gsm8k_test.jsonl')),
        ("--start", dict(type=int, default=0)),
        ("--end", dict(type=int, default=MAX_INT)),
        ("--batch_size", dict(type=int, default=60)),
        ("--tensor_parallel_size", dict(type=int, default=1)),
    ]
    for flag, kwargs in specs:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()
124
+
125
if __name__ == "__main__":
    # Script entry point: run the GSM8K evaluation with CLI settings.
    args = parse_args()
    gsm8k_test(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
llama/inference/util.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pprint
2
+ from grader import math_equal
3
+
4
def last_boxed_only(sample):
    """Reduce a (question, answer) pair to its last \\boxed{...} span.

    Returns None when the answer contains no boxed expression.
    """
    question, answer = sample
    boxed = last_boxed_only_string(answer)
    return None if boxed is None else (question, boxed)
10
+
11
def last_boxed_only_string(string):
    """Return the last '\\boxed{...}' (or '\\fbox{...}') substring.

    Scans forward from the last occurrence of the command, balancing
    braces; returns None when there is no box, or its braces never close.
    """
    idx = string.rfind("\\boxed")
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    depth = 0
    for i in range(idx, len(string)):
        ch = string[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                # First balanced close after the command: end of the box.
                return string[idx:i + 1]
    return None
37
+
38
def only_until_first_boxed_from_tokens(string, tokens):
    """Return the prefix of *tokens* ending just before the first box.

    *tokens* must be a tokenization of *string*; the cut point is the
    token whose cumulative character length first reaches the position
    of '\\boxed' (or '\\fbox').  Returns None when *string* has no box.
    """
    idx = string.find("\\boxed")
    if idx < 0:
        idx = string.find("\\fbox")
        if idx < 0:
            return None

    # Bug fix: the loop variable was unbound when *tokens* was empty,
    # raising NameError at `tokens[:i]`; `cut` now defaults to 0.
    cum_length = 0
    cut = 0
    for cut, token in enumerate(tokens):
        cum_length += len(token)
        if cum_length >= idx:
            break

    return tokens[:cut]
52
+
53
+
54
+
55
def clean_numbers(sample):
    """Apply _clean_numbers to every string in *sample*.

    Returns a tuple of cleaned strings, or None for a falsy input.
    """
    if not sample:
        return None
    return tuple(_clean_numbers(s) for s in sample)
63
+
64
+ def _clean_numbers(string):
65
+ """
66
+ Clean Numbers in the given string
67
+
68
+ >>> _clean_numbers(None, "Hello 123")
69
+ 'Hello 123'
70
+ >>> _clean_numbers(None, "Hello 1234")
71
+ 'Hello 1,234'
72
+ >>> _clean_numbers(None, "Hello 1234324asdasd")
73
+ 'Hello 1,234,324asdasd'
74
+ """
75
+ num_prev_digits = 0
76
+ new_string = ""
77
+ for i, c in enumerate(string):
78
+ # isdigit() doesnt work here because of weird unicode chars.
79
+ if c in {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}:
80
+ num_prev_digits += 1
81
+ else:
82
+ if num_prev_digits > 3:
83
+ # Some fixing
84
+ string_number = new_string[-num_prev_digits:]
85
+ new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
86
+ num_prev_digits = 0
87
+ new_string += c
88
+
89
+ if num_prev_digits > 3:
90
+ # Some fixing
91
+ string_number = new_string[-num_prev_digits:]
92
+ new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
93
+
94
+ return new_string
95
+
96
def fix_fracs(string):
    """Normalize \\frac arguments to brace form: \\frac12 -> \\frac{1}{2}.

    Already-braced fractions pass through unchanged.  When an argument
    cannot be recovered (fewer than two characters after \\frac), the
    input is returned unchanged, mirroring the original assert-and-bail
    behavior.
    """
    substrs = string.split("\\frac")
    new_str = substrs[0]
    for substr in substrs[1:]:
        new_str += "\\frac"
        # Bug fix: an empty tail (e.g. a trailing '\\frac') used to raise
        # IndexError on substr[0]; bail out like the short-tail case.
        if len(substr) == 0:
            return string
        if substr[0] == "{":
            # First argument already braced: keep as-is.
            new_str += substr
            continue
        if len(substr) < 2:
            return string
        a, b = substr[0], substr[1]
        rest = substr[2:]
        if b != "{":
            # \frac ab -> \frac{a}{b}, keeping any trailing text.
            new_str += "{" + a + "}{" + b + "}" + rest
        else:
            # \frac a{...} -> \frac{a}{...}
            new_str += "{" + a + "}" + b + rest
    return new_str
126
+
127
def fix_a_slash_b(string):
    """Rewrite a plain integer fraction 'a/b' as '\\frac{a}{b}'.

    Anything that is not exactly two integer parts joined by a single '/'
    (e.g. 'x/y', '1/2/3', '1.5/2') is returned unchanged.
    """
    parts = string.split("/")
    if len(parts) != 2:
        return string
    try:
        a = int(parts[0])
        b = int(parts[1])
        # Reject inputs whose int round-trip differs (signs, leading
        # zeros, surrounding whitespace).
        assert string == "{}/{}".format(a, b)
    except (ValueError, AssertionError):
        # Bug fix: int() failures used to raise an uncaught ValueError
        # for non-numeric parts like 'x/y'.
        return string
    return "\\frac{" + str(a) + "}{" + str(b) + "}"
140
+
141
def remove_right_units(string):
    """Drop a trailing '\\text{ ...}' unit annotation.

    In the MATH dataset "\\text{ " only appears when describing units, so
    everything from its first occurrence onward is discarded.
    """
    # Bug fix: the original asserted exactly one occurrence and crashed
    # with AssertionError on answers containing several \text{ } spans
    # (and the assert vanished under -O); take the prefix instead.
    head, sep, _ = string.partition("\\text{ ")
    return head if sep else string
149
+
150
def fix_sqrt(string):
    """Brace bare sqrt arguments: \\sqrt3 -> \\sqrt{3}.

    Already-braced radicals pass through unchanged.
    """
    if "\\sqrt" not in string:
        return string
    splits = string.split("\\sqrt")
    new_string = splits[0]
    for split in splits[1:]:
        # Bug fix: a trailing '\\sqrt' yields an empty tail; the original
        # raised IndexError on split[0].
        if split and split[0] != "{":
            new_substr = "\\sqrt{" + split[0] + "}" + split[1:]
        else:
            new_substr = "\\sqrt" + split
        new_string += new_substr
    return new_string
163
+
164
+
165
def strip_string(string):
    """Normalize a MATH-style answer string for comparison.

    Applies an order-dependent pipeline of LaTeX cleanups: strips
    whitespace/formatting commands, units and percent signs, then
    canonicalizes fractions and radicals.  The step order matters
    (e.g. units must go before spaces are removed), so do not reorder.
    """
    # linebreaks
    string = string.replace("\n", "")

    # remove inverse spaces (\! negative thin space)
    string = string.replace("\\!", "")

    # replace \\ with \
    string = string.replace("\\\\", "\\")

    # replace tfrac and dfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")

    # remove \left and \right delimiter sizing commands
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "")
    string = string.replace("^\\circ", "")

    # remove dollar signs
    string = string.replace("\\$", "")

    # remove units (on the right)
    string = remove_right_units(string)

    # remove percentage
    string = string.replace("\\%", "")
    string = string.replace("\%", "")  # noqa: W605

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")
    # if empty, return empty string
    if len(string) == 0:
        return string
    if string[0] == ".":
        string = "0" + string

    # drop a short left-hand side like "k = " or "q = " (at most 2 chars)
    if len(string.split("=")) == 2:
        if len(string.split("=")[0]) <= 2:
            string = string.split("=")[1]

    # fix sqrt3 --> sqrt{3}
    string = fix_sqrt(string)

    # remove spaces
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
    string = fix_fracs(string)

    # manually change 0.5 --> \frac{1}{2}
    if string == "0.5":
        string = "\\frac{1}{2}"

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
    string = fix_a_slash_b(string)

    return string
228
+
229
+
230
def is_equiv(str1, str2, verbose=False):
    """Check whether two MATH answers are equivalent.

    Both strings are normalized with strip_string and compared via
    math_equal and literal equality.  If normalization itself raises,
    the raw strings are compared instead.
    """
    if str1 is None and str2 is None:
        print("WARNING: Both None")
        return True
    if str1 is None or str2 is None:
        return False

    try:
        ss1 = strip_string(str1)
        ss2 = strip_string(str2)
        if verbose:
            print(ss1, ss2)
        return math_equal(ss1, ss2) or ss1 == ss2
    except Exception:
        # Bug fix: the fallback used to call math_equal(str1, str1) —
        # a self-comparison that made any normalization failure count
        # as equivalent.
        return math_equal(str1, str2) or str1 == str2
250
+
251
class NotEqual:
    """Sentinel whose equality comparison is always False.

    Useful as a placeholder that can never accidentally match a value,
    not even another NotEqual instance or itself.
    """

    def __eq__(self, other):
        return False
llama/merge_adapter_to_base_model.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Merge a PEFT (OFT) adapter into its base model and save the result as a
# standalone checkpoint loadable without peft.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import argparse
import torch

parser = argparse.ArgumentParser(description='Merge Adapter to Base Model')
# NOTE(review): '--base_mode' looks like a typo for '--base_model'; the flag
# name is kept as-is because existing launch scripts pass it.
parser.add_argument('--base_mode', type=str)    # base model path or hub id
parser.add_argument('--adapter', type=str)      # path to the trained adapter
parser.add_argument('--output_path', type=str)  # where to save the merged model
args = parser.parse_args()

model = AutoModelForCausalLM.from_pretrained(args.base_mode, torch_dtype=torch.bfloat16, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(args.base_mode, device_map='auto')
#
# tokenizer = AutoTokenizer.from_pretrained(args.adapter)
# Grow the embedding table to 32001 entries — presumably the 32000-token
# Llama vocab plus the added [PAD] token; TODO confirm against the adapter's
# added_tokens.json before reusing with another base model.
model.resize_token_embeddings(32001)
print('len', len(tokenizer))
print(f"Base model vocab size after resize: {model.get_input_embeddings().weight.shape[0]}")
#
lora_config = PeftConfig.from_pretrained(args.adapter)
lora_config.init_oft_weights=True
model = PeftModel.from_pretrained(model, args.adapter, config=lora_config)
model = model.merge_and_unload()

# safe_serialization=False writes pytorch_model.bin instead of safetensors.
model.save_pretrained(args.output_path, safe_serialization=False)
tokenizer.save_pretrained(args.output_path)
27
+
llama/output/cp1e4/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
llama/output/cp1e4/ft/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
5
+ "block_share": false,
6
+ "coft": false,
7
+ "eps": 0.0001,
8
+ "inference_mode": true,
9
+ "init_weights": true,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "module_dropout": 0.0,
13
+ "modules_to_save": null,
14
+ "peft_type": "OFT",
15
+ "r": 32,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
llama/output/cp1e4/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
llama/output/cp1e4/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
llama/output/cp1e4/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llama/output/cp1e4/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
llama/output/cp1e5/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: facebook/opt-125m
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
llama/output/cp1e5/ft/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "facebook/opt-125m",
5
+ "block_share": false,
6
+ "coft": false,
7
+ "eps": 0.0001,
8
+ "inference_mode": true,
9
+ "init_weights": true,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "module_dropout": 0.0,
13
+ "modules_to_save": null,
14
+ "peft_type": "OFT",
15
+ "r": 8,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "v_proj",
20
+ "q_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
llama/output/cp1e5/trainer_state.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2e-05,
5
+ "eval_steps": 500,
6
+ "global_step": 2,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 2e-05,
13
+ "step": 2,
14
+ "total_flos": 368078929920.0,
15
+ "train_loss": 2.170874834060669,
16
+ "train_runtime": 0.7734,
17
+ "train_samples_per_second": 2.586,
18
+ "train_steps_per_second": 2.586
19
+ }
20
+ ],
21
+ "logging_steps": 1000,
22
+ "max_steps": 2,
23
+ "num_input_tokens_seen": 0,
24
+ "num_train_epochs": 1,
25
+ "save_steps": 0,
26
+ "total_flos": 368078929920.0,
27
+ "train_batch_size": 1,
28
+ "trial_name": null,
29
+ "trial_params": null
30
+ }
llama/output/cp1e5N/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
llama/output/cp1e5N/ft/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
5
+ "block_share": false,
6
+ "coft": false,
7
+ "eps": 0.0001,
8
+ "inference_mode": true,
9
+ "init_weights": true,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "module_dropout": 0.0,
13
+ "modules_to_save": null,
14
+ "peft_type": "OFT",
15
+ "r": 32,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "q_proj",
20
+ "v_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
llama/output/cp1e5N/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
llama/output/cp1e5N/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
llama/output/cp1e5N/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llama/output/cp1e5N/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
llama/output/cp3e5/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
llama/output/cp3e5/ft/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
5
+ "block_share": false,
6
+ "coft": false,
7
+ "eps": 0.0001,
8
+ "inference_mode": true,
9
+ "init_weights": true,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "module_dropout": 0.0,
13
+ "modules_to_save": null,
14
+ "peft_type": "OFT",
15
+ "r": 32,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "q_proj",
20
+ "v_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
llama/output/cp3e5/trainer_state.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 6250,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.32,
13
+ "grad_norm": 3.8161606788635254,
14
+ "learning_rate": 2.8241524699899885e-05,
15
+ "loss": 0.3071,
16
+ "step": 1000
17
+ },
18
+ {
19
+ "epoch": 0.64,
20
+ "grad_norm": 3.0841195583343506,
21
+ "learning_rate": 2.317615224686078e-05,
22
+ "loss": 0.2366,
23
+ "step": 2000
24
+ },
25
+ {
26
+ "epoch": 0.96,
27
+ "grad_norm": 2.72049880027771,
28
+ "learning_rate": 1.6067682497434074e-05,
29
+ "loss": 0.215,
30
+ "step": 3000
31
+ },
32
+ {
33
+ "epoch": 1.28,
34
+ "grad_norm": 2.600498914718628,
35
+ "learning_rate": 8.692414973449614e-06,
36
+ "loss": 0.186,
37
+ "step": 4000
38
+ },
39
+ {
40
+ "epoch": 1.6,
41
+ "grad_norm": 2.350414752960205,
42
+ "learning_rate": 2.893317942061826e-06,
43
+ "loss": 0.1763,
44
+ "step": 5000
45
+ },
46
+ {
47
+ "epoch": 1.92,
48
+ "grad_norm": 2.4736688137054443,
49
+ "learning_rate": 1.194984047782738e-07,
50
+ "loss": 0.1733,
51
+ "step": 6000
52
+ },
53
+ {
54
+ "epoch": 2.0,
55
+ "step": 6250,
56
+ "total_flos": 3.546740381344727e+18,
57
+ "train_loss": 0.21396501647949218,
58
+ "train_runtime": 35607.6251,
59
+ "train_samples_per_second": 5.617,
60
+ "train_steps_per_second": 0.176
61
+ }
62
+ ],
63
+ "logging_steps": 1000,
64
+ "max_steps": 6250,
65
+ "num_input_tokens_seen": 0,
66
+ "num_train_epochs": 2,
67
+ "save_steps": 0,
68
+ "total_flos": 3.546740381344727e+18,
69
+ "train_batch_size": 8,
70
+ "trial_name": null,
71
+ "trial_params": null
72
+ }
llama/output/cp3e5N/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0
llama/output/cp3e5N/ft/adapter_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
5
+ "block_share": false,
6
+ "coft": false,
7
+ "eps": 0.0001,
8
+ "inference_mode": true,
9
+ "init_weights": true,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "module_dropout": 0.0,
13
+ "modules_to_save": null,
14
+ "peft_type": "OFT",
15
+ "r": 32,
16
+ "rank_pattern": {},
17
+ "revision": null,
18
+ "target_modules": [
19
+ "q_proj",
20
+ "v_proj"
21
+ ],
22
+ "task_type": "CAUSAL_LM"
23
+ }
llama/output/cp3e5N/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
llama/output/cp3e5N/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
llama/output/cp3e5N/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llama/output/cp3e5N/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
llama/output/cpr1/ft/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.10.0