xxxpo13
/

Pyramid_Flow

Model card Files Files and versions

xet

Community

xxxpo13 commited on Oct 11, 2024

Commit

c38f042

verified ·

1 Parent(s): fee70e0

Update utils.py

Browse files

Files changed (1) hide show

utils.py +453 -92

utils.py CHANGED Viewed

@@ -1,96 +1,457 @@
 import os
 import torch
 import torch.distributed as dist
-import torch.nn as nn
-from torch.utils.data import DataLoader, DistributedSampler
-from torchvision import datasets, transforms
-from torch.nn.parallel import DistributedDataParallel as DDP
-# Set your model class here (for demonstration, we'll create a simple CNN)
-class SimpleCNN(nn.Module):
-    def __init__(self):
-        super(SimpleCNN, self).__init__()
-        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
-        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
-        self.fc1 = nn.Linear(64 * 7 * 7, 128)
-        self.fc2 = nn.Linear(128, 10)
-    def forward(self, x):
-        x = nn.ReLU()(self.conv1(x))
-        x = nn.MaxPool2d(kernel_size=2, stride=2)(x)
-        x = nn.ReLU()(self.conv2(x))
-        x = nn.MaxPool2d(kernel_size=2, stride=2)(x)
-        x = x.view(x.size(0), -1)
-        x = nn.ReLU()(self.fc1(x))
-        x = self.fc2(x)
-        return x
-def init_distributed_mode():
-    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
-        rank = int(os.environ['RANK'])
-        world_size = int(os.environ['WORLD_SIZE'])
-        dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
-        torch.cuda.set_device(rank % torch.cuda.device_count())
-        print(f"Initialized distributed mode: rank {rank}, world size {world_size}")
     else:
-        print("Not using distributed mode")
-        rank = 0
-        world_size = 1
-    return rank, world_size
-def main():
-    # Initialize the distributed mode
-    rank, world_size = init_distributed_mode()
-    # Set up data transformations
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.5,), (0.5,))
-    ])
-    # Load dataset
-    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
-    train_sampler = DistributedSampler(train_dataset)
-    train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler)
-    # Initialize model
-    model = SimpleCNN()
-    device = torch.device(f'cuda:{rank % torch.cuda.device_count()}')
-    model.to(device)
-    # Wrap the model with DDP
-    if world_size > 1:
-        model = DDP(model, device_ids=[rank], output_device=rank)
-    # Set up the optimizer and loss function
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-    criterion = nn.CrossEntropyLoss()
-    # Training loop
-    for epoch in range(10):  # Train for 10 epochs
-        train_sampler.set_epoch(epoch)  # Shuffle data every epoch
-        running_loss = 0.0
-        for inputs, targets in train_loader:
-            inputs, targets = inputs.to(device), targets.to(device)
-            # Forward pass
-            outputs = model(inputs)
-            loss = criterion(outputs, targets)
-            # Backward pass and optimization
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-            running_loss += loss.item()
-        if rank == 0:  # Only print from the main process
-            print(f'Epoch [{epoch + 1}/10], Loss: {running_loss / len(train_loader):.4f}')
-    # Clean up distributed training
-    if world_size > 1:
-        dist.destroy_process_group()
-if __name__ == '__main__':
-    main()

 import os
 import torch
+import PIL.Image
+import numpy as np
+from torch import nn
 import torch.distributed as dist
+import timm.models.hub as timm_hub
+"""Modified from https://github.com/CompVis/taming-transformers.git"""
+import hashlib
+import requests
+from tqdm import tqdm
+try:
+    import piq
+except:
+    pass
+_CONTEXT_PARALLEL_GROUP = None
+_CONTEXT_PARALLEL_SIZE = None
+def is_dist_avail_and_initialized():
+    if not dist.is_available():
+        return False
+    if not dist.is_initialized():
+        return False
+    return True
+def get_world_size():
+    if not is_dist_avail_and_initialized():
+        return 1
+    return dist.get_world_size()
+def get_rank():
+    if not is_dist_avail_and_initialized():
+        return 0
+    return dist.get_rank()
+def is_main_process():
+    return get_rank() == 0
+def is_context_parallel_initialized():
+    if _CONTEXT_PARALLEL_GROUP is None:
+        return False
+    else:
+        return True
+def set_context_parallel_group(size, group):
+    global _CONTEXT_PARALLEL_GROUP
+    global _CONTEXT_PARALLEL_SIZE
+    _CONTEXT_PARALLEL_GROUP = group
+    _CONTEXT_PARALLEL_SIZE = size
+def initialize_context_parallel(context_parallel_size):
+    global _CONTEXT_PARALLEL_GROUP
+    global _CONTEXT_PARALLEL_SIZE
+    assert _CONTEXT_PARALLEL_GROUP is None, "context parallel group is already initialized"
+    _CONTEXT_PARALLEL_SIZE = context_parallel_size
+    rank = torch.distributed.get_rank()
+    world_size = torch.distributed.get_world_size()
+    for i in range(0, world_size, context_parallel_size):
+        ranks = range(i, i + context_parallel_size)
+        group = torch.distributed.new_group(ranks)
+        if rank in ranks:
+            _CONTEXT_PARALLEL_GROUP = group
+            break
+def get_context_parallel_group():
+    assert _CONTEXT_PARALLEL_GROUP is not None, "context parallel group is not initialized"
+    return _CONTEXT_PARALLEL_GROUP
+def get_context_parallel_world_size():
+    assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
+    return _CONTEXT_PARALLEL_SIZE
+def get_context_parallel_rank():
+    assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
+    rank = get_rank()
+    cp_rank = rank % _CONTEXT_PARALLEL_SIZE
+    return cp_rank
+def get_context_parallel_group_rank():
+    assert _CONTEXT_PARALLEL_SIZE is not None, "context parallel size is not initialized"
+    rank = get_rank()
+    cp_group_rank = rank // _CONTEXT_PARALLEL_SIZE
+    return cp_group_rank
+def download_cached_file(url, check_hash=True, progress=False):
+    """
+    Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
+    If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
+    """
+    def get_cached_file_path():
+        # a hack to sync the file path across processes
+        parts = torch.hub.urlparse(url)
+        filename = os.path.basename(parts.path)
+        cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
+        return cached_file
+    if is_main_process():
+        timm_hub.download_cached_file(url, check_hash, progress)
+    if is_dist_avail_and_initialized():
+        dist.barrier()
+    return get_cached_file_path()
+def convert_weights_to_fp16(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_fp16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)):
+            l.weight.data = l.weight.data.to(torch.float16)
+            if l.bias is not None:
+                l.bias.data = l.bias.data.to(torch.float16)
+    model.apply(_convert_weights_to_fp16)
+def convert_weights_to_bf16(model: nn.Module):
+    """Convert applicable model parameters to fp16"""
+    def _convert_weights_to_bf16(l):
+        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)):
+            l.weight.data = l.weight.data.to(torch.bfloat16)
+            if l.bias is not None:
+                l.bias.data = l.bias.data.to(torch.bfloat16)
+    model.apply(_convert_weights_to_bf16)
+def save_result(result, result_dir, filename, remove_duplicate="", save_format='json'):
+    import json
+    import jsonlines
+    print("Dump result")
+    # Make the temp dir for saving results
+    if not os.path.exists(result_dir):
+        if is_main_process():
+            os.makedirs(result_dir)
+        if is_dist_avail_and_initialized():
+            torch.distributed.barrier()
+    result_file = os.path.join(
+        result_dir, "%s_rank%d.json" % (filename, get_rank())
+    )
+    final_result_file = os.path.join(result_dir, f"{filename}.{save_format}")
+    json.dump(result, open(result_file, "w"))
+    if is_dist_avail_and_initialized():
+        torch.distributed.barrier()
+    if is_main_process():
+        # print("rank %d starts merging results." % get_rank())
+        # combine results from all processes
+        result = []
+        for rank in range(get_world_size()):
+            result_file = os.path.join(result_dir, "%s_rank%d.json" % (filename, rank))
+            res = json.load(open(result_file, "r"))
+            result += res
+        # print("Remove duplicate")
+        if remove_duplicate:
+            result_new = []
+            id_set = set()
+            for res in result:
+                if res[remove_duplicate] not in id_set:
+                    id_set.add(res[remove_duplicate])
+                    result_new.append(res)
+            result = result_new
+        if save_format == 'json':
+            json.dump(result, open(final_result_file, "w"))
+        else:
+            assert save_format == 'jsonl', "Only support json adn jsonl format"
+            with jsonlines.open(final_result_file, "w") as writer:
+                writer.write_all(result)
+        # print("result file saved to %s" % final_result_file)
+    return final_result_file
+# resizing utils
+# TODO: clean up later
+def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
+    h, w = input.shape[-2:]
+    factors = (h / size[0], w / size[1])
+    # First, we have to determine sigma
+    # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
+    sigmas = (
+        max((factors[0] - 1.0) / 2.0, 0.001),
+        max((factors[1] - 1.0) / 2.0, 0.001),
+    )
+    # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
+    # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
+    # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
+    ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
+    # Make sure it is odd
+    if (ks[0] % 2) == 0:
+        ks = ks[0] + 1, ks[1]
+    if (ks[1] % 2) == 0:
+        ks = ks[0], ks[1] + 1
+    input = _gaussian_blur2d(input, ks, sigmas)
+    output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
+    return output
+def _compute_padding(kernel_size):
+    """Compute padding tuple."""
+    # 4 or 6 ints:  (padding_left, padding_right,padding_top,padding_bottom)
+    # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
+    if len(kernel_size) < 2:
+        raise AssertionError(kernel_size)
+    computed = [k - 1 for k in kernel_size]
+    # for even kernels we need to do asymmetric padding :(
+    out_padding = 2 * len(kernel_size) * [0]
+    for i in range(len(kernel_size)):
+        computed_tmp = computed[-(i + 1)]
+        pad_front = computed_tmp // 2
+        pad_rear = computed_tmp - pad_front
+        out_padding[2 * i + 0] = pad_front
+        out_padding[2 * i + 1] = pad_rear
+    return out_padding
+def _filter2d(input, kernel):
+    # prepare kernel
+    b, c, h, w = input.shape
+    tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
+    tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
+    height, width = tmp_kernel.shape[-2:]
+    padding_shape: list[int] = _compute_padding([height, width])
+    input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
+    # kernel and input tensor reshape to align element-wise or batch-wise params
+    tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
+    input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
+    # convolve the tensor with the kernel.
+    output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
+    out = output.view(b, c, h, w)
+    return out
+def _gaussian(window_size: int, sigma):
+    if isinstance(sigma, float):
+        sigma = torch.tensor([[sigma]])
+    batch_size = sigma.shape[0]
+    x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
+    if window_size % 2 == 0:
+        x = x + 0.5
+    gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
+    return gauss / gauss.sum(-1, keepdim=True)
+def _gaussian_blur2d(input, kernel_size, sigma):
+    if isinstance(sigma, tuple):
+        sigma = torch.tensor([sigma], dtype=input.dtype)
+    else:
+        sigma = sigma.to(dtype=input.dtype)
+    ky, kx = int(kernel_size[0]), int(kernel_size[1])
+    bs = sigma.shape[0]
+    kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
+    kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
+    out_x = _filter2d(input, kernel_x[..., None, :])
+    out = _filter2d(out_x, kernel_y[..., None])
+    return out
+URL_MAP = {
+    "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
+}
+CKPT_MAP = {
+    "vgg_lpips": "vgg.pth"
+}
+MD5_MAP = {
+    "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
+}
+def download(url, local_path, chunk_size=1024):
+    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
+    with requests.get(url, stream=True) as r:
+        total_size = int(r.headers.get("content-length", 0))
+        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
+            with open(local_path, "wb") as f:
+                for data in r.iter_content(chunk_size=chunk_size):
+                    if data:
+                        f.write(data)
+                        pbar.update(chunk_size)
+def md5_hash(path):
+    with open(path, "rb") as f:
+        content = f.read()
+    return hashlib.md5(content).hexdigest()
+def get_ckpt_path(name, root, check=False):
+    assert name in URL_MAP
+    path = os.path.join(root, CKPT_MAP[name])
+    print(md5_hash(path))
+    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
+        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
+        download(URL_MAP[name], path)
+        md5 = md5_hash(path)
+        assert md5 == MD5_MAP[name], md5
+    return path
+class KeyNotFoundError(Exception):
+    def __init__(self, cause, keys=None, visited=None):
+        self.cause = cause
+        self.keys = keys
+        self.visited = visited
+        messages = list()
+        if keys is not None:
+            messages.append("Key not found: {}".format(keys))
+        if visited is not None:
+            messages.append("Visited: {}".format(visited))
+        messages.append("Cause:\n{}".format(cause))
+        message = "\n".join(messages)
+        super().__init__(message)
+def retrieve(
+    list_or_dict, key, splitval="/", default=None, expand=True, pass_success=False
+):
+    """Given a nested list or dict return the desired value at key expanding
+    callable nodes if necessary and :attr:`expand` is ``True``. The expansion
+    is done in-place.
+    Parameters
+    ----------
+        list_or_dict : list or dict
+            Possibly nested list or dictionary.
+        key : str
+            key/to/value, path like string describing all keys necessary to
+            consider to get to the desired value. List indices can also be
+            passed here.
+        splitval : str
+            String that defines the delimiter between keys of the
+            different depth levels in `key`.
+        default : obj
+            Value returned if :attr:`key` is not found.
+        expand : bool
+            Whether to expand callable nodes on the path or not.
+    Returns
+    -------
+        The desired value or if :attr:`default` is not ``None`` and the
+        :attr:`key` is not found returns ``default``.
+    Raises
+    ------
+        Exception if ``key`` not in ``list_or_dict`` and :attr:`default` is
+        ``None``.
+    """
+    keys = key.split(splitval)
+    success = True
+    try:
+        visited = []
+        parent = None
+        last_key = None
+        for key in keys:
+            if callable(list_or_dict):
+                if not expand:
+                    raise KeyNotFoundError(
+                        ValueError(
+                            "Trying to get past callable node with expand=False."
+                        ),
+                        keys=keys,
+                        visited=visited,
+                    )
+                list_or_dict = list_or_dict()
+                parent[last_key] = list_or_dict
+            last_key = key
+            parent = list_or_dict
+            try:
+                if isinstance(list_or_dict, dict):
+                    list_or_dict = list_or_dict[key]
+                else:
+                    list_or_dict = list_or_dict[int(key)]
+            except (KeyError, IndexError, ValueError) as e:
+                raise KeyNotFoundError(e, keys=keys, visited=visited)
+            visited += [key]
+        # final expansion of retrieved value
+        if expand and callable(list_or_dict):
+            list_or_dict = list_or_dict()
+            parent[last_key] = list_or_dict
+    except KeyNotFoundError as e:
+        if default is None:
+            raise e
+        else:
+            list_or_dict = default
+            success = False
+    if not pass_success:
+        return list_or_dict
     else:
+        return list_or_dict, success