Commit 366fd1c
Parent(s): 27e1ebb
Enable bf16 in load_infinity function and enhance transform function with type hints and error handling; refactor joint_vi_vae_encode_decode for improved performance and error management

app.py CHANGED

@@ -102,7 +102,7 @@ def load_infinity(
     text_channels=2048,
     apply_spatial_patchify=0,
     use_flex_attn=False,
-    bf16=
+    bf16=True,
 ):
     print('[Loading Infinity]')
 
@@ -156,45 +156,137 @@ def load_infinity(
 
     # Initialize random number generator on the correct device
     infinity_test.rng = torch.Generator(device=device)
+
     return infinity_test
 
-def transform(pil_img, tgt_h, tgt_w):
+def transform(pil_img: PImage.Image, tgt_h: int, tgt_w: int) -> torch.Tensor:
+    """
+    Transform a PIL image to a tensor with target dimensions while preserving aspect ratio.
+
+    Args:
+        pil_img: PIL Image to transform
+        tgt_h: Target height
+        tgt_w: Target width
+
+    Returns:
+        torch.Tensor: Normalized tensor image in range [-1, 1]
+    """
+    if not isinstance(pil_img, PImage.Image):
+        raise TypeError("Input must be a PIL Image")
+
+    if tgt_h <= 0 or tgt_w <= 0:
+        raise ValueError("Target dimensions must be positive")
+
+    # Calculate resize dimensions preserving aspect ratio
     width, height = pil_img.size
-    #
-    arr = np.array(pil_img)
+    scale = min(tgt_w / width, tgt_h / height)
+    new_width = int(width * scale)
+    new_height = int(height * scale)
+
+    # Resize using LANCZOS for best quality
+    pil_img = pil_img.resize((new_width, new_height), resample=PImage.LANCZOS)
+
+    # Create center crop
+    arr = np.array(pil_img, dtype=np.uint8)
+
+    # Calculate crop coordinates
+    y1 = max(0, (new_height - tgt_h) // 2)
+    x1 = max(0, (new_width - tgt_w) // 2)
+    y2 = y1 + tgt_h
+    x2 = x1 + tgt_w
+
+    # Crop and convert to tensor
+    arr = arr[y1:y2, x1:x2]
+
+    # Convert to normalized tensor in one step
+    return torch.from_numpy(arr.transpose(2, 0, 1)).float().div_(127.5).sub_(1)
+
+def joint_vi_vae_encode_decode(
+    vae: 'VAEModel',  # Type hint would be more specific with actual VAE class
+    image_path: str | Path,
+    scale_schedule: List[tuple],
+    device: torch.device | str,
+    tgt_h: int,
+    tgt_w: int
+) -> tuple[np.ndarray, np.ndarray, torch.Tensor]:
+    """
+    Encode and decode an image using a VAE model with joint visual-infinity processing.
+
+    Args:
+        vae: The VAE model instance
+        image_path: Path to input image
+        scale_schedule: List of scale tuples for processing
+        device: Target device for computation
+        tgt_h: Target height for the image
+        tgt_w: Target width for the image
+
+    Returns:
+        tuple containing:
+            - Original image as numpy array (uint8)
+            - Reconstructed image as numpy array (uint8)
+            - Bit indices tensor
+
+    Raises:
+        FileNotFoundError: If image file doesn't exist
+        RuntimeError: If VAE processing fails
+    """
+    try:
+        # Validate input path
+        if not Path(image_path).exists():
+            raise FileNotFoundError(f"Image not found at {image_path}")
+
+        # Load and preprocess image
+        pil_image = Image.open(image_path).convert('RGB')
+        inp = transform(pil_image, tgt_h, tgt_w)
+        inp = inp.unsqueeze(0).to(device)
+
+        # Normalize scale schedule
+        scale_schedule = [(s[0], s[1], s[2]) for s in scale_schedule]
+
+        # Decide whether to use CPU or GPU
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+        # Time the encoding/decoding operations
+        with torch.amp.autocast(device, dtype=torch.bfloat16):
+            encode_start = time.perf_counter()
+            h, z, _, all_bit_indices, _, _ = vae.encode(
+                inp,
+                scale_schedule=scale_schedule
+            )
+            encode_time = time.perf_counter() - encode_start
+
+            decode_start = time.perf_counter()
+            recons_img = vae.decode(z)[0]
+            decode_time = time.perf_counter() - decode_start
+
+        # Process reconstruction
+        if recons_img.dim() == 4:
+            recons_img = recons_img.squeeze(1)
+
+        # Log performance metrics
+        print(f'VAE encode: {encode_time:.2f}s, decode: {decode_time:.2f}s')
+        print(f'Reconstruction shape: {recons_img.shape}, z shape: {z.shape}')
+
+        # Convert to numpy arrays efficiently
+        recons_img = (recons_img.add(1).div(2)
+                      .permute(1, 2, 0)
+                      .mul(255)
+                      .cpu()
+                      .numpy()
+                      .astype(np.uint8))
+
+        gt_img = (inp[0].add(1).div(2)
+                  .permute(1, 2, 0)
+                  .mul(255)
+                  .cpu()
+                  .numpy()
+                  .astype(np.uint8))
+
+        return gt_img, recons_img, all_bit_indices
+
+    except Exception as e:
+        print(f"Error in VAE processing: {str(e)}")
+        raise RuntimeError("VAE processing failed") from e
 
 def load_visual_tokenizer(args):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -219,29 +311,26 @@ def load_visual_tokenizer(args):
     return vae
 
 def load_transformer(vae, args):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     model_path = args.model_path
+
+    if args.checkpoint_type == 'torch':
         if osp.exists(args.cache_dir):
             local_model_path = osp.join(args.cache_dir, 'tmp', model_path.replace('/', '_'))
         else:
             local_model_path = model_path
+
         if args.enable_model_cache:
             slim_model_path = model_path.replace('ar-', 'slim-')
             local_slim_model_path = local_model_path.replace('ar-', 'slim-')
             os.makedirs(osp.dirname(local_slim_model_path), exist_ok=True)
-            print(f'model_path: {model_path}, slim_model_path: {slim_model_path}')
-            print(f'local_model_path: {local_model_path}, local_slim_model_path: {local_slim_model_path}')
             if not osp.exists(local_slim_model_path):
                 if osp.exists(slim_model_path):
-                    print(f'copy {slim_model_path} to {local_slim_model_path}')
                     shutil.copyfile(slim_model_path, local_slim_model_path)
                 else:
                     if not osp.exists(local_model_path):
-                        print(f'copy {model_path} to {local_model_path}')
                         shutil.copyfile(model_path, local_model_path)
                     save_slim_model(local_model_path, save_file=local_slim_model_path, device=device)
-                    print(f'copy {local_slim_model_path} to {slim_model_path}')
                     if not osp.exists(slim_model_path):
                         shutil.copyfile(local_slim_model_path, slim_model_path)
                     os.remove(local_model_path)
@@ -249,33 +338,35 @@ def load_transformer(vae, args):
             slim_model_path = local_slim_model_path
         else:
             slim_model_path = model_path
-        print(f'
+        print(f'Loading checkpoint from {slim_model_path}')
+    else:
+        raise ValueError(f"Unsupported checkpoint_type: {args.checkpoint_type}")
+
+    model_configs = {
+        'infinity_2b': dict(depth=32, embed_dim=2048, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8),
+        'infinity_layer12': dict(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+        'infinity_layer16': dict(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+        'infinity_layer24': dict(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+        'infinity_layer32': dict(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+        'infinity_layer40': dict(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+        'infinity_layer48': dict(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
+    }
+
+    kwargs_model = model_configs.get(args.model_type)
+    if kwargs_model is None:
+        raise ValueError(f"Unsupported model_type: {args.model_type}")
+
     infinity = load_infinity(
-        rope2d_each_sa_layer=args.rope2d_each_sa_layer,
+        rope2d_each_sa_layer=args.rope2d_each_sa_layer,
         rope2d_normalized_by_hw=args.rope2d_normalized_by_hw,
         use_scale_schedule_embedding=args.use_scale_schedule_embedding,
         pn=args.pn,
-        use_bit_label=args.use_bit_label,
-        add_lvl_embeding_only_first_block=args.add_lvl_embeding_only_first_block,
-        model_path=slim_model_path,
-        scale_schedule=None,
-        vae=vae,
-        device=
+        use_bit_label=args.use_bit_label,
+        add_lvl_embeding_only_first_block=args.add_lvl_embeding_only_first_block,
+        model_path=slim_model_path,
+        scale_schedule=None,
+        vae=vae,
+        device=device,
         model_kwargs=kwargs_model,
         text_channels=args.text_channels,
         apply_spatial_patchify=args.apply_spatial_patchify,