Spaces:

MohamedRashad
/

Infinity

Paused

App Files Community

MohamedRashad committed on Jan 6, 2025

Commit

5c095cd

1 Parent(s): e5b7e3e

Refactor image generation process by removing gen_one_img function; streamline prompt encoding and inference steps

Browse files

Files changed (1) hide show

app.py +43 -98

app.py CHANGED Viewed

@@ -6,10 +6,8 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import os.path as osp
 import time
-import hashlib
 import argparse
 import shutil
-import re
 import random
 from pathlib import Path
 from typing import List
@@ -22,7 +20,7 @@ import torch.nn.functional as F
 from PIL import Image
 import PIL.Image as PImage
 from torchvision.transforms.functional import to_tensor
-from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast
 from huggingface_hub import hf_hub_download
 import gradio as gr
 import spaces
@@ -66,71 +64,6 @@ def encode_prompt(text_tokenizer, text_encoder, prompt):
     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
     return text_cond_tuple
-def gen_one_img(
-    infinity_test,
-    vae,
-    text_tokenizer,
-    text_encoder,
-    prompt,
-    cfg_list=[],
-    tau_list=[],
-    negative_prompt='',
-    scale_schedule=None,
-    top_k=900,
-    top_p=0.97,
-    cfg_sc=3,
-    cfg_exp_k=0.0,
-    cfg_insertion_layer=-5,
-    vae_type=0,
-    gumbel=0,
-    softmax_merge_topk=-1,
-    gt_leak=-1,
-    gt_ls_Bl=None,
-    g_seed=None,
-    sampling_per_bits=1,
-):
-    sstt = time.time()
-    if not isinstance(cfg_list, list):
-        cfg_list = [cfg_list] * len(scale_schedule)
-    if not isinstance(tau_list, list):
-        tau_list = [tau_list] * len(scale_schedule)
-    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt)
-    if negative_prompt:
-        negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
-    else:
-        negative_label_B_or_BLT = None
-    print(f'cfg: {cfg_list}, tau: {tau_list}')
-    # Set device if not provided
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    # Set autocast dtype based on bf16 and device support
-    if device == 'cuda' and torch.cuda.is_bf16_supported():
-        autocast_dtype = torch.bfloat16
-    else:
-        autocast_dtype = torch.float32
-    torch.cuda.empty_cache()
-    with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
-        stt = time.time()
-        _, _, img_list = infinity_test.autoregressive_infer_cfg(
-            vae=vae,
-            scale_schedule=scale_schedule,
-            label_B_or_BLT=text_cond_tuple, g_seed=g_seed,
-            B=1, negative_label_B_or_BLT=negative_label_B_or_BLT, force_gt_Bhw=None,
-            cfg_sc=cfg_sc, cfg_list=cfg_list, tau_list=tau_list, top_k=top_k, top_p=top_p,
-            returns_vemb=1, ratio_Bl1=None, gumbel=gumbel, norm_cfg=False,
-            cfg_exp_k=cfg_exp_k, cfg_insertion_layer=cfg_insertion_layer,
-            vae_type=vae_type, softmax_merge_topk=softmax_merge_topk,
-            ret_img=True, trunk_scale=1000,
-            gt_leak=gt_leak, gt_ls_Bl=gt_ls_Bl, inference_mode=True,
-            sampling_per_bits=sampling_per_bits,
-        )
-    print(f"cost: {time.time() - sstt}, infinity cost={time.time() - stt}")
-    img = img_list[0]
-    return img
 def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_fsdp'):
     print('[Save slim model]')
     full_ckpt = torch.load(infinity_model_path, map_location=device)
@@ -143,15 +76,15 @@ def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_
     print('[Save slim model] done')
     return save_file
-def load_tokenizer(t5_path =''):
     print('[Loading tokenizer and text encoder]')
-    text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
-    text_tokenizer.model_max_length = 512
-    text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
-    text_encoder.to('cuda')
-    text_encoder.eval()
-    text_encoder.requires_grad_(False)
-    return text_tokenizer, text_encoder
 def load_infinity(
     rope2d_each_sa_layer,
@@ -220,8 +153,8 @@ def load_infinity(
         state_dict = torch.load(model_path, map_location=device)
         print(infinity_test.load_state_dict(state_dict))
-    # Initialize random number generator on the correct device
-    infinity_test.rng = torch.Generator(device=device)
     return infinity_test
@@ -444,6 +377,7 @@ text_tokenizer, text_encoder = load_tokenizer(t5_path="google/flan-t5-xl")
 vae = load_visual_tokenizer(args)
 infinity = load_transformer(vae, args)
 # Define the image generation function
 @spaces.GPU
 def generate_image(prompt, cfg, tau, h_div_w, seed):
@@ -460,31 +394,42 @@ def generate_image(prompt, cfg, tau, h_div_w, seed):
     scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
     scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
-    # Generate the image
-    generated_image = gen_one_img(
-        infinity,
-        vae,
-        text_tokenizer,
-        text_encoder,
-        prompt,
-        g_seed=seed,
-        gt_leak=0,
-        gt_ls_Bl=None,
-        cfg_list=cfg,
-        tau_list=tau,
-        scale_schedule=scale_schedule,
-        cfg_insertion_layer=[args.cfg_insertion_layer],
-        vae_type=args.vae_type,
-        sampling_per_bits=args.sampling_per_bits,
-    )
-    # Convert the image to RGB and uint8
-    image = generated_image.cpu().numpy()
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     image = np.uint8(image)
     return image
 markdown_description = """### Instructions:

 import os.path as osp
 import time
 import argparse
 import shutil
 import random
 from pathlib import Path
 from typing import List
 from PIL import Image
 import PIL.Image as PImage
 from torchvision.transforms.functional import to_tensor
+from transformers import AutoTokenizer, T5EncoderModel
 from huggingface_hub import hf_hub_download
 import gradio as gr
 import spaces
     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
     return text_cond_tuple
 def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_fsdp'):
     print('[Save slim model]')
     full_ckpt = torch.load(infinity_model_path, map_location=device)
     print('[Save slim model] done')
     return save_file
+def load_tokenizer(t5_path=''):
     print('[Loading tokenizer and text encoder]')
+    tokenizer = AutoTokenizer.from_pretrained(t5_path, legacy=True)
+    tokenizer.model_max_length = 512
+    encoder = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
+    encoder.eval()
+    encoder.to("cuda" if torch.cuda.is_available() else "cpu")
+    encoder.requires_grad_(False)
+    return tokenizer, encoder
 def load_infinity(
     rope2d_each_sa_layer,
         state_dict = torch.load(model_path, map_location=device)
         print(infinity_test.load_state_dict(state_dict))
+        # Initialize random number generator on the correct device
+        infinity_test.rng = torch.Generator(device=device)
     return infinity_test
 vae = load_visual_tokenizer(args)
 infinity = load_transformer(vae, args)
 # Define the image generation function
 @spaces.GPU
 def generate_image(prompt, cfg, tau, h_div_w, seed):
     scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
     scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
+    # Encode the prompt
+    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt)
+    # Set device if not provided
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # Set autocast dtype based on bf16 and device support
+    if device == 'cuda' and torch.cuda.is_bf16_supported():
+        autocast_dtype = torch.bfloat16
+    else:
+        autocast_dtype = torch.float32
+    torch.cuda.empty_cache()
+    with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
+        _, _, img_list = infinity.autoregressive_infer_cfg(
+            vae=vae,
+            scale_schedule=scale_schedule,
+            label_B_or_BLT=text_cond_tuple, g_seed=seed,
+            B=1, negative_label_B_or_BLT=None, force_gt_Bhw=None,
+            cfg_sc=3, cfg_list=[cfg] * len(scale_schedule), tau_list=[tau] * len(scale_schedule), top_k=900, top_p=0.97,
+            returns_vemb=1, ratio_Bl1=None, gumbel=0, norm_cfg=False,
+            cfg_exp_k=0.0, cfg_insertion_layer=[args.cfg_insertion_layer],
+            vae_type=args.vae_type, softmax_merge_topk=-1,
+            ret_img=True, trunk_scale=1000,
+            gt_leak=0, gt_ls_Bl=None, inference_mode=True,
+            sampling_per_bits=args.sampling_per_bits,
+        )
+    img = img_list[0]
+    image = img.cpu().numpy()
     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
     image = np.uint8(image)
     return image
 markdown_description = """### Instructions: