Spaces:

MohamedRashad
/

Infinity

Paused

App Files Community

MohamedRashad committed on Jan 6, 2025

Commit

715c7b0

1 Parent(s): 920cc4d

Refactor encode_prompt function by removing enable_positive_prompt parameter and related logic; streamline image generation process

Browse files

Files changed (1) hide show

app.py +58 -72

app.py CHANGED Viewed

@@ -49,19 +49,7 @@ def download_infinity_weights(weights_path):
     except Exception as e:
         print(f"Error downloading weights: {e}")
-def extract_key_val(text):
-    pattern = r'<(.+?):(.+?)>'
-    matches = re.findall(pattern, text)
-    key_val = {}
-    for match in matches:
-        key_val[match[0]] = match[1].lstrip()
-    return key_val
-def encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt=False):
-    if enable_positive_prompt:
-        print(f'before positive_prompt aug: {prompt}')
-        prompt = aug_with_positive_prompt(prompt)
-        print(f'after positive_prompt aug: {prompt}')
     print(f'prompt={prompt}')
     captions = [prompt]
     tokens = text_tokenizer(text=captions, max_length=512, padding='max_length', truncation=True, return_tensors='pt')  # todo: put this into dataset
@@ -78,14 +66,6 @@ def encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt=F
     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
     return text_cond_tuple
-def aug_with_positive_prompt(prompt):
-    for key in ['man', 'woman', 'men', 'women', 'boy', 'girl', 'child', 'person', 'human', 'adult', 'teenager', 'employee',
-                'employer', 'worker', 'mother', 'father', 'sister', 'brother', 'grandmother', 'grandfather', 'son', 'daughter']:
-        if key in prompt:
-            prompt = prompt + '. very smooth faces, good looking faces, face to the camera, perfect facial features'
-            break
-    return prompt
 def enhance_image(image):
     for t in range(1):
         contrast_image = image.copy()
@@ -118,20 +98,31 @@ def gen_one_img(
     gt_ls_Bl=None,
     g_seed=None,
     sampling_per_bits=1,
-    enable_positive_prompt=0,
 ):
     sstt = time.time()
     if not isinstance(cfg_list, list):
         cfg_list = [cfg_list] * len(scale_schedule)
     if not isinstance(tau_list, list):
         tau_list = [tau_list] * len(scale_schedule)
-    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt)
     if negative_prompt:
         negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
     else:
         negative_label_B_or_BLT = None
     print(f'cfg: {cfg_list}, tau: {tau_list}')
-    with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):
         stt = time.time()
         _, _, img_list = infinity_test.autoregressive_infer_cfg(
             vae=vae,
@@ -169,7 +160,7 @@ def save_slim_model(infinity_model_path, save_file=None, device='cpu', key='gpt_
     return save_file
 def load_tokenizer(t5_path =''):
-    print(f'[Loading tokenizer and text encoder]')
     text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
     text_tokenizer.model_max_length = 512
     text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
@@ -195,7 +186,7 @@ def load_infinity(
     use_flex_attn=False,
     bf16=False,
 ):
-    print(f'[Loading Infinity]')
     # Set device if not provided
     if device is None:
@@ -241,13 +232,13 @@ def load_infinity(
         infinity_test.eval()
         infinity_test.requires_grad_(False)
-        print(f'[Load Infinity weights]')
         state_dict = torch.load(model_path, map_location=device)
         print(infinity_test.load_state_dict(state_dict))
         # Initialize random number generator on the correct device
         infinity_test.rng = torch.Generator(device=device)
         return infinity_test
 def transform(pil_img, tgt_h, tgt_w):
@@ -471,50 +462,45 @@ infinity = load_transformer(vae, args)
 # Define the image generation function
 @spaces.GPU
-def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt=False):
-    try:
-        args.prompt = prompt
-        args.cfg = cfg
-        args.tau = tau
-        args.h_div_w = h_div_w
-        args.seed = seed
-        args.enable_positive_prompt = enable_positive_prompt
-        # Find the closest h_div_w_template
-        h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
-        # Get scale_schedule based on h_div_w_template_
-        scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
-        scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
-        # Generate the image
-        generated_image = gen_one_img(
-            infinity,
-            vae,
-            text_tokenizer,
-            text_encoder,
-            prompt,
-            g_seed=seed,
-            gt_leak=0,
-            gt_ls_Bl=None,
-            cfg_list=cfg,
-            tau_list=tau,
-            scale_schedule=scale_schedule,
-            cfg_insertion_layer=[args.cfg_insertion_layer],
-            vae_type=args.vae_type,
-            sampling_per_bits=args.sampling_per_bits,
-            enable_positive_prompt=enable_positive_prompt,
-        )
-        # Convert the image to RGB and uint8
-        image = generated_image.cpu().numpy()
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-        image = np.uint8(image)
-        return image
-    except Exception as e:
-        print(f"Error generating image: {e}")
-        return None
 # Set up Gradio interface
 with gr.Blocks() as demo:

     except Exception as e:
         print(f"Error downloading weights: {e}")
+def encode_prompt(text_tokenizer, text_encoder, prompt):
     print(f'prompt={prompt}')
     captions = [prompt]
     tokens = text_tokenizer(text=captions, max_length=512, padding='max_length', truncation=True, return_tensors='pt')  # todo: put this into dataset
     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
     return text_cond_tuple
 def enhance_image(image):
     for t in range(1):
         contrast_image = image.copy()
     gt_ls_Bl=None,
     g_seed=None,
     sampling_per_bits=1,
 ):
     sstt = time.time()
     if not isinstance(cfg_list, list):
         cfg_list = [cfg_list] * len(scale_schedule)
     if not isinstance(tau_list, list):
         tau_list = [tau_list] * len(scale_schedule)
+    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt)
     if negative_prompt:
         negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
     else:
         negative_label_B_or_BLT = None
     print(f'cfg: {cfg_list}, tau: {tau_list}')
+    # Set device if not provided
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # Set autocast dtype based on bf16 and device support
+    if device == 'cuda' and torch.cuda.is_bf16_supported():
+        autocast_dtype = torch.bfloat16
+    else:
+        autocast_dtype = torch.float32
+    torch.cuda.empty_cache()
+    with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
         stt = time.time()
         _, _, img_list = infinity_test.autoregressive_infer_cfg(
             vae=vae,
     return save_file
 def load_tokenizer(t5_path =''):
+    print('[Loading tokenizer and text encoder]')
     text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
     text_tokenizer.model_max_length = 512
     text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
     use_flex_attn=False,
     bf16=False,
 ):
+    print('[Loading Infinity]')
     # Set device if not provided
     if device is None:
         infinity_test.eval()
         infinity_test.requires_grad_(False)
+        print('[Load Infinity weights]')
         state_dict = torch.load(model_path, map_location=device)
         print(infinity_test.load_state_dict(state_dict))
         # Initialize random number generator on the correct device
         infinity_test.rng = torch.Generator(device=device)
         return infinity_test
 def transform(pil_img, tgt_h, tgt_w):
 # Define the image generation function
 @spaces.GPU
+def generate_image(prompt, cfg, tau, h_div_w, seed):
+    args.prompt = prompt
+    args.cfg = cfg
+    args.tau = tau
+    args.h_div_w = h_div_w
+    args.seed = seed
+    # Find the closest h_div_w_template
+    h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
+    # Get scale_schedule based on h_div_w_template_
+    scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
+    scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
+    # Generate the image
+    generated_image = gen_one_img(
+        infinity,
+        vae,
+        text_tokenizer,
+        text_encoder,
+        prompt,
+        g_seed=seed,
+        gt_leak=0,
+        gt_ls_Bl=None,
+        cfg_list=cfg,
+        tau_list=tau,
+        scale_schedule=scale_schedule,
+        cfg_insertion_layer=[args.cfg_insertion_layer],
+        vae_type=args.vae_type,
+        sampling_per_bits=args.sampling_per_bits,
+    )
+    # Convert the image to RGB and uint8
+    image = generated_image.cpu().numpy()
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+    image = np.uint8(image)
+    return image
 # Set up Gradio interface
 with gr.Blocks() as demo: