Spaces: Running on Zero

<fix> fix some bugs in app.py.
app.py
CHANGED
@@ -45,9 +45,11 @@ there's no need to manually input edge maps, depth maps, or other condition imag
 The corresponding condition images will be automatically extracted.
 """

+pipe = None
+current_task = None

-def init_pipeline():
-    global
+def init_basemodel():
+    global transformer, scheduler, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, image_processor

     # init models
     transformer = HunyuanVideoTransformer3DModel.from_pretrained('hunyuanvideo-community/HunyuanVideo-I2V',
@@ -78,101 +80,106 @@ def init_pipeline():
     vae.enable_tiling()
     vae.enable_slicing()

-    # insert LoRA
-    lora_config = LoraConfig(
-        r=16,
-        lora_alpha=16,
-        init_lora_weights="gaussian",
-        target_modules=[
-            'attn.to_k', 'attn.to_q', 'attn.to_v', 'attn.to_out.0',
-            'attn.add_k_proj', 'attn.add_q_proj', 'attn.add_v_proj', 'attn.to_add_out',
-            'ff.net.0.proj', 'ff.net.2',
-            'ff_context.net.0.proj', 'ff_context.net.2',
-            'norm1_context.linear', 'norm1.linear',
-            'norm.linear', 'proj_mlp', 'proj_out',
-        ]
-    )
-    transformer.add_adapter(lora_config)
-
-    # hack LoRA forward
-    def create_hacked_forward(module):
-        lora_forward = module.forward
-        non_lora_forward = module.base_layer.forward
-        img_sequence_length = int((args.img_size / 8 / 2) ** 2)
-        encoder_sequence_length = 144 + 252 # encoder sequence: 144 img 252 txt
-        num_imgs = 4
-        num_generated_imgs = 3
-        num_encoder_sequences = 2 if args.task in ['subject_driven', 'style_transfer'] else 1
-
-        def hacked_lora_forward(self, x, *args, **kwargs):
-            if x.shape[1] == img_sequence_length * num_imgs and len(x.shape) > 2:
-                return torch.cat((
-                    lora_forward(x[:, :-img_sequence_length*num_generated_imgs], *args, **kwargs),
-                    non_lora_forward(x[:, -img_sequence_length*num_generated_imgs:], *args, **kwargs)
-                ), dim=1)
-            elif x.shape[1] == encoder_sequence_length * num_encoder_sequences or x.shape[1] == encoder_sequence_length:
-                return lora_forward(x, *args, **kwargs)
-            elif x.shape[1] == img_sequence_length * num_imgs + encoder_sequence_length * num_encoder_sequences:
-                return torch.cat((
-                    lora_forward(x[:, :(num_imgs - num_generated_imgs)*img_sequence_length], *args, **kwargs),
-                    non_lora_forward(x[:, (num_imgs - num_generated_imgs)*img_sequence_length:-num_encoder_sequences*encoder_sequence_length], *args, **kwargs),
-                    lora_forward(x[:, -num_encoder_sequences*encoder_sequence_length:], *args, **kwargs)
-                ), dim=1)
-            elif x.shape[1] == 3072:
-                return non_lora_forward(x, *args, **kwargs)
-            else:
-                raise ValueError(
-                    f"hacked_lora_forward receives unexpected sequence length: {x.shape[1]}, input shape: {x.shape}!"
-                )
-
-        return hacked_lora_forward.__get__(module, type(module))
-
-    for n, m in transformer.named_modules():
-        if isinstance(m, peft.tuners.lora.layer.Linear):
-            m.forward = create_hacked_forward(m)
-
-    # load LoRA weights
-    model_root = hf_hub_download(
-        repo_id="Kunbyte/DRA-Ctrl",
-        filename=f"{task}.safetensors",
-        resume_download=True)
-
-    try:
-        with safe_open(model_root, framework="pt") as f:
-            lora_weights = {}
-            for k in f.keys():
-                param = f.get_tensor(k)
-                if k.endswith(".weight"):
-                    k = k.replace('.weight', '.default.weight')
-                lora_weights[k] = param
-        transformer.load_state_dict(lora_weights, strict=False)
-    except Exception as e:
-        raise ValueError(f'{e}')
-
-    transformer.requires_grad_(False)
-
-    pipe = HunyuanVideoImageToVideoPipeline(
-        text_encoder=text_encoder,
-        tokenizer=tokenizer,
-        transformer=transformer,
-        vae=vae,
-        scheduler=copy.deepcopy(scheduler),
-        text_encoder_2=text_encoder_2,
-        tokenizer_2=tokenizer_2,
-        image_processor=image_processor,
-    )
-

 @spaces.GPU
-def process_image_and_text(condition_image, target_prompt, condition_image_prompt, task):
+def process_image_and_text(condition_image, target_prompt, condition_image_prompt, task, random_seed, inpainting, fill_x1, fill_x2, fill_y1, fill_y2):
+    # set up models
+    required_models = [transformer, scheduler, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, image_processor]
+    if any(model is None for model in required_models):
+        init_basemodel()
+
+    if pipe is None or current_task != task:
+        # insert LoRA
+        lora_config = LoraConfig(
+            r=16,
+            lora_alpha=16,
+            init_lora_weights="gaussian",
+            target_modules=[
+                'attn.to_k', 'attn.to_q', 'attn.to_v', 'attn.to_out.0',
+                'attn.add_k_proj', 'attn.add_q_proj', 'attn.add_v_proj', 'attn.to_add_out',
+                'ff.net.0.proj', 'ff.net.2',
+                'ff_context.net.0.proj', 'ff_context.net.2',
+                'norm1_context.linear', 'norm1.linear',
+                'norm.linear', 'proj_mlp', 'proj_out',
+            ]
+        )
+        transformer.add_adapter(lora_config)
+
+        # hack LoRA forward
+        def create_hacked_forward(module):
+            lora_forward = module.forward
+            non_lora_forward = module.base_layer.forward
+            img_sequence_length = int((512 / 8 / 2) ** 2)
+            encoder_sequence_length = 144 + 252 # encoder sequence: 144 img 252 txt
+            num_imgs = 4
+            num_generated_imgs = 3
+            num_encoder_sequences = 2 if task in ['subject_driven', 'style_transfer'] else 1
+
+            def hacked_lora_forward(self, x, *args, **kwargs):
+                if x.shape[1] == img_sequence_length * num_imgs and len(x.shape) > 2:
+                    return torch.cat((
+                        lora_forward(x[:, :-img_sequence_length*num_generated_imgs], *args, **kwargs),
+                        non_lora_forward(x[:, -img_sequence_length*num_generated_imgs:], *args, **kwargs)
+                    ), dim=1)
+                elif x.shape[1] == encoder_sequence_length * num_encoder_sequences or x.shape[1] == encoder_sequence_length:
+                    return lora_forward(x, *args, **kwargs)
+                elif x.shape[1] == img_sequence_length * num_imgs + encoder_sequence_length * num_encoder_sequences:
+                    return torch.cat((
+                        lora_forward(x[:, :(num_imgs - num_generated_imgs)*img_sequence_length], *args, **kwargs),
+                        non_lora_forward(x[:, (num_imgs - num_generated_imgs)*img_sequence_length:-num_encoder_sequences*encoder_sequence_length], *args, **kwargs),
+                        lora_forward(x[:, -num_encoder_sequences*encoder_sequence_length:], *args, **kwargs)
+                    ), dim=1)
+                elif x.shape[1] == 3072:
+                    return non_lora_forward(x, *args, **kwargs)
+                else:
+                    raise ValueError(
+                        f"hacked_lora_forward receives unexpected sequence length: {x.shape[1]}, input shape: {x.shape}!"
+                    )
+
+            return hacked_lora_forward.__get__(module, type(module))
+
+        for n, m in transformer.named_modules():
+            if isinstance(m, peft.tuners.lora.layer.Linear):
+                m.forward = create_hacked_forward(m)
+
+        # load LoRA weights
+        model_root = hf_hub_download(
+            repo_id="Kunbyte/DRA-Ctrl",
+            filename=f"{task}.safetensors",
+            resume_download=True)
+
+        try:
+            with safe_open(model_root, framework="pt") as f:
+                lora_weights = {}
+                for k in f.keys():
+                    param = f.get_tensor(k)
+                    if k.endswith(".weight"):
+                        k = k.replace('.weight', '.default.weight')
+                    lora_weights[k] = param
+            transformer.load_state_dict(lora_weights, strict=False)
+        except Exception as e:
+            raise ValueError(f'{e}')
+
+        transformer.requires_grad_(False)
+
+        pipe = HunyuanVideoImageToVideoPipeline(
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            vae=vae,
+            scheduler=copy.deepcopy(scheduler),
+            text_encoder_2=text_encoder_2,
+            tokenizer_2=tokenizer_2,
+            image_processor=image_processor,
+        )

     # start generation
     c_txt = None if condition_image_prompt == "" else condition_image_prompt
     c_img = condition_image.resize((512, 512))
     t_txt = target_prompt

-    if
-        if
+    if task not in ['subject_driven', 'style_transfer']:
+        if task == "canny":
             def get_canny_edge(img):
                 img_np = np.array(img)
                 img_gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
@@ -182,21 +189,21 @@ def process_image_and_text(condition_image, target_prompt, condition_image_promp
                 edges[edges == 0] = 128
                 return Image.fromarray(edges).convert("RGB")
             c_img = get_canny_edge(c_img)
-        elif
+        elif task == "coloring":
             c_img = (
-                c_img.resize((
+                c_img.resize((512, 512))
                 .convert("L")
                 .convert("RGB")
             )
-        elif
+        elif task == "deblurring":
             blur_radius = 10
             c_img = (
                 c_img.convert("RGB")
                 .filter(ImageFilter.GaussianBlur(blur_radius))
-                .resize((
+                .resize((512, 512))
                 .convert("RGB")
             )
-        elif
+        elif task == "depth":
             def get_depth_map(img):
                 from transformers import pipeline

@@ -205,43 +212,40 @@ def process_image_and_text(condition_image, target_prompt, condition_image_promp
                     model="LiheYoung/depth-anything-small-hf",
                     device="cpu",
                 )
-                return depth_pipe(img)["depth"].convert("RGB").resize((
+                return depth_pipe(img)["depth"].convert("RGB").resize((512, 512))
             c_img = get_depth_map(c_img)
             c_img.save(os.path.join(save_dir, f"depth.png"))
             k = (255 - 128) / 255
             b = 128
             c_img = c_img.point(lambda x: k * x + b)
-        elif
+        elif task == "depth_pred":
             c_img = c_img
-        elif
-            c_img = c_img.resize((
-            x1, x2 =
-            y1, y2 =
-            mask = Image.new("L", (
+        elif task == "fill":
+            c_img = c_img.resize((512, 512)).convert("RGB")
+            x1, x2 = fill_x1, fill_x2
+            y1, y2 = fill_y1, fill_y2
+            mask = Image.new("L", (512, 512), 0)
             draw = ImageDraw.Draw(mask)
             draw.rectangle((x1, y1, x2, y2), fill=255)
-            if
+            if inpainting:
                 mask = Image.eval(mask, lambda a: 255 - a)
             c_img = Image.composite(
                 c_img,
-                Image.new("RGB", (
+                Image.new("RGB", (512, 512), (255, 255, 255)),
                 mask
             )
             c_img.save(os.path.join(save_dir, f"mask.png"))
             c_img = Image.composite(
                 c_img,
-                Image.new("RGB", (
+                Image.new("RGB", (512, 512), (128, 128, 128)),
                 mask
             )
-        elif
-            c_img = c_img.resize((int(
+        elif task == "sr":
+            c_img = c_img.resize((int(512 / 4), int(512 / 4))).convert("RGB")
             c_img.save(os.path.join(save_dir, f"low_resolution.png"))
-            c_img = c_img.resize((
+            c_img = c_img.resize((512, 512))
             c_img.save(os.path.join(save_dir, f"low_to_high.png"))

-    if pipe is None:
-        init_pipeline()
-
     gen_img = pipe(
         image=c_img,
         prompt=[t_txt.strip()],
@@ -253,7 +257,7 @@ def process_image_and_text(condition_image, target_prompt, condition_image_promp
         num_inference_steps=50,
         guidance_scale=6.0,
         num_videos_per_prompt=1,
-        generator=torch.Generator(device=pipe.transformer.device).manual_seed(
+        generator=torch.Generator(device=pipe.transformer.device).manual_seed(random_seed),
         output_type='pt',
         image_embed_interleave=4,
         frame_gap=48,
@@ -295,8 +299,14 @@ def create_app():
                     elem_id="task_selection"
                 )
                 gr.Markdown(notice, elem_id="notice")
-                target_prompt = gr.Textbox(lines=2, label="Target Prompt", elem_id="
-                condition_image_prompt = gr.Textbox(lines=2, label="Condition Image Prompt", elem_id="
+                target_prompt = gr.Textbox(lines=2, label="Target Prompt", elem_id="tp")
+                condition_image_prompt = gr.Textbox(lines=2, label="Condition Image Prompt", elem_id="cp")
+                random_seed = gr.Number(label="Random Seed", precision=0, value=0, elem_id="seed")
+                inpainting = gr.Checkbox(label="Inpainting", value=False, elem_id="inpainting")
+                fill_x1 = gr.Number(label="In/Out-painting Box Left Boundary", precision=0, value=128, elem_id="fill_x1")
+                fill_x2 = gr.Number(label="In/Out-painting Box Right Boundary", precision=0, value=384, elem_id="fill_x2")
+                fill_y1 = gr.Number(label="In/Out-painting Box Top Boundary", precision=0, value=128, elem_id="fill_y1")
+                fill_y2 = gr.Number(label="In/Out-painting Box Bottom Boundary", precision=0, value=384, elem_id="fill_y2")
                 submit_btn = gr.Button("Run", elem_id="submit_btn")

             with gr.Column(variant="panel", elem_classes="outputPanel"):
@@ -304,7 +314,7 @@ def create_app():

         submit_btn.click(
             fn=process_image_and_text,
-            inputs=[condition_image, target_prompt, condition_image_prompt, task],
+            inputs=[condition_image, target_prompt, condition_image_prompt, task, random_seed, inpainting, fill_x1, fill_x2, fill_y1, fill_y2],
             outputs=output_image,
         )

|
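The diff above makes model setup lazy: `init_basemodel` runs only when a model handle is still `None`, and the LoRA-patched pipeline is rebuilt only when the selected task differs from the cached one. A minimal, self-contained sketch of that per-task caching pattern (hypothetical names such as `load_pipeline_for`, not the Space's actual loaders):

# Sketch of the lazy per-task cache behind the new process_image_and_text.
def load_pipeline_for(task: str):
    # stand-in for the expensive LoRA insertion + pipeline construction
    return f"pipeline<{task}>"

_pipe = None
_current_task = None

def get_pipe(task: str):
    global _pipe, _current_task
    # rebuild only when nothing is cached yet or the task changed,
    # mirroring `if pipe is None or current_task != task` in app.py
    if _pipe is None or _current_task != task:
        _pipe = load_pipeline_for(task)
        _current_task = task
    return _pipe

print(get_pipe("canny"))   # builds
print(get_pipe("canny"))   # reuses the cached pipeline
print(get_pipe("depth"))   # rebuilds for the new task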
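The `create_hacked_forward` helper that moves into the handler dispatches on sequence length: condition and encoder tokens go through the LoRA-adapted linear layer, while generated-image tokens fall back to the frozen base layer. A toy illustration of that routing idea with made-up token counts and stand-in callables (not the real peft modules or the app's actual lengths):

import torch

img_len, num_imgs, num_gen = 4, 4, 3          # toy token counts, not the app's values
lora_forward = lambda x: x * 2.0              # stand-in for the LoRA-adapted path
base_forward = lambda x: x                    # stand-in for the frozen base path

def routed_forward(x):
    # full image sequence: LoRA for condition tokens, base layer for generated tokens
    if x.shape[1] == img_len * num_imgs:
        return torch.cat((
            lora_forward(x[:, :-img_len * num_gen]),
            base_forward(x[:, -img_len * num_gen:]),
        ), dim=1)
    # everything else (e.g. encoder sequences) takes the LoRA path unchanged
    return lora_forward(x)

x = torch.ones(1, img_len * num_imgs, 8)      # (batch, sequence, hidden)
y = routed_forward(x)
print(y.shape)                                # torch.Size([1, 16, 8])
print(y[0, 0, 0].item(), y[0, -1, 0].item())  # 2.0 (LoRA path), 1.0 (base path)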