XiangpengYang committed
Commit 97a5f5d · 1 Parent(s): c7eaeb0
Files changed (4)
  1. app.py +14 -6
  2. inference.py +22 -1
  3. videox_fun/ui/ui.py +2 -2
  4. videox_fun/utils/lora_utils.py +18 -40
app.py CHANGED
```diff
@@ -79,6 +79,7 @@ def load_video_frames(video_path: str, source_frames: int):
                 pil_frame = Image.fromarray(frame)
                 if original_height is None:
                     original_width, original_height = pil_frame.size
+                    print(f"Original video dimensions: {original_width}x{original_height}")
                 frames.append(pil_frame)
             except IndexError:
                 break
@@ -92,6 +93,9 @@ def load_video_frames(video_path: str, source_frames: int):
         w, h = (original_width, original_height) if original_width else (832, 480)
         frames.append(Image.new('RGB', (w, h), (0, 0, 0)))
 
+    assert len(frames) == source_frames, f"Loaded {len(frames)} frames, expected {source_frames}"
+    print(f"Loaded {source_frames} source frames")
+
     input_video = torch.from_numpy(np.array(frames))
     input_video = input_video.permute([3, 0, 1, 2]).unsqueeze(0).float()
     input_video = input_video * (2.0 / 255.0) - 1.0
@@ -143,6 +147,8 @@ class VideoCoF_Controller(Wan_Controller):
         # Ensure model is on CUDA inside the zero-gpu decorated function
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
+        else:
+            self.device = torch.device("cpu")
         # If pipeline is not on cuda, move it (if possible, but usually accelerate handles this or it's handled by parts)
         # However, Wan_Controller logic might rely on `self.device`.
         # We explicitly set `self.device` to cuda here.
@@ -166,7 +172,7 @@ class VideoCoF_Controller(Wan_Controller):
         # 1. Merge VideoCoF LoRA
         if self.lora_model_path != "none":
             print(f"Merge VideoCoF Lora: {self.lora_model_path}")
-            self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+            self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider, device=self.device)
 
         # 2. Merge Acceleration LoRA (FusionX) if enabled
         acc_lora_path = os.path.join(self.personalized_model_dir, "Wan2.1_Text_to_Video_14B_FusionX_LoRA.safetensors")
@@ -174,7 +180,7 @@ class VideoCoF_Controller(Wan_Controller):
         if os.path.exists(acc_lora_path):
             print(f"Merge Acceleration LoRA: {acc_lora_path}")
             # FusionX LoRA generally uses multiplier 1.0
-            self.pipeline = merge_lora(self.pipeline, acc_lora_path, multiplier=1.0)
+            self.pipeline = merge_lora(self.pipeline, acc_lora_path, multiplier=1.0, device=self.device)
         else:
             print(f"Warning: Acceleration LoRA not found at {acc_lora_path}")
 
@@ -217,6 +223,7 @@ class VideoCoF_Controller(Wan_Controller):
         print(f"Input video dimensions: {w}x{h}")
 
         print(f"Running pipeline with frames={length_slider}, source={source_frames_slider}, reasoning={reasoning_frames_slider}")
+        shift = 3
 
         sample = self.pipeline(
             video=input_video_tensor,
@@ -230,6 +237,7 @@ class VideoCoF_Controller(Wan_Controller):
             generator=generator,
             guidance_scale=cfg_scale_slider,
             num_inference_steps=sample_step_slider,
+            shift=shift,
             repeat_rope=repeat_rope_checkbox,
             cot=True,
         ).videos
@@ -241,21 +249,21 @@ class VideoCoF_Controller(Wan_Controller):
             # Unmerge in case of error (LIFO order)
             if enable_acceleration and os.path.exists(acc_lora_path):
                 print("Unmerging Acceleration LoRA (due to error)")
-                self.pipeline = unmerge_lora(self.pipeline, acc_lora_path, multiplier=1.0)
+                self.pipeline = unmerge_lora(self.pipeline, acc_lora_path, multiplier=1.0, device=self.device)
 
             if self.lora_model_path != "none":
                 print("Unmerging VideoCoF LoRA (due to error)")
-                self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+                self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider, device=self.device)
             return gr.update(), gr.update(), f"Error: {str(e)}"
 
         # Unmerge LoRAs (LIFO order)
         if enable_acceleration and os.path.exists(acc_lora_path):
             print("Unmerging Acceleration LoRA")
-            self.pipeline = unmerge_lora(self.pipeline, acc_lora_path, multiplier=1.0)
+            self.pipeline = unmerge_lora(self.pipeline, acc_lora_path, multiplier=1.0, device=self.device)
 
         if self.lora_model_path != "none":
             print("Unmerging VideoCoF LoRA")
-            self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider)
+            self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider, device=self.device)
 
         # Save output
         save_sample_path = self.save_outputs(
```
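The merge → run → unmerge bracketing above has to undo LoRAs in reverse order on both the success and error paths. A minimal sketch of the same pattern as a context manager, so the LIFO unmerge runs even when the pipeline raises; `merged_loras` and `loras` are illustrative names, only `merge_lora`/`unmerge_lora` come from the repository:

```python
# Sketch only: wraps the commit's merge/unmerge discipline in a context manager.
from contextlib import contextmanager

from videox_fun.utils.lora_utils import merge_lora, unmerge_lora

@contextmanager
def merged_loras(pipeline, loras, device="cuda"):
    """loras: list of (path, multiplier) pairs, merged in declaration order."""
    merged = []
    try:
        for path, mult in loras:
            pipeline = merge_lora(pipeline, path, multiplier=mult, device=device)
            merged.append((path, mult))
        yield pipeline
    finally:
        # Undo in LIFO order, mirroring the explicit unmerge calls above.
        for path, mult in reversed(merged):
            pipeline = unmerge_lora(pipeline, path, multiplier=mult, device=device)
```

Usage would mirror the controller: `with merged_loras(self.pipeline, [(self.lora_model_path, lora_alpha_slider), (acc_lora_path, 1.0)]) as pipe: ...`.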
inference.py CHANGED
```diff
@@ -89,6 +89,9 @@ def parse_args():
     parser.add_argument("--output_dir", type=str, required=True, help="Output directory for generated videos")
     parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducible generation")
     parser.add_argument("--videocof_path", type=str, default=None, help="Path to videocof weight checkpoint")
+    parser.add_argument("--lora_path", type=str, default=None, help="Path to LoRA checkpoint")
+    parser.add_argument("--enable_acceleration_lora", action="store_true", help="Enable loading the acceleration (FusionX) LoRA")
+    parser.add_argument("--acceleration_lora_path", type=str, default=None, help="Optional path to acceleration LoRA; defaults to FusionX under model directory")
     parser.add_argument("--num_frames", type=int, default=65, help="Total number of frames (input + generated)")
     parser.add_argument("--source_frames", type=int, default=33, help="Number of source frames; default 33")
     parser.add_argument("--reasoning_frames", type=int, default=4, help="Grounding frames in the middle segment (pixel-space)")
@@ -320,7 +323,25 @@ def main():
     else:
         pipeline.to(device=device)
 
-    # LoRA
+    # Acceleration LoRA (FusionX) mirrors app.py behavior
+    if args.enable_acceleration_lora:
+        default_acc_path = os.path.join(model_name, "Wan2.1_Text_to_Video_14B_FusionX_LoRA.safetensors")
+        acc_lora_path = args.acceleration_lora_path or default_acc_path
+        if os.path.exists(acc_lora_path):
+            print(f"[GPU {rank}] Merge Acceleration LoRA: {acc_lora_path}")
+            pipeline = merge_lora(pipeline, acc_lora_path, multiplier=1.0, device=device)
+        else:
+            print(f"[GPU {rank}] Warning: Acceleration LoRA not found at {acc_lora_path}")
+
+    # Custom LoRA
+    if args.lora_path is not None:
+        if os.path.exists(args.lora_path):
+            print(f"[GPU {rank}] Loading custom LoRA: {args.lora_path}")
+            pipeline = merge_lora(pipeline, args.lora_path, lora_weight, device=device)
+        else:
+            print(f"[GPU {rank}] Warning: Provided lora_path not found: {args.lora_path}")
+
+    # VideoCoF LoRA
     if args.videocof_path is not None:
         pipeline = merge_lora(pipeline, args.videocof_path, lora_weight, device=device)
         print(f"[GPU {rank}] Loaded LoRA from {args.videocof_path}")
```
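For reference, a self-contained sketch of how the new flags resolve the acceleration LoRA path; it mirrors the argparse additions and the `or`-based fallback in the diff, with `model_name` as a placeholder directory rather than a value from the repository:

```python
# Sketch of the new flag handling; simulates CLI input via parse_args(list).
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--lora_path", type=str, default=None)
parser.add_argument("--enable_acceleration_lora", action="store_true")
parser.add_argument("--acceleration_lora_path", type=str, default=None)
args = parser.parse_args(["--enable_acceleration_lora"])  # simulated CLI input

model_name = "models/Wan2.1-T2V-14B"  # placeholder model directory
if args.enable_acceleration_lora:
    default_acc_path = os.path.join(model_name, "Wan2.1_Text_to_Video_14B_FusionX_LoRA.safetensors")
    # With no explicit --acceleration_lora_path, fall back to the FusionX file.
    acc_lora_path = args.acceleration_lora_path or default_acc_path
    print(acc_lora_path)
```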
videox_fun/ui/ui.py CHANGED
```diff
@@ -194,10 +194,10 @@ def create_prompts(
     negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=2, value=negative_prompt)
     return prompt_textbox, negative_prompt_textbox
 
-def create_samplers(controller, maximum_step=100):
+def create_samplers(controller, maximum_step=50):
     with gr.Row():
         sampler_dropdown = gr.Dropdown(label="Sampling method", choices=list(controller.scheduler_dict.keys()), value=list(controller.scheduler_dict.keys())[0])
-        sample_step_slider = gr.Slider(label="Sampling steps", value=50, minimum=1, maximum=maximum_step, step=1)
+        sample_step_slider = gr.Slider(label="Sampling steps", value=4, minimum=1, maximum=maximum_step, step=1)
 
     return sampler_dropdown, sample_step_slider
```
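A standalone sketch of the updated sampler controls, assuming a stand-in `scheduler_dict` in place of `controller.scheduler_dict`; the lower 4-step default presumably matches the few-step sampling enabled by the FusionX acceleration LoRA elsewhere in this commit:

```python
# Sketch only: previews the new slider defaults outside the controller.
import gradio as gr

scheduler_dict = {"Flow": None, "Euler": None}  # illustrative choices only

with gr.Blocks() as demo:
    with gr.Row():
        sampler_dropdown = gr.Dropdown(
            label="Sampling method",
            choices=list(scheduler_dict.keys()),
            value=list(scheduler_dict.keys())[0],
        )
        # Few-step default; users can still raise it up to maximum_step=50.
        sample_step_slider = gr.Slider(
            label="Sampling steps", value=4, minimum=1, maximum=50, step=1
        )

# demo.launch()  # uncomment to preview locally
```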
 
videox_fun/utils/lora_utils.py CHANGED
```diff
@@ -389,28 +389,9 @@ def merge_lora(pipeline, lora_path, multiplier, device='cpu', dtype=torch.float3
         key = key.replace(".self_attn.", "_self_attn_")
         key = key.replace(".cross_attn.", "_cross_attn_")
         key = key.replace(".ffn.", "_ffn_")
-        key = key.replace("text_embedding.", "text_embedding_")
-        key = key.replace("time_embedding.", "time_embedding_")
         key = key.replace(".lora_A.default.", ".lora_down.")
         key = key.replace(".lora_B.default.", ".lora_up.")
-        key = key.replace(".lora_A.weight", ".lora_down.weight")
-        key = key.replace(".lora_B.weight", ".lora_up.weight")
-
-        if key.endswith(".lora_down.weight"):
-            layer = key[:-len(".lora_down.weight")]
-            elem = "lora_down.weight"
-        elif key.endswith(".lora_up.weight"):
-            layer = key[:-len(".lora_up.weight")]
-            elem = "lora_up.weight"
-        elif key.endswith(".alpha"):
-            layer = key[:-len(".alpha")]
-            elem = "alpha"
-        else:
-            continue
-
-        if layer.endswith("."):
-            layer = layer[:-1]
-
+        layer, elem = key.split('.', 1)
         updates[layer][elem] = value
 
     sequential_cpu_offload_flag = False
@@ -484,10 +465,20 @@ def merge_lora(pipeline, lora_path, multiplier, device='cpu', dtype=torch.float3
         if error_flag:
             continue
 
+        # Some resolved modules (e.g., container blocks/norm-only) may not have a weight parameter.
+        if not hasattr(curr_layer, "weight"):
+            # Skip incompatible / non-leaf modules
+            continue
+
         origin_dtype = curr_layer.weight.data.dtype
         origin_device = curr_layer.weight.data.device
 
         curr_layer = curr_layer.to(device, dtype)
+        # Some checkpoints (e.g., norm-only entries) may not contain both weights.
+        if 'lora_up.weight' not in elems or 'lora_down.weight' not in elems:
+            # Skip incompatible layer instead of raising KeyError
+            curr_layer = curr_layer.to(origin_device, origin_dtype)
+            continue
         weight_up = elems['lora_up.weight'].to(device, dtype)
         weight_down = elems['lora_down.weight'].to(device, dtype)
 
@@ -529,28 +520,9 @@ def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.fl
         key = key.replace(".self_attn.", "_self_attn_")
         key = key.replace(".cross_attn.", "_cross_attn_")
         key = key.replace(".ffn.", "_ffn_")
-        key = key.replace("text_embedding.", "text_embedding_")
-        key = key.replace("time_embedding.", "time_embedding_")
         key = key.replace(".lora_A.default.", ".lora_down.")
         key = key.replace(".lora_B.default.", ".lora_up.")
-        key = key.replace(".lora_A.weight", ".lora_down.weight")
-        key = key.replace(".lora_B.weight", ".lora_up.weight")
-
-        if key.endswith(".lora_down.weight"):
-            layer = key[:-len(".lora_down.weight")]
-            elem = "lora_down.weight"
-        elif key.endswith(".lora_up.weight"):
-            layer = key[:-len(".lora_up.weight")]
-            elem = "lora_up.weight"
-        elif key.endswith(".alpha"):
-            layer = key[:-len(".alpha")]
-            elem = "alpha"
-        else:
-            continue
-
-        if layer.endswith("."):
-            layer = layer[:-1]
-
+        layer, elem = key.split('.', 1)
         updates[layer][elem] = value
 
     sequential_cpu_offload_flag = False
@@ -617,10 +589,16 @@ def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.fl
         if error_flag:
             continue
 
+        if not hasattr(curr_layer, "weight"):
+            continue
+
         origin_dtype = curr_layer.weight.data.dtype
         origin_device = curr_layer.weight.data.device
 
         curr_layer = curr_layer.to(device, dtype)
+        if 'lora_up.weight' not in elems or 'lora_down.weight' not in elems:
+            curr_layer = curr_layer.to(origin_device, origin_dtype)
+            continue
         weight_up = elems['lora_up.weight'].to(device, dtype)
         weight_down = elems['lora_down.weight'].to(device, dtype)
```
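The rewritten key handling drops the suffix-matching fallbacks and assumes every remaining key splits at its first dot into a flattened layer name and an element (`lora_down.weight`, `lora_up.weight`, or `alpha`). A runnable sketch with illustrative keys, assuming earlier replaces not shown in this hunk have already flattened prefixes such as `blocks.0` to `blocks_0`:

```python
# Sketch of the simplified key parsing; sample keys are illustrative only.
from collections import defaultdict

state_dict = {
    "blocks_0.self_attn.q.lora_A.default.weight": "A (rank x in_features)",
    "blocks_0.self_attn.q.lora_B.default.weight": "B (out_features x rank)",
}

updates = defaultdict(dict)
for key, value in state_dict.items():
    key = key.replace(".self_attn.", "_self_attn_")
    key = key.replace(".cross_attn.", "_cross_attn_")
    key = key.replace(".ffn.", "_ffn_")
    key = key.replace(".lora_A.default.", ".lora_down.")
    key = key.replace(".lora_B.default.", ".lora_up.")
    # First remaining dot separates the flattened layer name from the element
    # ("lora_down.weight", "lora_up.weight", or "alpha").
    layer, elem = key.split('.', 1)
    updates[layer][elem] = value

print(dict(updates))
# {'blocks_0_self_attn_q': {'lora_down.weight': ..., 'lora_up.weight': ...}}
```

The new `hasattr(curr_layer, "weight")` and `'lora_up.weight' in elems` guards then let norm-only or container entries fall through instead of raising AttributeError/KeyError during the fold-in, which conventionally computes W += multiplier * (alpha / rank) * (up @ down) on merge and subtracts the same quantity on unmerge.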