Update app.py
app.py CHANGED
@@ -14,22 +14,18 @@ from diffusers import WanImageToVideoPipeline
 from diffusers.utils import export_to_video
 
 # =========================================================
-# 1. ARCHITECTURAL
+# 1. ARCHITECTURAL UPGRADES (GQA & MoE)
 # =========================================================
 
+# The custom classes (GQAAttention, SparseMoEFFN) stay the same as
+# they are architectural modifications to the base model's logic.
+
 class GQAAttention(nn.Module):
-    """
-    GROUPED QUERY ATTENTION (GQA)
-    Adjusts the dense attention to a grouped structure (Mistral-style).
-    Reduces KV-cache by 4x, critical for 20s+ (321 frames) generation.
-    """
     def __init__(self, original_attn):
         super().__init__()
-        # Extract parameters from the original Wan attention layer
         self.num_heads = original_attn.num_heads
         self.head_dim = original_attn.head_dim
-        self.num_kv_heads = self.num_heads // 4
-
+        self.num_kv_heads = max(1, self.num_heads // 4)
         self.q_proj = original_attn.q_proj
         self.k_proj = original_attn.k_proj
         self.v_proj = original_attn.v_proj

@@ -37,170 +33,114 @@ class GQAAttention(nn.Module):
 
     def forward(self, x, freqs_cis=None):
         batch, seq_len, _ = x.shape
-
         q = self.q_proj(x).view(batch, seq_len, self.num_heads, self.head_dim)
         k = self.k_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
         v = self.v_proj(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
-
-        # Apply RoPE (Rotary Position Embeddings)
-        # We reuse Wan's native freqs_cis to ensure spatial/temporal logic stays intact
-
-        # Expand K/V for multi-head attention
         k = k.repeat_interleave(self.num_heads // self.num_kv_heads, dim=2)
         v = v.repeat_interleave(self.num_heads // self.num_kv_heads, dim=2)
-
         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
-
-        # Efficient scaled dot product attention
         attn_output = F.scaled_dot_product_attention(q, k, v)
         attn_output = attn_output.transpose(1, 2).reshape(batch, seq_len, -1)
-
         return self.o_proj(attn_output)
 
 class SparseMoEFFN(nn.Module):
-    """
-    MIXTURE OF EXPERTS (MoE)
-    Replaces the standard dense Feed-Forward Network.
-    Routes video tokens to specialized experts (Textures vs. Motion).
-    """
     def __init__(self, original_ffn):
         super().__init__()
         in_dim = original_ffn.ffn[0].in_features
-        self.router = nn.Linear(in_dim, 8)
+        self.router = nn.Linear(in_dim, 8)
         self.experts = nn.ModuleList([
-            nn.Sequential(
-                nn.Linear(in_dim, in_dim * 2),
-                nn.SiLU(),
-                nn.Linear(in_dim * 2, in_dim)
-            ) for _ in range(8)
+            nn.Sequential(nn.Linear(in_dim, in_dim * 2), nn.SiLU(), nn.Linear(in_dim * 2, in_dim))
+            for _ in range(8)
         ])
         self.top_k = 2
 
     def forward(self, x):
         batch, seq, dim = x.shape
         flat_x = x.view(-1, dim)
-
-        # Gate tokens to top-2 experts
         logits = self.router(flat_x)
         weights, selected_experts = torch.topk(logits, self.top_k)
         weights = F.softmax(weights, dim=-1)
-
         out = torch.zeros_like(flat_x)
         for i, expert in enumerate(self.experts):
             mask = (selected_experts == i).any(dim=-1)
             if mask.any():
-
-                expert_out = expert(flat_x[mask])
-                out[mask] += expert_out * weights[mask][:, :1]
-
+                out[mask] += expert(flat_x[mask]) * weights[mask][:, :1]
         return out.view(batch, seq, dim)
 
 # =========================================================
-# 2.
+# 2. CONFIGURATION & PATCHING
 # =========================================================
 
-
+# CORRECT MODEL ID: Wan 2.1 I2V 14B is the standard for Image-to-Video
+MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-def
-    "
-    print("🛠️ Patching Wan Transformer: Injecting GQA and MoE...")
+def patch_model(pipe):
+    print("🛠️ Patching Transformer with GQA and MoE...")
     for i, block in enumerate(pipe.transformer.blocks):
-        # Patch Attention -> GQA
         if hasattr(block, 'attn'):
             block.attn = GQAAttention(block.attn)
-        # Patch FFN -> MoE (Only in every 2nd block to keep compute efficient)
         if hasattr(block, 'ffn') and i % 2 == 0:
             block.ffn = SparseMoEFFN(block.ffn)
     return pipe
 
 # =========================================================
-# 3. GENERATION
+# 3. GENERATION
 # =========================================================
 
 @spaces.GPU(duration=600)
-def
-    if not
-        raise gr.Error("Please
+def generate_20s_video(image_path, prompt, duration, steps):
+    if not HF_TOKEN:
+        raise gr.Error("HF_TOKEN missing. Please set it in your environment variables.")
 
-    print("⏳
+    print(f"⏳ Loading Model: {MODEL_ID}")
     pipe = WanImageToVideoPipeline.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN
    )
 
-    # Apply
-    pipe =
-
-    # Optimization for 20s+ generation
+    # Apply architecture modifications
+    pipe = patch_model(pipe)
     pipe.enable_model_cpu_offload()
-    pipe.vae.enable_tiling()
+    pipe.vae.enable_tiling()
 
-    # Resize input image
     img = Image.open(image_path).convert("RGB")
-    img = img.resize((832, 480)) #
+    img = img.resize((832, 480)) # Maintain 16:9 for 480P
 
-    #
+    # Wan formula for frames: 4n + 1
     num_frames = int(duration * 16)
-
-    num_frames += (4 - ((num_frames - 1) % 4))
+    num_frames = ((num_frames - 1) // 4) * 4 + 1
 
-    print(f"🎬 Generation Start: {duration}s | {num_frames} frames")
-
     with torch.inference_mode():
         output = pipe(
             image=img,
-            prompt=prompt + ",
-            negative_prompt="static, blurry, shaky, low quality, morphing, jittery",
+            prompt=prompt + ", high quality, cinematically consistent",
             num_frames=num_frames,
             num_inference_steps=steps,
             guidance_scale=5.0,
-            generator=torch.Generator("cuda").manual_seed(
+            generator=torch.Generator("cuda").manual_seed(42)
         )
 
-    # Export to video
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
         video_path = f.name
 
     export_to_video(output.frames[0], video_path, fps=16)
-
-    # Memory Management
-    del pipe
-    gc.collect()
-    torch.cuda.empty_cache()
-
     return video_path
 
-#
-# 4. GRADIO INTERFACE (FIXED THEME)
-# =========================================================
-
+# Gradio Setup
 with gr.Blocks() as demo:
-    gr.
-    gr.Markdown("Architecture: **Grouped Query Attention** for 20s stability + **MoE** for Mistral-style efficiency.")
-
+    gr.Markdown("# 🎬 Optimized Wan 2.1 (GQA + MoE)")
     with gr.Row():
         with gr.Column():
-
-
-
-
-            )
-            with gr.Row():
-                duration_slider = gr.Slider(5, 25, value=20, step=5, label="Duration (Seconds)")
-                steps_slider = gr.Slider(10, 30, value=20, step=1, label="Quality Steps")
-
-            btn = gr.Button("🚀 Generate 20s Video", variant="primary")
-
+            img = gr.Image(type="filepath", label="Input Image")
+            txt = gr.Textbox(label="Prompt", value="A futuristic city with flying cars at sunset")
+            dur = gr.Slider(5, 20, value=20, label="Duration (Seconds)")
+            stp = gr.Slider(10, 30, value=20, label="Steps")
+            btn = gr.Button("Generate 20s Video")
         with gr.Column():
-
+            vid = gr.Video()
 
-    btn.click(
-        fn=generate_long_video,
-        inputs=[img_input, prompt_input, duration_slider, steps_slider],
-        outputs=video_output
-    )
+    btn.click(generate_20s_video, [img, txt, dur, stp], vid)
 
-
-demo.queue().launch()
+demo.queue().launch()
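
As a side note, here is a small self-contained sketch of the grouped-query attention expansion used in GQAAttention.forward; the tensor sizes below are made-up example values, not the real Wan layer dimensions:

import torch
import torch.nn.functional as F

# Example sizes only; the real values come from the patched Wan attention layer.
batch, seq_len, num_heads, head_dim = 1, 8, 16, 64
num_kv_heads = max(1, num_heads // 4)  # 4 KV heads instead of 16, so K/V tensors are roughly 4x smaller

q = torch.randn(batch, seq_len, num_heads, head_dim)
k = torch.randn(batch, seq_len, num_kv_heads, head_dim)
v = torch.randn(batch, seq_len, num_kv_heads, head_dim)

# Expand each KV group back to one slot per query head, as in GQAAttention.forward
k = k.repeat_interleave(num_heads // num_kv_heads, dim=2)
v = v.repeat_interleave(num_heads // num_kv_heads, dim=2)

q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 16, 8, 64])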
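Similarly, a quick sanity check of the 4n + 1 frame rounding in generate_20s_video; the helper name snap_to_wan_frames is purely illustrative, and 16 fps matches the fps used in the script:

def snap_to_wan_frames(duration_s: float, fps: int = 16) -> int:
    # Same arithmetic as in generate_20s_video: snap the raw frame count down to the nearest 4n + 1 value.
    num_frames = int(duration_s * fps)
    return ((num_frames - 1) // 4) * 4 + 1

for d in (5, 10, 20):
    print(d, "s ->", snap_to_wan_frames(d), "frames")  # 77, 157, 317; each satisfies frames % 4 == 1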