Ksjsjjdj committed on
Commit
c0d0dd5
·
verified ·
1 Parent(s): 1c1e20c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +306 -87
app.py CHANGED
@@ -2,54 +2,237 @@ import os
2
  import sys
3
  import subprocess
4
  import traceback
 
 
 
 
5
  from pathlib import Path
6
 
7
- def install_dependencies():
8
- commands = [
9
- "pip install spaces-0.1.0-py3-none-any.whl",
10
- "pip install librosa"
11
- ]
12
- for cmd in commands:
13
- os.system(cmd)
14
-
15
- install_dependencies()
16
 
17
  import spaces
18
- import numpy as np
19
- from PIL import Image
20
- import soundfile as sf
21
  import torch
22
- import gradio as gr
23
  import librosa
 
 
 
24
  from huggingface_hub import snapshot_download
 
25
 
26
  try:
27
  import diffusers
 
 
28
  except ImportError:
29
- os.system("pip install diffusers")
30
- import diffusers
31
 
32
- MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
33
- try:
34
- LOCAL_DIR = snapshot_download(repo_id=MODEL_ID, repo_type="model")
35
- except Exception:
36
- LOCAL_DIR = MODEL_ID
37
 
38
- pipe = None
 
 
 
 
39
 
40
- def load_audio_for_model(audio_filepath):
 
 
 
 
41
  try:
42
- wav, sr = librosa.load(audio_filepath, sr=16000)
43
- return wav, sr
44
- except Exception:
45
- return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
- def to_pil(image):
48
- if image is None: return None
49
- if isinstance(image, Image.Image): return image.convert("RGB")
50
- if isinstance(image, str): return Image.open(image).convert("RGB")
51
- arr = np.array(image)
52
- return Image.fromarray(arr).convert("RGB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  def merge_audio_video(video_path, audio_path, output_path):
55
  cmd = [
@@ -65,74 +248,110 @@ def merge_audio_video(video_path, audio_path, output_path):
65
  subprocess.run(cmd, check=True)
66
  return output_path
67
 
68
- @spaces.GPU(duration=120)
69
- def generate_video(image_input, audio_filepath, prompt):
70
- global pipe
71
-
 
 
 
 
 
 
 
 
 
 
72
  if image_input is None or audio_filepath is None:
73
- raise gr.Error("Error inputs")
 
 
 
 
74
 
75
- try:
76
- if pipe is None:
77
- from diffusers import WanSpeechToVideoPipeline
78
-
79
- pipe = WanSpeechToVideoPipeline.from_pretrained(
80
- LOCAL_DIR,
81
- use_safetensors=True,
82
- torch_dtype=torch.float32
83
- ).to("cpu")
84
-
85
- audio_values, sample_rate = load_audio_for_model(audio_filepath)
86
- init_image = to_pil(image_input)
87
-
88
- w, h = init_image.size
89
- w = (w // 16) * 16
90
- h = (h // 16) * 16
91
- init_image = init_image.resize((w, h), Image.LANCZOS)
92
 
 
93
  out = pipe(
94
  image=init_image,
95
  audio=audio_values,
96
  num_inference_steps=25,
97
  guidance_scale=4.0,
98
  sampling_rate=sample_rate,
99
- prompt=prompt
 
100
  )
101
-
102
- frames = out.frames[0]
103
-
104
- temp_mute_video = "temp_mute.mp4"
105
- final_video = "output_s2v.mp4"
106
-
107
- from diffusers.utils import export_to_video
108
- export_to_video(frames, temp_mute_video, fps=16)
109
-
110
- final_output = merge_audio_video(temp_mute_video, audio_filepath, final_video)
111
-
112
- return final_output
113
 
114
- except Exception as e:
115
- traceback.print_exc()
116
- raise gr.Error(str(e))
 
117
 
118
- with gr.Blocks(title="Wan2.1 Speech to Video") as demo:
119
- gr.Markdown("# Wan2.2-S2V Generador de Video")
120
 
121
- with gr.Row():
122
- with gr.Column():
123
- img_input = gr.Image(label="Imagen de referencia", type="pil")
124
- audio_input = gr.Audio(label="Audio (.wav)", type="filepath")
125
- prompt_input = gr.Textbox(label="Prompt")
126
- btn = gr.Button("Generar Video", variant="primary")
127
-
128
- with gr.Column():
129
- video_output = gr.Video(label="Resultado")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
- btn.click(
132
- fn=generate_video,
133
- inputs=[img_input, audio_input, prompt_input],
134
- outputs=video_output
135
- )
 
 
 
 
 
 
 
 
 
 
136
 
137
  if __name__ == "__main__":
138
- demo.launch()
 
2
  import sys
3
  import subprocess
4
  import traceback
5
+ import gc
6
+ import tempfile
7
+ import random
8
+ import time
9
  from pathlib import Path
10
 
11
+ os.system("pip install spaces-0.1.0-py3-none-any.whl moviepy==1.0.3 imageio[ffmpeg] librosa soundfile diffusers accelerate")
 
 
 
 
 
 
 
 
12
 
13
  import spaces
 
 
 
14
  import torch
15
+ import numpy as np
16
  import librosa
17
+ import soundfile as sf
18
+ from PIL import Image
19
+ from moviepy.editor import VideoFileClip, concatenate_videoclips
20
  from huggingface_hub import snapshot_download
21
+ import gradio as gr
22
 
23
  try:
24
  import diffusers
25
+ from diffusers import AutoencoderKLWan, WanPipeline, WanImageToVideoPipeline, UniPCMultistepScheduler, WanSpeechToVideoPipeline
26
+ from diffusers.utils import export_to_video
27
  except ImportError:
28
+ pass
 
29
 
30
# Hub repo IDs for the two model families served by this app.
MODEL_ID_TI2V = "FastVideo/FastWan2.2-TI2V-5B-FullAttn-Diffusers"  # text/image-to-video (5B fast variant)
MODEL_ID_S2V = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"  # speech-to-video (14B)

# Global pipeline registry, populated once by load_models_at_startup().
# Values stay None when a pipeline fails to load; handlers check for that.
MODELS = {
    "ti2v_text": None,
    "ti2v_image": None,
    "s2v": None
}

# Preferred compute device; CPU fallback keeps the app importable without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
40
+
41
def _place_on_device(pipeline):
    """Best-effort device placement for a diffusers pipeline.

    On CUDA, prefer model CPU-offload (lower peak VRAM); otherwise move the
    whole pipeline to DEVICE. A RuntimeError (typically OOM) falls back to
    plain CPU so the app still starts.
    """
    try:
        if DEVICE == "cuda":
            pipeline.enable_model_cpu_offload()
        else:
            pipeline.to(DEVICE)
    except RuntimeError:
        pipeline.to("cpu")


def load_models_at_startup():
    """Populate the global MODELS registry with the TI2V and S2V pipelines.

    The TI2V pair and the S2V pipeline are loaded in independent try blocks
    so a failure in one family (missing weights, OOM, ...) does not prevent
    the other tab from working. Failures are logged with a traceback instead
    of being silently swallowed, so startup problems are diagnosable.
    """
    global MODELS

    try:
        # Both TI2V pipelines share a single float32 VAE to save memory.
        vae = AutoencoderKLWan.from_pretrained(MODEL_ID_TI2V, subfolder="vae", torch_dtype=torch.float32)

        text_pipe = WanPipeline.from_pretrained(MODEL_ID_TI2V, vae=vae, torch_dtype=torch.bfloat16)
        text_pipe.scheduler = UniPCMultistepScheduler.from_config(text_pipe.scheduler.config, flow_shift=8.0)
        _place_on_device(text_pipe)
        MODELS["ti2v_text"] = text_pipe

        image_pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID_TI2V, vae=vae, torch_dtype=torch.bfloat16)
        image_pipe.scheduler = UniPCMultistepScheduler.from_config(image_pipe.scheduler.config, flow_shift=8.0)
        _place_on_device(image_pipe)
        MODELS["ti2v_image"] = image_pipe
    except Exception:
        # Keep the app usable (the S2V tab may still work); log for diagnosis.
        traceback.print_exc()

    try:
        s2v_pipe = WanSpeechToVideoPipeline.from_pretrained(
            MODEL_ID_S2V,
            torch_dtype=torch.bfloat16,
        )
        _place_on_device(s2v_pipe)
        MODELS["s2v"] = s2v_pipe
    except Exception:
        traceback.print_exc()
92
+
93
+ load_models_at_startup()
94
+
95
def auto_duration_estimator(mode, input_data, duration_val):
    """Estimate the GPU allocation (in seconds) needed for one generation call.

    Args:
        mode: ``"s2v"`` (then ``input_data`` is an audio file path) or any
            other value for TI2V (then ``input_data`` is the uploaded image
            list).
        input_data: audio path or image list, depending on ``mode``.
        duration_val: requested output video duration in seconds (TI2V only).

    Returns:
        int: estimated wall-clock seconds, including a fixed startup overhead.
    """
    base_overhead = 45  # model warm-up / scheduling slack
    if mode == "s2v":
        audio_path = input_data
        if audio_path:
            try:
                # librosa >= 0.10 takes ``path=`` (``filename=`` is deprecated).
                dur = librosa.get_duration(path=audio_path)
                return int(base_overhead + (dur * 15))
            except Exception:
                # Unreadable audio: fall back to a conservative estimate.
                return 120
        return 120

    # TI2V: each image yields at least ~2 s of video, so the effective total
    # duration is the larger of the user request and 2 s per image.
    num_images = len(input_data) if input_data else 0
    if num_images > 0:
        total_seconds = max(duration_val, num_images * 2)
    else:
        total_seconds = duration_val
    return int(base_overhead + (total_seconds * 12))
113
+
114
def fast_stitch_videos(video_paths):
    """Concatenate MP4 clips losslessly using ffmpeg's concat demuxer.

    Stream copy (``-c copy``) avoids re-encoding; this is valid because all
    clips are generated by this app with identical codec/fps/resolution.

    Returns:
        The stitched file path; the single input when only one clip exists;
        ``None`` for an empty/missing list; or, as a best-effort fallback on
        ffmpeg failure, the most recent clip so the UI still shows something.
    """
    if not video_paths:
        return None
    if len(video_paths) == 1:
        return video_paths[0]

    list_path = None
    try:
        # The concat demuxer reads its inputs from a text listing file.
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            for path in video_paths:
                f.write(f"file '{path}'\n")
            list_path = f.name

        # Reserve an output filename; ffmpeg overwrites it via ``-y``.
        with tempfile.NamedTemporaryFile(suffix="_stitched_stream.mp4", delete=False) as tmp:
            out_path = tmp.name

        cmd = [
            "ffmpeg", "-y", "-f", "concat", "-safe", "0",
            "-i", list_path, "-c", "copy", out_path
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return out_path
    except Exception:
        # Best effort: return the latest chunk rather than nothing.
        traceback.print_exc()
        return video_paths[-1]
    finally:
        # Remove the listing file even when ffmpeg fails (the original
        # implementation leaked it on the error path).
        if list_path is not None:
            try:
                os.remove(list_path)
            except OSError:
                pass
136
+
137
# Duration lambda: args[0] is input_files, args[5] is duration_seconds
# (positions match the function signature below).
@spaces.GPU(duration=lambda *args: auto_duration_estimator("ti2v", args[0], args[5]))
def generate_ti2v_gpu_stream(input_files, prompt, height, width, negative_prompt, duration_seconds, guidance_scale, steps, seed, randomize_seed, progress=gr.Progress(track_tqdm=True)):
    """Stream TI2V generation: yields (stitched_video_path, last_frame, seed)
    after every generated chunk so the UI updates progressively.

    With input images: one image-to-video chunk per image. Without images:
    text-to-video chunks until ``duration_seconds`` is covered.
    """
    global MODELS
    text_to_video_pipe = MODELS.get("ti2v_text")
    image_to_video_pipe = MODELS.get("ti2v_image")

    if not text_to_video_pipe or not image_to_video_pipe:
        raise gr.Error("Models failed to load at startup. Check system memory.")

    # Wan requires spatial dims divisible by 32; round down, floor at 32.
    MOD_VALUE = 32
    target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
    target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

    master_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)

    video_clips_paths = []
    pil_images = []

    if input_files:
        # gr.File may hand back a single item or a list; items may be
        # tempfile-like objects (with .name) or plain path strings.
        files_list = input_files if isinstance(input_files, list) else [input_files]
        for f in files_list:
            try:
                path = f.name if hasattr(f, "name") else f
                img = Image.open(path).convert("RGB")
                pil_images.append(img)
            except:
                # NOTE(review): bare except — unreadable uploads are skipped.
                continue

    SAFE_CHUNK_DURATION = 4.0  # seconds per generated chunk (VRAM-safe)
    FIXED_FPS = 24

    last_preview_frame = None

    if len(pil_images) > 0:
        # Image-driven path: split the requested duration evenly, >= 2 s each.
        seconds_per_image = max(2.0, duration_seconds / len(pil_images))

        for i, img in enumerate(pil_images):
            current_chunk_duration = min(seconds_per_image, SAFE_CHUNK_DURATION)
            num_frames = int(current_chunk_duration * FIXED_FPS)

            # Per-image seed offset keeps chunks distinct yet reproducible.
            local_seed = master_seed + i
            generator = torch.Generator(device=DEVICE).manual_seed(local_seed)
            resized_image = img.resize((target_w, target_h))

            try:
                with torch.inference_mode():
                    output_frames = image_to_video_pipe(
                        image=resized_image,
                        prompt=prompt,
                        negative_prompt=negative_prompt,
                        height=target_h,
                        width=target_w,
                        num_frames=num_frames,
                        guidance_scale=float(guidance_scale),
                        num_inference_steps=int(steps),
                        generator=generator
                    ).frames[0]

                # delete=False: the path must outlive this block for stitching.
                with tempfile.NamedTemporaryFile(suffix=f"_img_{i}.mp4", delete=False) as tmp:
                    export_to_video(output_frames, tmp.name, fps=FIXED_FPS)
                    video_clips_paths.append(tmp.name)

                if len(output_frames) > 0:
                    last_preview_frame = output_frames[-1]

                # Re-stitch everything so far and push a UI update.
                current_stitched = fast_stitch_videos(video_clips_paths)
                yield current_stitched, last_preview_frame, master_seed

            except Exception:
                # A failed chunk is dropped; remaining images still render.
                continue
    else:
        # Text-only path: cover duration_seconds with fixed-size chunks.
        num_chunks = int(np.ceil(duration_seconds / SAFE_CHUNK_DURATION))
        frames_per_chunk = int(SAFE_CHUNK_DURATION * FIXED_FPS)

        for i in range(num_chunks):
            # Spread chunk seeds apart so consecutive chunks differ visibly.
            chunk_seed = master_seed + (i * 100)
            generator = torch.Generator(device=DEVICE).manual_seed(chunk_seed)

            with torch.inference_mode():
                output_frames = text_to_video_pipe(
                    prompt=prompt,
                    negative_prompt=negative_prompt,
                    height=target_h,
                    width=target_w,
                    num_frames=frames_per_chunk,
                    guidance_scale=float(guidance_scale),
                    num_inference_steps=int(steps),
                    generator=generator
                ).frames[0]

            with tempfile.NamedTemporaryFile(suffix=f"_chunk_{i}.mp4", delete=False) as tmp:
                export_to_video(output_frames, tmp.name, fps=FIXED_FPS)
                video_clips_paths.append(tmp.name)

            if len(output_frames) > 0:
                last_preview_frame = output_frames[-1]

            current_stitched = fast_stitch_videos(video_clips_paths)
            yield current_stitched, last_preview_frame, master_seed
236
 
237
  def merge_audio_video(video_path, audio_path, output_path):
238
  cmd = [
 
248
  subprocess.run(cmd, check=True)
249
  return output_path
250
 
251
def load_audio_for_model(audio_filepath):
    """Load audio resampled to the 16 kHz mono rate the S2V model expects.

    Returns:
        tuple: ``(waveform, sample_rate)`` on success, or ``(None, None)``
        when the file cannot be decoded — the caller maps that sentinel to a
        user-facing error.
    """
    try:
        wav, sr = librosa.load(audio_filepath, sr=16000)
        return wav, sr
    except Exception:
        # Narrowed from a bare ``except:`` (which also caught
        # KeyboardInterrupt/SystemExit); log so decode failures are visible.
        traceback.print_exc()
        return None, None
257
+
258
# Duration lambda: args[1] is the audio filepath; TI2V duration is unused (0).
@spaces.GPU(duration=lambda *args: auto_duration_estimator("s2v", args[1], 0))
def generate_s2v_gpu(image_input, audio_filepath, prompt, seed, randomize_seed):
    """Generate a talking video from a reference image plus an audio track.

    Returns (final_video_path, seed_used); raises gr.Error on missing model,
    missing inputs, or undecodable audio.
    """
    global MODELS
    pipe = MODELS.get("s2v")
    if not pipe:
        raise gr.Error("S2V Model not initialized.")

    if image_input is None or audio_filepath is None:
        raise gr.Error("Inputs Missing")

    # 16 kHz waveform; (None, None) signals an unreadable file.
    audio_values, sample_rate = load_audio_for_model(audio_filepath)
    if audio_values is None:
        raise gr.Error("Invalid Audio")

    # Snap dimensions down to multiples of 16 as the pipeline requires.
    init_image = image_input.convert("RGB")
    w, h = init_image.size
    w = (w // 16) * 16
    h = (h // 16) * 16
    init_image = init_image.resize((w, h), Image.LANCZOS)

    current_seed = random.randint(0, 2**32 - 1) if randomize_seed else int(seed)
    generator = torch.Generator(device=DEVICE).manual_seed(current_seed)

    with torch.inference_mode():
        out = pipe(
            image=init_image,
            audio=audio_values,
            num_inference_steps=25,
            guidance_scale=4.0,
            sampling_rate=sample_rate,
            prompt=prompt,
            generator=generator
        )

    frames = out.frames[0]

    # Reserve unique output paths; delete=False so they survive the block.
    with tempfile.NamedTemporaryFile(suffix="_temp_mute.mp4", delete=False) as tmp_vid:
        temp_mute_path = tmp_vid.name

    with tempfile.NamedTemporaryFile(suffix="_output_s2v.mp4", delete=False) as tmp_final:
        final_video_path = tmp_final.name

    # Export the silent frames, then mux the original audio back in.
    export_to_video(frames, temp_mute_path, fps=30)
    final_output = merge_audio_video(temp_mute_path, audio_filepath, final_video_path)

    return final_output, current_seed
304
 
305
# ---------------------------------------------------------------------------
# Gradio UI: two tabs — streaming TI2V generation and one-shot S2V generation.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Wan 2.2 Unified Streaming Video Platform")

    with gr.Tabs():
        with gr.TabItem("Text & Image to Video (Streaming & Long Duration)"):
            with gr.Row():
                with gr.Column(scale=1):
                    # Multiple images → one video chunk per image.
                    ti2v_files = gr.File(label="Input Images", file_count="multiple", type="filepath", file_types=["image"])
                    ti2v_prompt = gr.Textbox(label="Prompt", value="Cinematic view, realistic lighting, 4k", lines=2)
                    ti2v_duration = gr.Slider(minimum=2, maximum=300, step=1, value=5, label="Total Duration (s)")

                    with gr.Accordion("Advanced", open=False):
                        ti2v_neg = gr.Textbox(label="Negative Prompt", value="low quality, distortion, text, watermark", lines=2)
                        ti2v_seed = gr.Slider(label="Seed", minimum=0, maximum=2**32-1, step=1, value=42)
                        ti2v_rand = gr.Checkbox(label="Random Seed", value=True)
                        with gr.Row():
                            # Positional Slider args: minimum, maximum, step, value.
                            ti2v_h = gr.Slider(256, 1024, 32, 832, label="Height")
                            ti2v_w = gr.Slider(256, 1024, 32, 832, label="Width")
                        ti2v_steps = gr.Slider(2, 10, 1, 4, label="Steps")
                        ti2v_scale = gr.Slider(1.0, 8.0, 0.1, 5.0, label="CFG")

                    btn_ti2v = gr.Button("Start Streaming Generation", variant="primary")

                with gr.Column(scale=2):
                    with gr.Row():
                        out_ti2v = gr.Video(label="Live Video Stream", autoplay=True)
                        out_preview_ti2v = gr.Image(label="Last Frame Preview", interactive=False)
                    out_seed_ti2v = gr.Number(label="Seed Used")

            # Generator handler: each yield updates all three outputs.
            btn_ti2v.click(
                fn=generate_ti2v_gpu_stream,
                inputs=[ti2v_files, ti2v_prompt, ti2v_h, ti2v_w, ti2v_neg, ti2v_duration, ti2v_scale, ti2v_steps, ti2v_seed, ti2v_rand],
                outputs=[out_ti2v, out_preview_ti2v, out_seed_ti2v]
            )

        with gr.TabItem("Speech to Video (S2V)"):
            with gr.Row():
                with gr.Column(scale=1):
                    s2v_img = gr.Image(label="Reference Image", type="pil")
                    s2v_audio = gr.Audio(label="Audio Input", type="filepath")
                    s2v_prompt = gr.Textbox(label="Prompt", value="Realistic movement, talking face")
                    s2v_seed = gr.Slider(label="Seed", minimum=0, maximum=2**32-1, step=1, value=42)
                    s2v_rand = gr.Checkbox(label="Random Seed", value=True)
                    btn_s2v = gr.Button("Generate S2V", variant="primary")

                with gr.Column(scale=2):
                    out_s2v = gr.Video(label="Result")
                    out_seed_s2v = gr.Number(label="Seed Used")

            btn_s2v.click(generate_s2v_gpu, [s2v_img, s2v_audio, s2v_prompt, s2v_seed, s2v_rand], [out_s2v, out_seed_s2v])

if __name__ == "__main__":
    # queue() is required for streaming (generator) handlers and GPU scheduling.
    demo.queue().launch()