Ksjsjjdj committed on
Commit d357fb1 · verified · 1 Parent(s): 8692393

Create app.py

Files changed (1)
  1. app.py +89 -0
app.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import torch
+ import gradio as gr
+ from pathlib import Path
+ from PIL import Image
+ import soundfile as sf
+
+ MODEL_ID = "Wan-AI/Wan2.2-S2V-14B"  # HF repo for the Speech-to-Video model
+ LOCAL_DIR = Path("wan2.2_s2v_model")
+
+ # 🟡 Install dependencies at startup
+ print("Installing dependencies…")
+ os.system("pip install -q diffusers transformers accelerate safetensors gradio soundfile ffmpeg-python huggingface-hub")
+
+ # 💾 Download the model (uses the HF CLI; see the snapshot_download note below)
+ print("Downloading model…")
+ os.system('pip install -q "huggingface_hub[cli]"')
+ os.system(f"huggingface-cli download {MODEL_ID} --local-dir {LOCAL_DIR}")
+
+ # 📦 Import the pipeline (after the install above)
+ from diffusers import DiffusionPipeline
+
+ def load_audio(path):
+     wav, sr = sf.read(path, dtype="float32")  # path string from gr.Audio(type="filepath")
+     if wav.ndim > 1:
+         wav = wav.mean(axis=1)  # downmix multi-channel audio to mono
+     return wav, sr
+
+ def generate_video(image, audio_file):
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     dtype = torch.float16 if device == "cuda" else torch.float32
+
+     # Load the pipeline from the locally downloaded checkpoint
+     print("Loading Diffusers pipeline from:", LOCAL_DIR)
+     pipe = DiffusionPipeline.from_pretrained(
+         LOCAL_DIR,
+         torch_dtype=dtype,
+         use_safetensors=True,
+     )
+     pipe.to(device)  # pipelines do not accept device_map="auto"; move explicitly
+
+     # Prepare inputs
+     audio_array, sample_rate = load_audio(audio_file)
+     init_image = image.convert("RGB")
+
+     # Call the pipeline (adjust the parameters based on your results)
+     out = pipe(
+         image=init_image,
+         audio=audio_array,
+         audio_sample_rate=sample_rate,
+         num_inference_steps=25,
+         guidance_scale=4.0,
+         frame_rate=16,
+         max_frames=64,
+     )
+
+     # Extract frames (video pipelines expose .frames, image pipelines .images)
+     frames = getattr(out, "frames", getattr(out, "images", out))
+
+     # Save the frames as PNGs and encode with ffmpeg (video-only; see the muxing note below)
+     import tempfile, subprocess
+     tmpdir = tempfile.mkdtemp()
+     for i, f in enumerate(frames):
+         fname = Path(tmpdir) / f"frame_{i:04d}.png"
+         f.save(fname)
+
+     out_video = "wan_s2v_output.mp4"
+     subprocess.run([
+         "ffmpeg", "-y", "-framerate", "16",
+         "-i", str(Path(tmpdir) / "frame_%04d.png"),
+         "-c:v", "libx264", "-pix_fmt", "yuv420p", out_video,
+     ], check=True)
+
+     return out_video
+
+ # ────────── Gradio UI ──────────
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🎬 Wan2.2-S2V (Speech-to-Video) Gradio App")
+     with gr.Row():
+         img = gr.Image(type="pil", label="Reference image")
+         audio = gr.Audio(type="filepath", label="Audio (.wav)")
+     btn = gr.Button("Generate Video")
+     out_video = gr.Video(label="Video Result")
+
+     btn.click(generate_video, inputs=[img, audio], outputs=out_video)
+
+ # 🟦 Launch on the Hugging Face Space
+ demo.launch()
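
The ffmpeg call above encodes video only, so the resulting MP4 is silent even though the app is speech-driven. The uploaded speech can be muxed into the clip by giving ffmpeg a second input; a minimal sketch, assuming audio_file is the path string handed in by gr.Audio(type="filepath") and that trimming to the shorter stream is acceptable:

    subprocess.run([
        "ffmpeg", "-y", "-framerate", "16",
        "-i", str(Path(tmpdir) / "frame_%04d.png"),
        "-i", audio_file,               # second input: the uploaded speech track
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        "-c:a", "aac", "-shortest",     # encode the audio, stop at the shorter stream
        out_video,
    ], check=True)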
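
The startup step also shells out to the huggingface-cli binary, which needs an extra pip install of its own. The same download can be done in-process with huggingface_hub.snapshot_download; a sketch, assuming huggingface-hub (already pinned in the pip install line above) is importable at startup:

    from huggingface_hub import snapshot_download

    snapshot_download(repo_id=MODEL_ID, local_dir=LOCAL_DIR)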