Ksjsjjdj committed on
Commit
44fb0fb
·
verified ·
1 Parent(s): 458786f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -40
app.py CHANGED
@@ -3,7 +3,10 @@ os.system("pip install -q git+https://github.com/tolgacangoz/diffusers.git@integ
3
  os.system("pip install -q ./spaces-0.1.0-py3-none-any.whl || pip install -q spaces || true")
4
  from huggingface_hub import snapshot_download
5
  MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
6
- LOCAL_DIR = snapshot_download(repo_id=MODEL_ID, repo_type="model")
 
 
 
7
  import gradio as gr
8
  from pathlib import Path
9
  from PIL import Image
@@ -41,11 +44,29 @@ def load_audio(audio):
41
  return wav, sr
42
  except Exception:
43
  return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  @spaces.GPU(duration=120)
45
  def generate_video_gpu(image, audio_file):
46
  global pipe
47
  import torch
48
- import tempfile, subprocess
49
  from pathlib import Path as _P
50
  try:
51
  from diffusers import WanSpeechToVideoPipeline as PipelineClass
@@ -55,51 +76,66 @@ def generate_video_gpu(image, audio_file):
55
  except Exception:
56
  from diffusers import DiffusionPipeline as PipelineClass
57
  dtype = torch.float16
58
- if pipe is None:
59
- pipe = PipelineClass.from_pretrained(
60
- LOCAL_DIR,
61
- torch_dtype=dtype,
62
- use_safetensors=True,
63
- device_map="balanced"
64
- )
65
- audio_array, sample_rate = load_audio(audio_file)
66
- if audio_array is None or sample_rate is None:
67
- return None
68
- init_image = image.convert("RGB")
69
- out = pipe(
70
- image=init_image,
71
- audio=audio_array,
72
- audio_sample_rate=sample_rate,
73
- num_inference_steps=25,
74
- guidance_scale=4.0,
75
- frame_rate=16,
76
- max_frames=64,
77
- )
78
- frames = getattr(out, "frames", getattr(out, "images", out))
79
- out_path = "wan_s2v_output.mp4"
80
  try:
81
- from diffusers.utils import export_to_video
82
- export_to_video(frames, out_path, fps=16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  except Exception:
84
- tmpdir = tempfile.mkdtemp()
85
- for i, f in enumerate(frames):
86
- fname = _P(tmpdir) / f"frame_{i:04d}.png"
87
- if hasattr(f, "save"):
88
- f.save(fname)
89
- else:
90
- Image.fromarray((np.array(f) * 255).astype("uint8")).save(fname)
91
- subprocess.run([
92
- "ffmpeg", "-y", "-framerate", "16",
93
- "-i", str(_P(tmpdir) / "frame_%04d.png"),
94
- "-c:v", "libx264", "-pix_fmt", "yuv420p", out_path
95
- ], check=True)
96
- return out_path
97
  def generate_video(image, audio):
98
  return generate_video_gpu(image, audio)
99
  with gr.Blocks() as demo:
100
  gr.Markdown("# Wan2.2-S2V Gradio Space")
101
  with gr.Row():
102
- img = gr.Image(label="Imagen de referencia")
103
  audio = gr.Audio(label="Audio (.wav)", type="numpy")
104
  btn = gr.Button("Generar Video")
105
  out_video = gr.Video(label="Resultado de Video")
 
3
  os.system("pip install -q ./spaces-0.1.0-py3-none-any.whl || pip install -q spaces || true")
4
  from huggingface_hub import snapshot_download
5
  MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
6
+ try:
7
+ LOCAL_DIR = snapshot_download(repo_id=MODEL_ID, repo_type="model")
8
+ except Exception:
9
+ LOCAL_DIR = MODEL_ID
10
  import gradio as gr
11
  from pathlib import Path
12
  from PIL import Image
 
44
  return wav, sr
45
  except Exception:
46
  return None, None
47
def to_pil(image):
    """Convert a loosely-typed image input into an RGB ``PIL.Image``.

    Args:
        image: ``None``, a ``PIL.Image.Image``, a filesystem path
            (``str`` or ``pathlib.Path``), a NumPy array, or anything
            ``np.asarray`` can turn into an array.

    Returns:
        An RGB ``PIL.Image.Image``, or ``None`` when ``image`` is ``None``
        or an empty array (the caller treats ``None`` as "no usable image").
    """
    if image is None:
        return None

    # Arrays are checked first; the other accepted types follow.
    if isinstance(image, np.ndarray):
        arr = image
    elif isinstance(image, Image.Image):
        return image.convert("RGB")
    elif isinstance(image, (str, Path)):
        # Close the underlying file deterministically; convert() returns a
        # fully loaded copy that does not depend on the open handle.
        with Image.open(str(image)) as opened:
            return opened.convert("RGB")
    else:
        arr = np.asarray(image)

    # max() on an empty array raises; there is nothing to convert anyway.
    if arr.size == 0:
        return None

    if arr.dtype != np.uint8:
        # Values that top out at 1.0 are treated as normalised [0, 1] data
        # and stretched to [0, 255]; anything else is only clipped.
        scale = 255 if arr.max() <= 1.0 else 1
        arr = (arr * scale).clip(0, 255).astype("uint8")

    # (H, W, 1) grayscale and (H, W, 2) gray+alpha: keep the gray plane so
    # it is expanded to RGB below instead of failing in fromarray().
    if arr.ndim == 3 and arr.shape[2] in (1, 2):
        arr = arr[..., 0]

    if arr.ndim == 2:
        # Replicate the single plane into three identical channels.
        arr = np.stack([arr] * 3, axis=-1)
    elif arr.ndim == 3 and arr.shape[2] == 4:
        # Drop the alpha channel of RGBA input.
        arr = arr[..., :3]

    return Image.fromarray(np.ascontiguousarray(arr))
65
  @spaces.GPU(duration=120)
66
  def generate_video_gpu(image, audio_file):
67
  global pipe
68
  import torch
69
+ import tempfile, subprocess, traceback
70
  from pathlib import Path as _P
71
  try:
72
  from diffusers import WanSpeechToVideoPipeline as PipelineClass
 
76
  except Exception:
77
  from diffusers import DiffusionPipeline as PipelineClass
78
  dtype = torch.float16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
+ if pipe is None:
81
+ try:
82
+ pipe = PipelineClass.from_pretrained(
83
+ LOCAL_DIR,
84
+ torch_dtype=dtype,
85
+ use_safetensors=True,
86
+ device_map="balanced"
87
+ )
88
+ except Exception:
89
+ pipe = PipelineClass.from_pretrained(
90
+ MODEL_ID,
91
+ torch_dtype=dtype,
92
+ use_safetensors=True,
93
+ device_map="balanced"
94
+ )
95
+ audio_array, sample_rate = load_audio(audio_file)
96
+ if audio_array is None or sample_rate is None:
97
+ return None
98
+ init_image = to_pil(image)
99
+ if init_image is None:
100
+ return None
101
+ out = pipe(
102
+ image=init_image,
103
+ audio=audio_array,
104
+ audio_sample_rate=sample_rate,
105
+ num_inference_steps=25,
106
+ guidance_scale=4.0,
107
+ frame_rate=16,
108
+ max_frames=64,
109
+ )
110
+ frames = getattr(out, "frames", getattr(out, "images", out))
111
+ out_path = "wan_s2v_output.mp4"
112
+ try:
113
+ from diffusers.utils import export_to_video
114
+ export_to_video(frames, out_path, fps=16)
115
+ except Exception:
116
+ tmpdir = tempfile.mkdtemp()
117
+ for i, f in enumerate(frames):
118
+ fname = _P(tmpdir) / f"frame_{i:04d}.png"
119
+ if hasattr(f, "save"):
120
+ f.save(fname)
121
+ else:
122
+ Image.fromarray((np.array(f) * 255).clip(0,255).astype("uint8")).save(fname)
123
+ subprocess.run([
124
+ "ffmpeg", "-y", "-framerate", "16",
125
+ "-i", str(_P(tmpdir) / "frame_%04d.png"),
126
+ "-c:v", "libx264", "-pix_fmt", "yuv420p", out_path
127
+ ], check=True)
128
+ return out_path
129
  except Exception:
130
+ with open("error.log", "a") as _f:
131
+ _f.write(traceback.format_exc() + "\n")
132
+ return None
 
 
 
 
 
 
 
 
 
 
133
  def generate_video(image, audio):
134
  return generate_video_gpu(image, audio)
135
  with gr.Blocks() as demo:
136
  gr.Markdown("# Wan2.2-S2V Gradio Space")
137
  with gr.Row():
138
+ img = gr.Image(label="Imagen de referencia", type="numpy")
139
  audio = gr.Audio(label="Audio (.wav)", type="numpy")
140
  btn = gr.Button("Generar Video")
141
  out_video = gr.Video(label="Resultado de Video")