Ksjsjjdj committed on
Commit
668d06a
verified
1 Parent(s): 9deb6ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -48
app.py CHANGED
@@ -4,15 +4,11 @@ import subprocess
4
  import traceback
5
  from pathlib import Path
6
 
7
- # --- 1. INSTALACIÓN DE LIBRERÍAS ---
8
  def install_dependencies():
9
- print("Instalando librerías necesarias...")
10
- # Se eliminó 'spaces' de la lista
11
  commands = [
12
  "pip install spaces-0.1.0-py3-none-any.whl"
13
  ]
14
  for cmd in commands:
15
- print(f"Ejecutando: {cmd}")
16
  os.system(cmd)
17
 
18
  install_dependencies()
@@ -24,7 +20,6 @@ import torch
24
  import gradio as gr
25
  from huggingface_hub import snapshot_download
26
 
27
- # Verificación de librerías críticas
28
  try:
29
  import diffusers
30
  import accelerate
@@ -35,41 +30,31 @@ except ImportError:
35
 
36
  import spaces
37
 
38
- # --- 2. Descarga del Modelo ---
39
  MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
40
- print(f"Verificando modelo {MODEL_ID}...")
41
  try:
42
  LOCAL_DIR = snapshot_download(repo_id=MODEL_ID, repo_type="model")
43
- except Exception as e:
44
- print(f"Error descargando modelo, usando ID remoto: {e}")
45
  LOCAL_DIR = MODEL_ID
46
 
47
- # Variable global para el pipeline
48
  pipe = None
49
 
50
- # --- 3. Funciones Auxiliares ---
51
  def load_audio_for_model(audio_filepath):
52
- """Carga el audio y lo prepara para el pipeline"""
53
  try:
54
  wav, sr = sf.read(audio_filepath)
55
- # Convertir a float32 si es necesario
56
  if wav.dtype != np.float32:
57
  if np.issubdtype(wav.dtype, np.integer):
58
  wav = wav.astype("float32") / 32768.0
59
  else:
60
  wav = wav.astype("float32")
61
 
62
- # Mezclar a mono si es estéreo
63
  if wav.ndim > 1:
64
  wav = wav.mean(axis=1)
65
 
66
  return wav, sr
67
- except Exception as e:
68
- print(f"Error cargando audio: {e}")
69
  return None, None
70
 
71
  def to_pil(image):
72
- """Convierte cualquier entrada a PIL Image RGB"""
73
  if image is None: return None
74
  if isinstance(image, Image.Image): return image.convert("RGB")
75
  if isinstance(image, str): return Image.open(image).convert("RGB")
@@ -77,86 +62,68 @@ def to_pil(image):
77
  return Image.fromarray(arr).convert("RGB")
78
 
79
  def merge_audio_video(video_path, audio_path, output_path):
80
- """Combina el video generado con el audio original usando FFmpeg"""
81
- print("Combinando audio y video...")
82
  cmd = [
83
  "ffmpeg", "-y",
84
- "-i", video_path, # Video input
85
- "-i", audio_path, # Audio input
86
- "-c:v", "copy", # Copiar stream de video (no re-codificar)
87
- "-c:a", "aac", # Codificar audio a AAC
88
- "-map", "0:v:0", "-map", "1:a:0",
89
- "-shortest", # Cortar al más corto
90
  output_path
91
  ]
92
  subprocess.run(cmd, check=True)
93
  return output_path
94
 
95
- # --- 4. Generación ---
96
- # Se eliminó el decorador @spaces.GPU
97
  @spaces.GPU(duration=120)
98
- def generate_video(image_input, audio_filepath):
99
  global pipe
100
 
101
- # 1. Validaciones
102
  if image_input is None or audio_filepath is None:
103
- raise gr.Error("Por favor sube una imagen y un audio.")
104
 
105
- print(f"Procesando audio: {audio_filepath}")
106
-
107
  try:
108
- # 2. Carga del Modelo (Lazy Loading)
109
  if pipe is None:
110
- print("Cargando pipeline en memoria...")
111
  from diffusers import WanSpeechToVideoPipeline
112
 
113
- # Se eliminaron las comprobaciones explícitas de CUDA/CPU/Device map
114
- # El pipeline usará la configuración por defecto de torch/accelerate
115
  pipe = WanSpeechToVideoPipeline.from_pretrained(
116
  LOCAL_DIR,
117
  use_safetensors=True
118
  )
119
 
120
- # 3. Preparar inputs
121
  audio_values, sample_rate = load_audio_for_model(audio_filepath)
122
  init_image = to_pil(image_input)
123
 
124
- # Redimensionar imagen (múltiplos de 16)
125
  w, h = init_image.size
126
  w = (w // 16) * 16
127
  h = (h // 16) * 16
128
  init_image = init_image.resize((w, h), Image.LANCZOS)
129
 
130
- print("Iniciando inferencia...")
131
-
132
- # 4. Inferencia
133
  out = pipe(
134
  image=init_image,
135
  audio=audio_values,
136
  num_inference_steps=25,
137
- guidance_scale=4.0
 
 
138
  )
139
 
140
  frames = out.frames[0]
141
 
142
- # 5. Exportar Video Mudo Temporal
143
  temp_mute_video = "temp_mute.mp4"
144
  final_video = "output_s2v.mp4"
145
 
146
  from diffusers.utils import export_to_video
147
  export_to_video(frames, temp_mute_video, fps=16)
148
 
149
- # 6. Añadir Audio
150
  final_output = merge_audio_video(temp_mute_video, audio_filepath, final_video)
151
 
152
  return final_output
153
 
154
  except Exception as e:
155
- print("ERROR CRÍTICO DURANTE LA GENERACIÓN:")
156
  traceback.print_exc()
157
- raise gr.Error(f"Error generando video: {str(e)}")
158
 
159
- # --- 5. Interfaz Gradio ---
160
  with gr.Blocks(title="Wan2.1 Speech to Video") as demo:
161
  gr.Markdown("# Wan2.2-S2V Generador de Video")
162
 
@@ -164,6 +131,7 @@ with gr.Blocks(title="Wan2.1 Speech to Video") as demo:
164
  with gr.Column():
165
  img_input = gr.Image(label="Imagen de referencia", type="pil")
166
  audio_input = gr.Audio(label="Audio (.wav)", type="filepath")
 
167
  btn = gr.Button("Generar Video", variant="primary")
168
 
169
  with gr.Column():
@@ -171,7 +139,7 @@ with gr.Blocks(title="Wan2.1 Speech to Video") as demo:
171
 
172
  btn.click(
173
  fn=generate_video,
174
- inputs=[img_input, audio_input],
175
  outputs=video_output
176
  )
177
 
 
4
  import traceback
5
  from pathlib import Path
6
 
 
7
  def install_dependencies():
 
 
8
  commands = [
9
  "pip install spaces-0.1.0-py3-none-any.whl"
10
  ]
11
  for cmd in commands:
 
12
  os.system(cmd)
13
 
14
  install_dependencies()
 
20
  import gradio as gr
21
  from huggingface_hub import snapshot_download
22
 
 
23
  try:
24
  import diffusers
25
  import accelerate
 
30
 
31
  import spaces
32
 
 
33
  MODEL_ID = "tolgacangoz/Wan2.2-S2V-14B-Diffusers"
 
34
  try:
35
  LOCAL_DIR = snapshot_download(repo_id=MODEL_ID, repo_type="model")
36
+ except Exception:
 
37
  LOCAL_DIR = MODEL_ID
38
 
 
39
  pipe = None
40
 
 
41
  def load_audio_for_model(audio_filepath):
 
42
  try:
43
  wav, sr = sf.read(audio_filepath)
 
44
  if wav.dtype != np.float32:
45
  if np.issubdtype(wav.dtype, np.integer):
46
  wav = wav.astype("float32") / 32768.0
47
  else:
48
  wav = wav.astype("float32")
49
 
 
50
  if wav.ndim > 1:
51
  wav = wav.mean(axis=1)
52
 
53
  return wav, sr
54
+ except Exception:
 
55
  return None, None
56
 
57
  def to_pil(image):
 
58
  if image is None: return None
59
  if isinstance(image, Image.Image): return image.convert("RGB")
60
  if isinstance(image, str): return Image.open(image).convert("RGB")
 
62
  return Image.fromarray(arr).convert("RGB")
63
 
64
  def merge_audio_video(video_path, audio_path, output_path):
 
 
65
  cmd = [
66
  "ffmpeg", "-y",
67
+ "-i", video_path,
68
+ "-i", audio_path,
69
+ "-c:v", "copy",
70
+ "-c:a", "aac",
71
+ "-map", "0:v:0", "-map", "1:a:0",
72
+ "-shortest",
73
  output_path
74
  ]
75
  subprocess.run(cmd, check=True)
76
  return output_path
77
 
 
 
78
  @spaces.GPU(duration=120)
79
+ def generate_video(image_input, audio_filepath, prompt):
80
  global pipe
81
 
 
82
  if image_input is None or audio_filepath is None:
83
+ raise gr.Error("Error inputs")
84
 
 
 
85
  try:
 
86
  if pipe is None:
 
87
  from diffusers import WanSpeechToVideoPipeline
88
 
 
 
89
  pipe = WanSpeechToVideoPipeline.from_pretrained(
90
  LOCAL_DIR,
91
  use_safetensors=True
92
  )
93
 
 
94
  audio_values, sample_rate = load_audio_for_model(audio_filepath)
95
  init_image = to_pil(image_input)
96
 
 
97
  w, h = init_image.size
98
  w = (w // 16) * 16
99
  h = (h // 16) * 16
100
  init_image = init_image.resize((w, h), Image.LANCZOS)
101
 
 
 
 
102
  out = pipe(
103
  image=init_image,
104
  audio=audio_values,
105
  num_inference_steps=25,
106
+ guidance_scale=4.0,
107
+ sampling_rate=sample_rate,
108
+ prompt=prompt
109
  )
110
 
111
  frames = out.frames[0]
112
 
 
113
  temp_mute_video = "temp_mute.mp4"
114
  final_video = "output_s2v.mp4"
115
 
116
  from diffusers.utils import export_to_video
117
  export_to_video(frames, temp_mute_video, fps=16)
118
 
 
119
  final_output = merge_audio_video(temp_mute_video, audio_filepath, final_video)
120
 
121
  return final_output
122
 
123
  except Exception as e:
 
124
  traceback.print_exc()
125
+ raise gr.Error(str(e))
126
 
 
127
  with gr.Blocks(title="Wan2.1 Speech to Video") as demo:
128
  gr.Markdown("# Wan2.2-S2V Generador de Video")
129
 
 
131
  with gr.Column():
132
  img_input = gr.Image(label="Imagen de referencia", type="pil")
133
  audio_input = gr.Audio(label="Audio (.wav)", type="filepath")
134
+ prompt_input = gr.Textbox(label="Prompt")
135
  btn = gr.Button("Generar Video", variant="primary")
136
 
137
  with gr.Column():
 
139
 
140
  btn.click(
141
  fn=generate_video,
142
+ inputs=[img_input, audio_input, prompt_input],
143
  outputs=video_output
144
  )
145