baenacoco committed on
Commit
f594bcf
·
verified ·
1 Parent(s): aad2df6

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +6 -7
  2. app.py +479 -0
  3. hub_utils.py +64 -0
  4. packages.txt +6 -0
  5. requirements.txt +19 -0
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: Talking Head Full
3
- emoji: 🏃
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Talking Head - Full Pipeline
3
+ emoji: 🎥
4
+ colorFrom: yellow
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ hardware: a100-large
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Space 6: Full Pipeline (simplified Space 5)

One-click: downloads models -> TTS -> Image -> Lip-sync -> video.
GPU: A100 (same as Space 5 with fewer controls)
"""
import gc
import json
import logging
import os
import shutil
import subprocess
import sys
import traceback
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from hub_utils import download_step, upload_step

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
logger = logging.getLogger(__name__)

# ── Config ──
# SPACE_ID is set by the Hugging Face Spaces platform.
IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
# Prefer the persistent /data volume when running on Spaces and it is
# writable (persistent storage enabled); otherwise use a relative "data" dir.
_data_path = Path("/data")
if IS_HF_SPACE and _data_path.exists() and os.access(_data_path, os.W_OK):
    BASE_DIR = _data_path
else:
    BASE_DIR = Path("data")

VOICE_MODEL_DIR = BASE_DIR / "voice_model"    # fine-tuned F5-TTS checkpoint + reference.wav
LORA_MODEL_DIR = BASE_DIR / "lora_model"      # FLUX LoRA weights + optional lora_config.json
GENERATED_VIDEO_DIR = BASE_DIR / "generated"  # final pipeline outputs
TEMP_DIR = BASE_DIR / "temp"                  # intermediate audio/image/chunk files
HF_CACHE_DIR = BASE_DIR / "hf_cache"          # Hugging Face download cache

for d in [VOICE_MODEL_DIR, LORA_MODEL_DIR, GENERATED_VIDEO_DIR, TEMP_DIR, HF_CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Point HF caches at the (possibly persistent) base dir.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases; HF_HOME alone should suffice — confirm before removing.
os.environ["HF_HOME"] = str(HF_CACHE_DIR)
os.environ["TRANSFORMERS_CACHE"] = str(HF_CACHE_DIR)

FLUX_MODEL_ID = "black-forest-labs/FLUX.1-dev"
F5_SPANISH_MODEL_ID = "jpgallegoar/F5-Spanish"
MUSETALK_REPO_ID = "TMElyralab/MuseTalk"
LORA_TRIGGER_WORD = "alvaro_person"  # default; lora_config.json may override

# Fixed generation settings (Space 6 exposes fewer controls than Space 5).
IMAGE_WIDTH = 1024
IMAGE_HEIGHT = 1024
IMAGE_STEPS = 30
IMAGE_GUIDANCE = 3.5
TTS_SPEED = 1.0
MUSETALK_FPS = 30
MUSETALK_BBOX_SHIFT = 5
CHUNK_DURATION_S = 10        # target length of each lip-sync chunk
CROSSFADE_DURATION_S = 0.5   # 0 disables crossfading between chunks

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
APP_VERSION = "1.0.0"

# Lazily-loaded model singletons; at most one heavy model is kept in VRAM.
_f5_model = None
_flux_pipe = None
MUSETALK_DIR = Path("musetalk_repo")  # local checkout of the MuseTalk repo
67
+
68
+
69
+ def _clear_cache():
70
+ gc.collect()
71
+ if torch.cuda.is_available():
72
+ torch.cuda.empty_cache()
73
+ torch.cuda.synchronize()
74
+
75
+
76
def _unload_all():
    """Drop every lazily-loaded model (TTS and Flux) and reclaim memory."""
    global _f5_model, _flux_pipe
    # Rebinding to None releases the only strong reference held here.
    _f5_model = None
    _flux_pipe = None
    _clear_cache()
85
+
86
+
87
+ # ── FFmpeg utils ──
88
+
89
+ def _ffmpeg_run(cmd, description):
90
+ result = subprocess.run(cmd, capture_output=True, text=True)
91
+ if result.returncode != 0:
92
+ raise RuntimeError(f"FFmpeg failed ({description}): {result.stderr[-500:]}")
93
+
94
+
95
def _get_duration(file_path):
    """Return the media duration of *file_path* in seconds, read via ffprobe."""
    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", file_path],
        capture_output=True, text=True, check=True,
    )
    return float(probe.stdout.strip())
100
+
101
+
102
def _concat_videos(video_paths, output_path):
    """Losslessly concatenate video clips with ffmpeg's concat demuxer.

    Writes a temporary list file next to *output_path*, then stream-copies.
    Bug fix: paths containing a single quote previously broke the list-file
    syntax; they are now escaped per ffmpeg's quoting rules.
    """
    list_file = Path(output_path).parent / "concat_list.txt"
    with open(list_file, "w") as f:
        for vp in video_paths:
            # Inside a single-quoted ffmpeg token, a literal ' is written as
            # '\'' (close quote, escaped quote, reopen quote).
            escaped = str(vp).replace("'", "'\\''")
            f.write(f"file '{escaped}'\n")
    _ffmpeg_run(["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", str(list_file), "-c", "copy", output_path], "concat")
    list_file.unlink(missing_ok=True)
109
+
110
+
111
def _crossfade_videos(v1, v2, output, duration=0.5):
    """Join *v1* and *v2* into *output* with a *duration*-second video crossfade.

    Only the video streams are mapped ([v]); audio is muxed in later by the
    caller. Bug fix: the xfade offset is clamped at zero so a first clip
    shorter than the fade duration no longer yields a negative offset, which
    makes ffmpeg's xfade filter fail.
    """
    dur1 = _get_duration(v1)
    offset = max(0.0, dur1 - duration)  # fade starts this far into clip 1
    _ffmpeg_run([
        "ffmpeg", "-y", "-i", v1, "-i", v2,
        "-filter_complex", f"[0:v][1:v]xfade=transition=fade:duration={duration}:offset={offset}[v]",
        "-map", "[v]", "-c:v", "libx264", "-pix_fmt", "yuv420p", output,
    ], "crossfade")
119
+
120
+
121
def _mux_audio_video(video, audio, output):
    """Combine *video*'s picture (stream-copied) with *audio* (AAC 192k)
    into *output*, truncating to the shorter of the two."""
    cmd = [
        "ffmpeg", "-y", "-i", video, "-i", audio,
        "-c:v", "copy", "-c:a", "aac", "-b:a", "192k",
        "-map", "0:v:0", "-map", "1:a:0", "-shortest", output,
    ]
    _ffmpeg_run(cmd, "mux")
127
+
128
+
129
+ # ── TTS ──
130
+
131
def _load_tts():
    """Lazily instantiate the F5-TTS model singleton.

    Prefers a fine-tuned checkpoint from VOICE_MODEL_DIR (model_last.pt, or
    otherwise any .pt/.safetensors file); falls back to the base Spanish
    model when none is present. Unloads all other models first.
    """
    global _f5_model
    if _f5_model is not None:
        return
    _unload_all()
    from f5_tts.api import F5TTS
    candidate = VOICE_MODEL_DIR / "model_last.pt"
    if not candidate.exists():
        found = [*VOICE_MODEL_DIR.glob("*.pt"), *VOICE_MODEL_DIR.glob("*.safetensors")]
        candidate = found[0] if found else None
    if candidate and candidate.exists():
        _f5_model = F5TTS(model_path=str(candidate), device=DEVICE)
    else:
        _f5_model = F5TTS(model_name=F5_SPANISH_MODEL_ID, device=DEVICE)
145
+
146
+
147
def generate_speech(text):
    """Synthesize *text* with the loaded voice model and return the WAV path.

    Raises FileNotFoundError when VOICE_MODEL_DIR/reference.wav is missing.
    """
    _load_tts()
    ref_audio = VOICE_MODEL_DIR / "reference.wav"
    if not ref_audio.exists():
        raise FileNotFoundError("No reference audio found.")
    wav, sample_rate = _f5_model.infer(ref_file=str(ref_audio), ref_text="", gen_text=text, speed=TTS_SPEED)
    destination = str(TEMP_DIR / "tts_output.wav")
    sf.write(destination, wav, sample_rate)
    return destination
156
+
157
+
158
def _unload_tts():
    """Drop the TTS model reference (if any) and reclaim memory."""
    global _f5_model
    _f5_model = None
    _clear_cache()
164
+
165
+
166
+ # ── Image generation ──
167
+
168
def _load_flux():
    """Lazily build the FLUX text-to-image pipeline with optional LoRA.

    Bug fix: the pipeline was moved to the GPU with .to(DEVICE) *and* given
    enable_model_cpu_offload(); diffusers forbids combining the two (offload
    manages device placement itself and errors/behaves wrongly on a pipeline
    already moved to CUDA), so the explicit .to() is dropped.
    """
    global _flux_pipe
    if _flux_pipe is not None:
        return
    _unload_tts()  # free VRAM before loading the image model
    from diffusers import FluxPipeline
    _flux_pipe = FluxPipeline.from_pretrained(
        FLUX_MODEL_ID, torch_dtype=torch.bfloat16,
        token=os.environ.get("HF_TOKEN"),
    )
    lora_weights = list(LORA_MODEL_DIR.glob("*.safetensors")) or list(LORA_MODEL_DIR.glob("adapter_model.*"))
    if lora_weights:
        try:
            _flux_pipe.load_lora_weights(str(LORA_MODEL_DIR))
        except Exception as e:
            # Best-effort: a broken LoRA should not kill the pipeline.
            logger.warning(f"Could not load LoRA: {e}")
    # Submodules are moved to the GPU on demand and back to CPU after use.
    _flux_pipe.enable_model_cpu_offload()
185
+
186
+
187
def _unload_flux():
    """Drop the Flux pipeline reference (if any) and reclaim memory."""
    global _flux_pipe
    _flux_pipe = None
    _clear_cache()
193
+
194
+
195
def generate_image(prompt):
    """Render a 1024x1024 avatar image from *prompt* and return the PNG path.

    The LoRA trigger word (from lora_config.json when present, else the
    default) is prepended to the prompt unless it already appears in it.
    """
    _load_flux()
    trigger = LORA_TRIGGER_WORD
    config_path = LORA_MODEL_DIR / "lora_config.json"
    if config_path.exists():
        with open(config_path) as fh:
            trigger = json.load(fh).get("trigger_word", LORA_TRIGGER_WORD)
    if trigger and trigger not in prompt:
        prompt = f"{trigger}, {prompt}"
    image = _flux_pipe(
        prompt=prompt, width=IMAGE_WIDTH, height=IMAGE_HEIGHT,
        num_inference_steps=IMAGE_STEPS, guidance_scale=IMAGE_GUIDANCE,
    ).images[0]
    destination = str(TEMP_DIR / "generated_avatar.png")
    image.save(destination)
    return destination
211
+
212
+
213
+ # ── MuseTalk ──
214
+
215
def _ensure_musetalk():
    """Make the MuseTalk code and model weights available locally.

    Installs the OpenMMLab stack via mim when mmcv is missing, clones the
    MuseTalk GitHub repo (falling back to a Hub snapshot on failure), then
    fetches each required weight file that is not already on disk.
    All steps are best-effort; missing weights are logged, not fatal.
    """
    try:
        import mmcv  # noqa: F401 — presence check only
    except ImportError:
        for pkg in ["mmengine", "mmcv>=2.0.0", "mmdet>=3.1.0", "mmpose>=1.1.0"]:
            subprocess.run([sys.executable, "-m", "mim", "install", pkg],
                           capture_output=True, text=True, timeout=600)

    if not MUSETALK_DIR.exists():
        try:
            subprocess.run(
                ["git", "clone", "https://github.com/TMElyralab/MuseTalk.git", str(MUSETALK_DIR)],
                capture_output=True, text=True, timeout=300, check=True,
            )
        except Exception:
            # No git / no network to GitHub: pull the repo contents from the Hub.
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id=MUSETALK_REPO_ID, local_dir=str(MUSETALK_DIR), repo_type="model")

    from huggingface_hub import hf_hub_download
    models = [
        ("TMElyralab/MuseTalk", "models/musetalk/musetalk.json"),
        ("TMElyralab/MuseTalk", "models/musetalk/pytorch_model.bin"),
        ("TMElyralab/MuseTalk", "models/dwpose/dw-ll_ucoco_384.onnx"),
        ("TMElyralab/MuseTalk", "models/face-parse-bisenet/79999_iter.pth"),
        ("TMElyralab/MuseTalk", "models/sd-vae-ft-mse/config.json"),
        ("TMElyralab/MuseTalk", "models/sd-vae-ft-mse/diffusion_pytorch_model.bin"),
        ("TMElyralab/MuseTalk", "models/whisper/tiny.pt"),
    ]
    for repo_id, filename in models:
        if not (MUSETALK_DIR / filename).exists():
            try:
                hf_hub_download(repo_id=repo_id, filename=filename, local_dir=str(MUSETALK_DIR))
            except Exception as e:
                # Bug fix: the warning previously said "(unknown)", dropping
                # which file failed; include repo and filename for debugging.
                logger.warning(f"Could not download {repo_id}/{filename}: {e}")
249
+
250
+
251
def _generate_lipsync(image_path, audio_path, output_path, bbox_shift):
    """Run MuseTalk on a still image + audio clip, writing an MP4 to *output_path*.

    Strategy: free all other models (MuseTalk needs the GPU to itself),
    ensure the MuseTalk code/weights exist, then try the in-process Python
    API and fall back to the CLI script on any failure. Returns *output_path*.
    Raises RuntimeError when the CLI fallback also fails or yields no video.
    """
    _unload_all()
    _ensure_musetalk()
    try:
        # Attempt 1: in-process Python API.
        # NOTE(review): sys.path grows by one entry per call — harmless in
        # practice but could be guarded with a membership check.
        sys.path.insert(0, str(MUSETALK_DIR))
        from musetalk.models.musetalk import MuseTalk
        model = MuseTalk()
        model.load_model(str(MUSETALK_DIR / "models"))
        result = model.inference(
            video_path=image_path, audio_path=audio_path,
            bbox_shift=bbox_shift, result_dir=str(Path(output_path).parent),
        )
        # The API may write directly to the requested path; only move if not.
        if result and Path(result).exists():
            if str(result) != output_path:
                shutil.move(result, output_path)
            return output_path
    except Exception as e:
        logger.warning(f"Python MuseTalk failed: {e}, trying CLI...")

    # Attempt 2: subprocess CLI (scripts.inference inside the repo checkout,
    # with PYTHONPATH pointing at it so the module resolves).
    result_dir = TEMP_DIR / "musetalk_output"
    result_dir.mkdir(parents=True, exist_ok=True)
    cmd = [
        sys.executable, "-m", "scripts.inference",
        "--video_path", image_path, "--audio_path", audio_path,
        "--bbox_shift", str(bbox_shift), "--result_dir", str(result_dir),
        "--fps", str(MUSETALK_FPS), "--batch_size", "8",
    ]
    env = os.environ.copy()
    env["PYTHONPATH"] = str(MUSETALK_DIR) + ":" + env.get("PYTHONPATH", "")
    proc = subprocess.run(cmd, capture_output=True, text=True, cwd=str(MUSETALK_DIR), env=env, timeout=1800)
    if proc.returncode != 0:
        raise RuntimeError(f"MuseTalk failed: {proc.stderr[-500:]}")
    # The newest .mp4 anywhere under result_dir is taken as the output.
    outputs = sorted(result_dir.glob("**/*.mp4"), key=lambda p: p.stat().st_mtime, reverse=True)
    if not outputs:
        raise RuntimeError("MuseTalk did not produce output")
    shutil.move(str(outputs[0]), output_path)
    shutil.rmtree(result_dir, ignore_errors=True)
    return output_path
289
+
290
+
291
def compose_long_video(image_path, audio_path, output_path, bbox_shift, progress_callback=None):
    """Produce a lip-synced video for audio of arbitrary length.

    Short audio (<= 1.5x CHUNK_DURATION_S) goes through a single MuseTalk
    pass. Longer audio is split at detected silences into ~CHUNK_DURATION_S
    chunks, each chunk is lip-synced separately, the chunk videos are joined
    (crossfaded when CROSSFADE_DURATION_S > 0, else losslessly concatenated),
    and the original full audio is muxed back in. Returns *output_path*.

    progress_callback, when given, receives (fraction, message).

    NOTE(review): each crossfade overlaps the clips by CROSSFADE_DURATION_S,
    so the joined video is slightly shorter than the audio; the -shortest mux
    then truncates the audio — confirm the cumulative drift is acceptable
    when there are many chunks.
    """
    audio, sr = sf.read(audio_path)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # downmix to mono
    total_duration = len(audio) / sr

    # Short audio: single pass, no chunking needed.
    if total_duration <= CHUNK_DURATION_S * 1.5:
        if progress_callback:
            progress_callback(0.1, "Generando lip-sync...")
        return _generate_lipsync(image_path, audio_path, output_path, bbox_shift)

    work_dir = TEMP_DIR / "compose_work"
    if work_dir.exists():
        shutil.rmtree(work_dir)
    work_dir.mkdir(parents=True)

    # Detect silence windows so chunk boundaries fall between words.
    from pydub import AudioSegment
    from pydub.silence import detect_silence
    temp_path = str(TEMP_DIR / "_temp_silence.wav")
    sf.write(temp_path, audio, sr)
    sound = AudioSegment.from_wav(temp_path)
    silences = detect_silence(sound, min_silence_len=300, silence_thresh=-35)
    # Greedily pick, for each ~CHUNK_DURATION_S step, the silence midpoint
    # closest to the target split time — at least 3 s after the previous
    # boundary and 1 s before the end; falls back to the exact target time.
    boundaries = [0.0]
    current = 0.0
    while current + CHUNK_DURATION_S < total_duration:
        target = current + CHUNK_DURATION_S
        best_split, best_dist = target, float("inf")
        for start_ms, end_ms in silences:
            mid = (start_ms + end_ms) / 2000.0  # ms -> s, midpoint of the silence
            if current + 3.0 < mid < total_duration - 1.0:
                dist = abs(mid - target)
                if dist < best_dist:
                    best_dist = dist
                    best_split = mid
        boundaries.append(best_split)
        current = best_split
    boundaries.append(total_duration)
    Path(temp_path).unlink(missing_ok=True)

    # Lip-sync each chunk independently (70% of the progress range).
    n_chunks = len(boundaries) - 1
    chunk_videos = []
    for i in range(n_chunks):
        if progress_callback:
            progress_callback(0.1 + (i / n_chunks) * 0.7, f"Chunk {i+1}/{n_chunks}...")
        start_sample = int(boundaries[i] * sr)
        end_sample = int(boundaries[i + 1] * sr)
        chunk_audio_path = str(work_dir / f"chunk_{i:03d}.wav")
        sf.write(chunk_audio_path, audio[start_sample:end_sample], sr)
        chunk_video_path = str(work_dir / f"chunk_{i:03d}.mp4")
        _generate_lipsync(image_path, chunk_audio_path, chunk_video_path, bbox_shift)
        chunk_videos.append(chunk_video_path)

    # Join the chunk videos: pairwise crossfade (concat fallback on ffmpeg
    # error), or a single lossless concat when crossfading is disabled.
    if len(chunk_videos) == 1:
        final_video = chunk_videos[0]
    elif CROSSFADE_DURATION_S > 0:
        current_vid = chunk_videos[0]
        for i in range(1, len(chunk_videos)):
            merged = str(work_dir / f"merged_{i:03d}.mp4")
            try:
                _crossfade_videos(current_vid, chunk_videos[i], merged, CROSSFADE_DURATION_S)
            except Exception:
                _concat_videos([current_vid, chunk_videos[i]], merged)
            current_vid = merged
        final_video = current_vid
    else:
        final_video = str(work_dir / "concat.mp4")
        _concat_videos(chunk_videos, final_video)

    # Attach the original, uncut audio track to the joined video.
    _mux_audio_video(final_video, audio_path, output_path)
    shutil.rmtree(work_dir, ignore_errors=True)
    return output_path
362
+
363
+
364
+ # ── Gradio handlers ──
365
+
366
def download_models_from_hub(project_name):
    """Fetch the voice (step3) and LoRA (step4) artifacts for *project_name*
    from the Hub dataset into their local model directories.

    Returns a status string ("OK - ..." or "Error: ...").
    """
    if not project_name or not project_name.strip():
        return "Error: Debes introducir un nombre de proyecto"
    name = project_name.strip()
    try:
        fetched = []
        steps = (
            ("step3_voice", VOICE_MODEL_DIR, "voz"),
            ("step4_lora", LORA_MODEL_DIR, "LoRA"),
        )
        for step, local_dir, label in steps:
            # Start from a clean local directory for each artifact.
            if local_dir.exists():
                shutil.rmtree(local_dir)
            local_dir.mkdir(parents=True)
            download_step(name, step, str(BASE_DIR))
            src = BASE_DIR / name / step
            if src.exists():
                for item in src.iterdir():
                    shutil.move(str(item), str(local_dir / item.name))
                fetched.append(label)
        shutil.rmtree(BASE_DIR / name, ignore_errors=True)
        return f"OK - Descargados: {', '.join(fetched)}"
    except Exception as e:
        return f"Error: {e}"
389
+
390
+
391
def full_pipeline_handler(project_name, text, scene_prompt, bbox_shift, progress=gr.Progress()):
    """One-click pipeline: TTS -> avatar image -> lip-synced video.

    Returns (video_path, status_message); video_path is None on error.

    Bug fix: the text guard now tolerates text=None — previously
    `text.strip()` raised AttributeError instead of returning the error
    message, unlike the equivalent project_name guard.
    """
    if not project_name or not project_name.strip():
        return None, "Error: Debes introducir un nombre de proyecto"
    if not text or not text.strip():
        return None, "Error: Introduce texto para hablar"

    # Both fine-tuned artifacts must have been downloaded from the Hub first.
    voice_ready = any(VOICE_MODEL_DIR.glob("*.pt")) or any(VOICE_MODEL_DIR.glob("*.safetensors"))
    lora_ready = any(LORA_MODEL_DIR.glob("*.safetensors")) or any(LORA_MODEL_DIR.glob("adapter_model.*"))
    if not voice_ready:
        return None, "Error: Modelo de voz no encontrado. Descarga desde el Hub primero."
    if not lora_ready:
        return None, "Error: LoRA no encontrado. Descarga desde el Hub primero."

    try:
        progress(0.0, desc="Generando voz...")
        audio_path = generate_speech(text)

        progress(0.2, desc="Generando imagen...")
        image_path = generate_image(scene_prompt)
        _unload_flux()  # free VRAM before MuseTalk runs

        progress(0.4, desc="Generando lip-sync...")
        output_path = str(GENERATED_VIDEO_DIR / "final_output.mp4")
        compose_long_video(
            image_path=image_path, audio_path=audio_path,
            output_path=output_path, bbox_shift=int(bbox_shift),
            # Map the composer's 0..1 progress into the remaining 0.4..1.0.
            progress_callback=lambda p, m: progress(0.4 + p * 0.6, desc=m),
        )
        return output_path, "OK - Video generado!"
    except Exception as e:
        logger.error(f"Pipeline failed:\n{traceback.format_exc()}")
        return None, f"Error: {e}"
423
+
424
+
425
def save_to_hub(project_name):
    """Upload the generated-video folder to the Hub under step5_video.

    Returns a status string ("Subido..." on success, "Error: ..." otherwise).
    """
    name = (project_name or "").strip()
    if not name:
        return "Error: Debes introducir un nombre de proyecto"
    if not any(GENERATED_VIDEO_DIR.glob("*.mp4")):
        return "Error: No hay video para guardar."
    try:
        return upload_step(name, "step5_video", str(GENERATED_VIDEO_DIR))
    except Exception as e:
        return f"Error: {e}"
435
+
436
+
437
+ # ── UI ──
438
+
439
# Top-level Gradio app: three sequential steps (download models, generate
# video, save to Hub). Components must be created before the .click() wiring.
with gr.Blocks(title="Talking Head - Full Pipeline", theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# Talking Head - Pipeline Completo `v{APP_VERSION}`\nTexto -> Video final (todo en uno)")

    # Shared by all three steps: the folder name used inside the Hub dataset.
    project_name = gr.Textbox(
        label="Nombre del proyecto",
        placeholder="mi_proyecto",
        info="Obligatorio. Se usa como carpeta en el Hub.",
    )

    gr.Markdown("### 1. Descargar modelos del Hub")
    download_btn = gr.Button("Descargar modelos del Hub", variant="secondary")
    download_status = gr.Textbox(label="Estado", interactive=False)

    gr.Markdown("### 2. Generar video")
    with gr.Row():
        with gr.Column():
            full_text = gr.Textbox(label="Texto a hablar", lines=6, placeholder="Escribe el texto aqui...")
            full_scene = gr.Textbox(
                label="Prompt de escena",
                value="portrait photo, professional lighting, neutral background",
            )
            full_bbox = gr.Slider(-20, 20, value=MUSETALK_BBOX_SHIFT, step=1, label="Bbox Shift")
            full_btn = gr.Button("Generar Video", variant="primary")
        with gr.Column():
            full_video = gr.Video(label="Video final")
            full_status = gr.Textbox(label="Estado", interactive=False)

    gr.Markdown("### 3. Guardar video en Hub")
    save_btn = gr.Button("Guardar en Hub", variant="secondary")
    save_status = gr.Textbox(label="Estado guardado", interactive=False)

    # Event wiring: one handler per button, defined earlier in this module.
    download_btn.click(download_models_from_hub, inputs=[project_name], outputs=[download_status])
    full_btn.click(
        full_pipeline_handler,
        inputs=[project_name, full_text, full_scene, full_bbox],
        outputs=[full_video, full_status],
    )
    save_btn.click(save_to_hub, inputs=[project_name], outputs=[save_status])

if __name__ == "__main__":
    # queue() enables progress reporting; 0.0.0.0:7860 is the Spaces convention.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
hub_utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hub utilities for uploading/downloading step data to HF Dataset repo."""
2
+ import os
3
+ import logging
4
+ from pathlib import Path
5
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_tree
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ HF_DATASET_REPO_ID = "baenacoco/talking-head-avatar"
10
+
11
+
12
def _get_api():
    """Return an authenticated HfApi, creating the dataset repo if needed.

    Raises ValueError when HF_TOKEN is missing (or empty) in the environment.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise ValueError("HF_TOKEN no encontrado en variables de entorno")
    client = HfApi(token=token)
    # Idempotent: exist_ok makes repeated calls safe.
    client.create_repo(repo_id=HF_DATASET_REPO_ID, repo_type="dataset", exist_ok=True)
    return client
19
+
20
+
21
def upload_step(name: str, step_folder: str, local_dir: str):
    """Upload a local directory to {name}/{step_folder}/ in the dataset repo."""
    destination = f"{name}/{step_folder}"
    _get_api().upload_folder(
        folder_path=local_dir,
        path_in_repo=destination,
        repo_id=HF_DATASET_REPO_ID,
        repo_type="dataset",
    )
    logger.info(f"Uploaded {local_dir} -> {name}/{step_folder}")
    return f"Subido a Hub: {name}/{step_folder}"
32
+
33
+
34
def download_step(name: str, step_folder: str, local_dir: str):
    """Download {name}/{step_folder}/ from the dataset repo to a local directory."""
    from huggingface_hub import snapshot_download
    # Only the requested project/step subtree is fetched.
    snapshot_download(
        repo_id=HF_DATASET_REPO_ID,
        repo_type="dataset",
        local_dir=local_dir,
        allow_patterns=[f"{name}/{step_folder}/**"],
        token=os.environ.get("HF_TOKEN"),
    )
    logger.info(f"Downloaded {name}/{step_folder} -> {local_dir}")
    return f"Descargado de Hub: {name}/{step_folder}"
47
+
48
+
49
def list_projects() -> list[str]:
    """List project names (top-level folders) in the dataset repo.

    Bug fix: the previous filter (`"/" in rfilename or hasattr(e, "path")`)
    was effectively always true for tree entries (they all expose .path), so
    top-level *files* such as README.md were reported as projects. Only
    directory entries are counted now.
    """
    token = os.environ.get("HF_TOKEN")
    try:
        api = HfApi(token=token)
        entries = api.list_repo_tree(
            repo_id=HF_DATASET_REPO_ID, repo_type="dataset", path_in_repo="",
        )
        # list_repo_tree yields RepoFile / RepoFolder objects; at the root,
        # each RepoFolder's path is exactly a project name.
        return sorted({e.path for e in entries if type(e).__name__ == "RepoFolder"})
    except Exception as e:
        logger.warning(f"Could not list projects: {e}")
        return []
packages.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ffmpeg
2
+ libgl1-mesa-glx
3
+ libglib2.0-0
4
+ libsm6
5
+ libxext6
6
+ libxrender-dev
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools>=69.0.0
2
+ gradio>=5.9.1
3
+ torch>=2.1.0
4
+ torchaudio>=2.1.0
5
+ torchvision>=0.16.0
6
+ transformers>=4.36.0,<5.0.0
7
+ diffusers>=0.25.0
8
+ accelerate>=0.25.0
9
+ safetensors>=0.4.0
10
+ peft>=0.7.0
11
+ huggingface_hub>=0.20.0
12
+ numpy>=1.24.0
13
+ Pillow>=10.0.0
14
+ soundfile>=0.12.0
15
+ pydub>=0.25.1
16
+ f5-tts>=0.3.0
17
+ sentencepiece>=0.1.99
18
+ protobuf>=3.20.0
19
+ openmim>=0.3.9