baenacoco commited on
Commit
8013339
·
verified ·
1 Parent(s): 984594b

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +6 -7
  2. app.py +277 -0
  3. hub_utils.py +64 -0
  4. packages.txt +1 -0
  5. requirements.txt +6 -0
README.md CHANGED
@@ -1,12 +1,11 @@
1
  ---
2
- title: Talking Head Audio
3
- emoji: 🏢
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.9.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Talking Head - Audio
3
+ emoji: 🎤
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ hardware: t4-medium
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Space 2: Extract Audio
2
+
3
+ Uploads videos -> extracts audio -> cleans/segments -> saves to Hub.
4
+ GPU: T4 medium (no ML model needed, pure signal processing)
5
+ """
6
+ import logging
7
+ import os
8
+ import shutil
9
+ import subprocess
10
+ import traceback
11
+ from pathlib import Path
12
+
13
+ import gradio as gr
14
+ import numpy as np
15
+ import soundfile as sf
16
+
17
+ from hub_utils import upload_step
18
+
19
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s")
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # ── Config ──
23
+ IS_HF_SPACE = os.environ.get("SPACE_ID") is not None
24
+ _data_path = Path("/data")
25
+ if IS_HF_SPACE and _data_path.exists() and os.access(_data_path, os.W_OK):
26
+ BASE_DIR = _data_path
27
+ else:
28
+ BASE_DIR = Path("data")
29
+
30
+ AUDIO_DIR = BASE_DIR / "audio"
31
+ TEMP_DIR = BASE_DIR / "temp"
32
+
33
+ for d in [AUDIO_DIR, TEMP_DIR]:
34
+ d.mkdir(parents=True, exist_ok=True)
35
+
36
+ AUDIO_SAMPLE_RATE = 16000
37
+ TARGET_AUDIO_DURATION_MIN = 15
38
+ MAX_AUDIO_DURATION_MIN = 30
39
+ VAD_AGGRESSIVENESS = 2
40
+
41
+ APP_VERSION = "1.0.0"
42
+
43
+
44
+ # ── FFmpeg ──
45
+
46
def _ffmpeg_extract_audio(video_path, output_path, sample_rate=16000):
    """Extract a mono 16-bit PCM WAV track from *video_path* via ffmpeg.

    Raises RuntimeError (carrying the tail of ffmpeg's stderr) on failure.
    """
    command = [
        "ffmpeg",
        "-y",                    # overwrite output without prompting
        "-i", video_path,
        "-vn",                   # drop the video stream
        "-acodec", "pcm_s16le",  # uncompressed 16-bit PCM
        "-ar", str(sample_rate),
        "-ac", "1",              # downmix to mono
        output_path,
    ]
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode != 0:
        raise RuntimeError(f"FFmpeg failed: {proc.stderr[-500:]}")
56
+
57
+
58
+ # ── Audio processing ──
59
+
60
+ def _apply_vad(audio, sr, aggressiveness=2):
61
+ frame_duration_ms = 30
62
+ frame_size = int(sr * frame_duration_ms / 1000)
63
+ energies = []
64
+ for i in range(0, len(audio) - frame_size, frame_size):
65
+ frame = audio[i:i + frame_size]
66
+ rms = np.sqrt(np.mean(frame ** 2))
67
+ energies.append(rms)
68
+ if not energies:
69
+ return []
70
+ energies = np.array(energies)
71
+ nonzero = energies[energies > 0]
72
+ threshold = np.percentile(nonzero, 15 + aggressiveness * 10) if len(nonzero) > 0 else 0.005
73
+ threshold = max(threshold, 0.002)
74
+
75
+ segments = []
76
+ is_speech = False
77
+ start = 0
78
+ for i, energy in enumerate(energies):
79
+ sample_pos = i * frame_size
80
+ if energy > threshold and not is_speech:
81
+ start = sample_pos
82
+ is_speech = True
83
+ elif energy <= threshold and is_speech:
84
+ end = sample_pos
85
+ duration = (end - start) / sr
86
+ if duration >= 1.0:
87
+ segments.append({"start_sample": start, "end_sample": end, "duration_s": duration})
88
+ is_speech = False
89
+ if is_speech:
90
+ end = len(audio)
91
+ duration = (end - start) / sr
92
+ if duration >= 1.0:
93
+ segments.append({"start_sample": start, "end_sample": end, "duration_s": duration})
94
+ return segments
95
+
96
+
97
def _reduce_noise(audio, sr):
    """Apply spectral-gating noise reduction (noisereduce) to *audio*.

    prop_decrease=0.7 attenuates rather than fully removes noise, which
    limits processing artifacts on the speech itself.
    """
    # Imported lazily: noisereduce is only needed on the denoising path.
    import noisereduce as nr
    return nr.reduce_noise(y=audio, sr=sr, prop_decrease=0.7)
100
+
101
+
102
+ def _normalize_audio(audio):
103
+ peak = np.max(np.abs(audio))
104
+ if peak > 0:
105
+ audio = audio / peak * 0.95
106
+ return audio
107
+
108
+
109
+ def _split_into_segments(audio, sr, segment_sec=10.0):
110
+ seg_samples = int(segment_sec * sr)
111
+ min_samples = int(2.0 * sr)
112
+ parts = []
113
+ for i in range(0, len(audio), seg_samples):
114
+ part = audio[i:i + seg_samples]
115
+ if len(part) >= min_samples:
116
+ parts.append(part)
117
+ return parts
118
+
119
+
120
def extract_and_clean_audio(video_paths, target_duration_min, clean_audio, progress_callback=None):
    """Extract, clean, and segment speech audio from a list of videos.

    Pipeline: ffmpeg-extract each video's audio to mono 16 kHz WAV ->
    concatenate -> peak-normalize -> then either split directly into
    10 s chunks (clean_audio=True) or denoise + VAD-select the longest
    speech segments up to *target_duration_min* minutes
    (clean_audio=False).  Segment WAVs plus one concatenated file are
    written into AUDIO_DIR.

    NOTE: both TEMP_DIR/raw_audio and AUDIO_DIR are wiped at the start,
    so every run replaces the previous results.

    Args:
        video_paths: paths of the input video files.
        target_duration_min: minutes of audio to aim for (VAD path only).
        clean_audio: True = source is already clean (e.g. podcast); keep
            everything and skip noise reduction and VAD.
        progress_callback: optional fn(fraction, message) for UI updates.

    Returns:
        dict with "full_audio_path" (str), "segments" (list of WAV
        paths), and "total_duration_s" (float).

    Raises:
        ValueError: if no usable audio segments were found.
        RuntimeError: if ffmpeg fails on any input video.
    """
    temp_audio_dir = TEMP_DIR / "raw_audio"
    if temp_audio_dir.exists():
        shutil.rmtree(temp_audio_dir)
    temp_audio_dir.mkdir(parents=True)

    # Destructive: previous run's output is discarded before processing.
    if AUDIO_DIR.exists():
        shutil.rmtree(AUDIO_DIR)
    AUDIO_DIR.mkdir(parents=True)

    all_audio = []
    for i, vpath in enumerate(video_paths):
        if progress_callback:
            progress_callback(i / len(video_paths) * 0.2, f"Extrayendo audio del video {i+1}...")
        raw_path = str(temp_audio_dir / f"raw_{i}.wav")
        _ffmpeg_extract_audio(vpath, raw_path, AUDIO_SAMPLE_RATE)
        audio, sr = sf.read(raw_path)
        if audio.ndim > 1:
            # Defensive downmix; ffmpeg is already asked for mono (-ac 1).
            audio = audio.mean(axis=1)
        all_audio.append(audio)

    full_audio = np.concatenate(all_audio)
    full_audio = _normalize_audio(full_audio)

    if clean_audio:
        logger.info("Clean audio mode: skipping noise reduction and VAD")
        if progress_callback:
            progress_callback(0.5, "Dividiendo audio en segmentos...")
        selected_parts = _split_into_segments(full_audio, AUDIO_SAMPLE_RATE, segment_sec=10.0)
    else:
        if progress_callback:
            progress_callback(0.3, "Reduccion de ruido...")
        full_audio = _reduce_noise(full_audio, AUDIO_SAMPLE_RATE)
        # Re-normalize: noise reduction changes the peak level.
        full_audio = _normalize_audio(full_audio)

        if progress_callback:
            progress_callback(0.4, "Deteccion de actividad vocal...")
        segments = _apply_vad(full_audio, AUDIO_SAMPLE_RATE, VAD_AGGRESSIVENESS)
        # Longest segments first -> output is NOT in chronological order.
        segments.sort(key=lambda s: s["duration_s"], reverse=True)

        target_samples = int(target_duration_min * 60 * AUDIO_SAMPLE_RATE)
        max_samples = int(MAX_AUDIO_DURATION_MIN * 60 * AUDIO_SAMPLE_RATE)
        selected_parts = []
        total_samples = 0
        for seg in segments:
            if total_samples >= target_samples:
                break
            # Skip any segment that would push us past the hard cap.
            if total_samples + seg["end_sample"] - seg["start_sample"] > max_samples:
                continue
            part = full_audio[seg["start_sample"]:seg["end_sample"]]
            selected_parts.append(part)
            total_samples += len(part)

    if not selected_parts:
        raise ValueError("No se encontraron segmentos de audio. Revisa que los videos contengan audio.")

    if progress_callback:
        progress_callback(0.7, "Guardando segmentos...")

    segment_paths = []
    for i, part in enumerate(selected_parts):
        seg_path = AUDIO_DIR / f"segment_{i:04d}.wav"
        sf.write(str(seg_path), part, AUDIO_SAMPLE_RATE)
        segment_paths.append(str(seg_path))

    # Also persist one concatenated file for preview/download in the UI.
    clean_full = np.concatenate(selected_parts)
    full_path = AUDIO_DIR / "full_clean_audio.wav"
    sf.write(str(full_path), clean_full, AUDIO_SAMPLE_RATE)

    total_duration = len(clean_full) / AUDIO_SAMPLE_RATE
    shutil.rmtree(temp_audio_dir, ignore_errors=True)

    return {
        "full_audio_path": str(full_path),
        "segments": segment_paths,
        "total_duration_s": total_duration,
    }
197
+
198
+
199
+ # ── Gradio handlers ──
200
+
201
def process_videos(project_name, videos, audio_duration_min, clean_audio, progress=gr.Progress()):
    """Gradio handler: validate inputs, then run the extraction pipeline.

    Returns (full_audio_path, status_message); on failure the first
    element is None and the message starts with "Error:".
    """
    # Guard clauses: both fields are required before any work starts.
    if not (project_name and project_name.strip()):
        return None, "Error: Debes introducir un nombre de proyecto"
    if not videos:
        return None, "Error: No se han subido videos"

    # gr.File entries expose .name (a temp path); plain strings pass through.
    paths = [item.name if hasattr(item, "name") else item for item in videos]
    logger.info(f"=== Audio Extraction Started === Videos: {len(paths)}")

    try:
        outcome = extract_and_clean_audio(
            paths,
            target_duration_min=audio_duration_min,
            clean_audio=clean_audio,
            progress_callback=lambda frac, msg: progress(frac, desc=msg),
        )
    except Exception as e:
        logger.error(f"=== Audio Extraction Failed ===\n{traceback.format_exc()}")
        return None, f"Error: {e}"

    status = (
        f"OK - {outcome['total_duration_s']/60:.1f} min audio, "
        f"{len(outcome['segments'])} segmentos"
    )
    logger.info(f"=== Audio Extraction Complete === {status}")
    return outcome["full_audio_path"], status
227
+
228
+
229
def save_to_hub(project_name):
    """Upload the processed audio segments in AUDIO_DIR to the Hub.

    Returns a human-readable (Spanish, UI-facing) status string.
    """
    if not (project_name and project_name.strip()):
        return "Error: Debes introducir un nombre de proyecto"
    # Nothing to upload until process_videos has produced segment files.
    if not list(AUDIO_DIR.glob("segment_*.wav")):
        return "Error: No hay audio para guardar. Procesa videos primero."
    try:
        return upload_step(project_name.strip(), "step2_audio", str(AUDIO_DIR))
    except Exception as e:
        return f"Error: {e}"
240
+
241
+
242
# ── UI ──

with gr.Blocks(title="Talking Head - Audio", theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# Talking Head - Extraer Audio `v{APP_VERSION}`\nExtrae y limpia audio de videos para entrenamiento de voz")

    # The project name doubles as the folder name in the Hub dataset repo.
    project_name = gr.Textbox(
        label="Nombre del proyecto",
        placeholder="mi_proyecto",
        info="Obligatorio. Se usa como carpeta en el Hub.",
    )

    with gr.Row():
        with gr.Column():
            video_input = gr.File(
                label="Videos (MP4/MOV/AVI/MKV)", file_count="multiple",
                file_types=[".mp4", ".mov", ".avi", ".mkv"],
            )
            audio_dur = gr.Slider(5, 30, value=TARGET_AUDIO_DURATION_MIN, step=1, label="Duracion audio objetivo (min)")
            # NOTE(review): despite the variable name, this checkbox feeds
            # the clean_audio flag — True means "already clean, skip noise
            # reduction and VAD", which matches its label.
            noise_red = gr.Checkbox(value=True, label="Audio limpio / Podcast (conservar todo, sin filtrar)")
            process_btn = gr.Button("Procesar Videos", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Audio extraido")
            status_box = gr.Textbox(label="Estado", interactive=False)

    save_btn = gr.Button("Guardar en Hub", variant="secondary")
    save_status = gr.Textbox(label="Estado guardado", interactive=False)

    process_btn.click(
        process_videos,
        inputs=[project_name, video_input, audio_dur, noise_red],
        outputs=[audio_output, status_box],
    )
    save_btn.click(save_to_hub, inputs=[project_name], outputs=[save_status])

if __name__ == "__main__":
    # queue() enables streamed progress updates; 0.0.0.0:7860 is the
    # standard bind address/port for HF Spaces.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
hub_utils.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hub utilities for uploading/downloading step data to HF Dataset repo."""
2
+ import os
3
+ import logging
4
+ from pathlib import Path
5
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_tree
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ HF_DATASET_REPO_ID = "baenacoco/talking-head-avatar"
10
+
11
+
12
+ def _get_api():
13
+ token = os.environ.get("HF_TOKEN")
14
+ if not token:
15
+ raise ValueError("HF_TOKEN no encontrado en variables de entorno")
16
+ api = HfApi(token=token)
17
+ api.create_repo(repo_id=HF_DATASET_REPO_ID, repo_type="dataset", exist_ok=True)
18
+ return api
19
+
20
+
21
def upload_step(name: str, step_folder: str, local_dir: str):
    """Upload a local directory to {name}/{step_folder}/ in the dataset repo."""
    hub = _get_api()
    destination = f"{name}/{step_folder}"
    hub.upload_folder(
        folder_path=local_dir,
        path_in_repo=destination,
        repo_id=HF_DATASET_REPO_ID,
        repo_type="dataset",
    )
    logger.info(f"Uploaded {local_dir} -> {name}/{step_folder}")
    return f"Subido a Hub: {name}/{step_folder}"
32
+
33
+
34
def download_step(name: str, step_folder: str, local_dir: str):
    """Download {name}/{step_folder}/ from the dataset repo to a local directory."""
    # Imported here to keep the module import light for upload-only Spaces.
    from huggingface_hub import snapshot_download

    prefix = f"{name}/{step_folder}"
    snapshot_download(
        repo_id=HF_DATASET_REPO_ID,
        repo_type="dataset",
        local_dir=local_dir,
        allow_patterns=[f"{prefix}/**"],  # only this project/step subtree
        token=os.environ.get("HF_TOKEN"),
    )
    logger.info(f"Downloaded {name}/{step_folder} -> {local_dir}")
    return f"Descargado de Hub: {name}/{step_folder}"
47
+
48
+
49
def list_projects() -> list[str]:
    """List project names (top-level folders) in the dataset repo.

    Returns a sorted list of folder names, or [] on any error (missing
    token, network failure, repo not found).

    Fix: ``list_repo_tree`` yields RepoFile/RepoFolder objects exposing
    ``.path`` (there is no ``rfilename`` attribute on tree entries), so
    the previous version's filter accepted every entry and reported
    top-level *files* (README.md, .gitattributes, ...) as projects.
    Only folder entries are projects.
    """
    token = os.environ.get("HF_TOKEN")
    try:
        api = HfApi(token=token)
        entries = api.list_repo_tree(
            repo_id=HF_DATASET_REPO_ID, repo_type="dataset", path_in_repo="",
        )
        # RepoFolder entries carry tree_id; RepoFile entries carry blob_id
        # instead — use that to keep folders only.
        return sorted({
            entry.path.split("/")[0]
            for entry in entries
            if getattr(entry, "tree_id", None) is not None
        })
    except Exception as e:
        logger.warning(f"Could not list projects: {e}")
        return []
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ setuptools>=69.0.0
2
+ gradio>=5.9.1
3
+ numpy>=1.24.0
4
+ soundfile>=0.12.0
5
+ noisereduce>=3.0.0
6
+ huggingface_hub>=0.20.0