ferdous31 committed on
Commit
fcb73f5
·
verified ·
1 Parent(s): 2be0f11

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile (1) +52 -0
  2. README.md +27 -4
  3. app (1).py +294 -0
  4. cookies.txt +0 -0
  5. requirements (1).txt +32 -0
Dockerfile (1) ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# Metadata
LABEL maintainer="dubbing-pipeline"
LABEL description="Automated video dubbing pipeline - HuggingFace Space"

# Environment variables
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    TRANSFORMERS_CACHE=/app/.cache/huggingface \
    HF_HOME=/app/.cache/huggingface

# System dependencies
# Note: ffmpeg and libsndfile1 are required for audio/video processing;
# git-lfs is needed for large Hugging Face models.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
        libsndfile1 \
        libsndfile1-dev \
        git \
        git-lfs \
        wget \
        curl \
        espeak-ng \
        libespeak-ng1 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Docker Spaces run the container as user ID 1000. Create that
# user (with a real home directory) so the model caches are writable at runtime.
RUN useradd -m -u 1000 user
ENV HOME=/home/user

# Working directory
WORKDIR /app

# Create the required directories and hand them to the runtime user
RUN mkdir -p /app/.cache/huggingface /tmp/dubbing_cache \
    && chown -R user:user /app /tmp/dubbing_cache

# Copy the requirements file
COPY requirements.txt .

# Install the Python dependencies
# CPU-only PyTorch is installed first (lighter; free Spaces have no GPU)
RUN pip install --upgrade pip && \
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \
    pip install -r requirements.txt

# Copy the application code
# NOTE(review): app.py looks for /app/cookies.txt, but nothing here copies it
# into the image, so that branch is never taken - confirm this is intentional.
COPY --chown=user:user app.py .

# Drop root privileges for the running application
USER user

# Gradio port
EXPOSE 7860

# Startup command
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,34 @@
1
  ---
2
- title: Tp Project2
3
- emoji: 🦀
4
- colorFrom: green
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  license: mit
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Video Dubbing Pipeline
3
+ emoji: 🎬
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: docker
7
  pinned: false
8
  license: mit
9
+ app_port: 7860
10
  ---
11
 
12
+ # Automated Video Dubbing Pipeline
13
+
14
+ Pipeline automatique de doublage vidéo utilisant uniquement des modèles open-weights.
15
+
16
+ ## Comment utiliser
17
+
18
+ 1. Entrez un lien YouTube (vidéo de 30s à 1min)
19
+ 2. Choisissez la langue cible
20
+ 3. Cliquez sur "Lancer le doublage"
21
+ 4. Attendez 5-15 minutes selon la durée
22
+
23
+ ## Stack technique
24
+
25
+ - **ASR** : OpenAI Whisper (base)
26
+ - **Traduction** : Qwen2.5-1.5B-Instruct
27
+ - **TTS** : Facebook MMS-TTS
28
+ - **Voice Cloning** : Coqui XTTS-v2 (optionnel)
29
+ - **Mixing** : pydub + ffmpeg
30
+ - **Interface** : Gradio
31
+
32
+ ## Langues supportées
33
+
34
+ French, Arabic, Spanish, German, English
app (1).py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import datetime
5
+ import subprocess
6
+ import tempfile
7
+ import shutil
8
+ import numpy as np
9
+ import torch
10
+ import srt
11
+ import gradio as gr
12
+ from pathlib import Path
13
+ from pydub import AudioSegment
14
+ from pydub.effects import speedup
15
+ from functools import reduce
16
+
17
+ import whisper
18
+ from transformers import (
19
+ AutoTokenizer, AutoModelForCausalLM,
20
+ VitsModel, AutoTokenizer as TTSTokenizer
21
+ )
22
+
23
+ # ============================================================
24
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
25
+
26
+ SUPPORTED_LANGUAGES = {
27
+ "French": ("facebook/mms-tts-fra", "fr"),
28
+ "Arabic": ("facebook/mms-tts-ara", "ar"),
29
+ "Spanish": ("facebook/mms-tts-spa", "es"),
30
+ "German": ("facebook/mms-tts-deu", "de"),
31
+ "English": ("facebook/mms-tts-eng", "en"),
32
+ }
33
+
34
+ # Cache des modèles (pour éviter de re-télécharger à chaque requête)
35
+ _model_cache = {}
36
+
37
+
38
def get_whisper():
    """Return the shared Whisper "base" model, loading it on first use."""
    try:
        return _model_cache["whisper"]
    except KeyError:
        _model_cache["whisper"] = whisper.load_model("base", device=DEVICE)
        return _model_cache["whisper"]
42
+
43
+
44
def get_llm():
    """Return the cached ``(tokenizer, model)`` pair of the translation LLM.

    The checkpoint is loaded on the first call only; float16 weights are
    requested when running on CUDA, float32 otherwise.
    """
    if "llm" in _model_cache:
        return _model_cache["llm"]

    checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"
    dtype = torch.float16 if DEVICE == "cuda" else torch.float32
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=dtype,
        device_map="auto",
    )
    _model_cache["llm"] = (tokenizer, model)
    return _model_cache["llm"]
54
+
55
+
56
def get_tts(lang: str):
    """Return the cached ``(tokenizer, model)`` pair of the MMS-TTS voice for *lang*.

    *lang* must be a key of ``SUPPORTED_LANGUAGES``; any other value raises
    ``KeyError``. The model is moved to ``DEVICE`` and put in eval mode the
    first time it is loaded.
    """
    checkpoint = SUPPORTED_LANGUAGES[lang][0]
    cache_key = f"tts_{lang}"
    if cache_key in _model_cache:
        return _model_cache[cache_key]

    tokenizer = TTSTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint).to(DEVICE)
    model.eval()
    _model_cache[cache_key] = (tokenizer, model)
    return _model_cache[cache_key]
65
+
66
+
67
+ # ---- Pipeline functions ----
68
+
69
def download_video(url: str, work_dir: Path) -> dict:
    """Download a video with yt-dlp and extract a mono 16 kHz WAV track.

    Args:
        url: Video URL handed to yt-dlp (user supplied).
        work_dir: Existing directory that receives ``video.mp4`` and
            ``audio.wav``.

    Returns:
        A dict with the keys ``"video"`` and ``"audio"`` (``Path`` objects)
        and ``"duration"`` (length of the WAV in seconds, as reported by
        ffprobe).

    Raises:
        subprocess.CalledProcessError: if yt-dlp, ffmpeg or ffprobe exits
            with a non-zero status.
    """
    video = work_dir / "video.mp4"
    audio = work_dir / "audio.wav"

    yt_cmd = ["yt-dlp"]
    # Use the cookies file when available (works around YouTube's bot blocking).
    cookies_path = Path("/app/cookies.txt")
    if cookies_path.exists():
        yt_cmd += ["--cookies", str(cookies_path)]
    yt_cmd += [
        "-f",
        "bestvideo[height<=480][ext=mp4]+bestaudio[ext=m4a]/best[height<=480][ext=mp4]",
        "--merge-output-format", "mp4",
        "-o", str(video), "--no-playlist",
        # "--" ends option parsing, so a URL starting with "-" cannot be
        # interpreted as a yt-dlp flag.
        "--", url,
    ]
    subprocess.run(yt_cmd, check=True, capture_output=True)

    subprocess.run([
        "ffmpeg", "-y", "-i", str(video),
        "-ac", "1", "-ar", "16000", "-vn", str(audio)
    ], check=True, capture_output=True)

    # check=True: fail here with the real cause rather than on empty JSON below.
    probe = subprocess.run([
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "json", str(audio)
    ], check=True, capture_output=True, text=True)
    duration = float(json.loads(probe.stdout)["format"]["duration"])
    return {"video": video, "audio": audio, "duration": duration}
95
+
96
+
97
def transcribe(audio_path: Path) -> tuple[list, str]:
    """Transcribe *audio_path* with Whisper.

    Args:
        audio_path: Audio file to transcribe.

    Returns:
        ``(segments, language)``. ``segments`` is a list of dicts with the
        keys ``"text"``, ``"start"``, ``"end"`` and ``"duration"`` (seconds,
        rounded to milliseconds); segments with empty text are dropped.
        ``language`` is the capitalized name of the detected language.
    """
    # Local import: only this function needs the code -> name table.
    from whisper.tokenizer import LANGUAGES

    model = get_whisper()
    result = model.transcribe(str(audio_path), word_timestamps=True, verbose=False)
    segments = [
        {
            "text": s["text"].strip(),
            "start": round(s["start"], 3),
            "end": round(s["end"], 3),
            "duration": round(s["end"] - s["start"], 3),
        }
        for s in result["segments"]
        if s["text"].strip()
    ]
    # Whisper reports a language code such as "en"; expand it to the full
    # name so the translation prompt reads "English" rather than "En".
    code = result.get("language", "en")
    lang = LANGUAGES.get(code, code).capitalize()
    return segments, lang
107
+
108
+
109
def translate_segment(text: str, src: str, tgt: str) -> str:
    """Translate one subtitle segment from *src* to *tgt* with the chat LLM.

    Only the first non-empty line of the model's answer is returned, which
    drops any extra commentary the model adds after the translation.
    """
    tokenizer, model = get_llm()

    system_prompt = (
        f"You are a professional subtitle translator. Translate from {src} to {tgt}. "
        "Output ONLY the translation, nothing else."
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    generation_kwargs = dict(
        max_new_tokens=150,
        temperature=0.3,
        do_sample=True,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        output_ids = model.generate(**encoded, **generation_kwargs)

    # Keep only the newly generated tokens (everything after the prompt).
    prompt_len = encoded.input_ids.shape[1]
    answer = tokenizer.decode(
        output_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    for line in answer.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped
    return answer
124
+
125
+
126
def build_dubbed_audio(segments: list, tgt_lang: str, total_s: float, out: Path) -> Path:
    """Synthesize every translated segment and lay it on a silent track.

    Args:
        segments: Dicts providing ``"translated_text"``, ``"start"`` and
            ``"duration"`` (seconds).
        tgt_lang: Key of ``SUPPORTED_LANGUAGES`` selecting the TTS voice.
        total_s: Length of the output track in seconds.
        out: Destination WAV path.

    Returns:
        *out*, after the track has been written.
    """
    tts_tok, tts_mdl = get_tts(tgt_lang)
    sr = tts_mdl.config.sampling_rate
    # Create the base track at the TTS rate so overlays need no resampling.
    track = AudioSegment.silent(duration=int(total_s * 1000), frame_rate=sr)
    for seg in segments:
        text = seg["translated_text"].strip()
        if not text:
            continue
        inputs = tts_tok(text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            wav = tts_mdl(**inputs).waveform[0].cpu().numpy()
        # Clip before the int16 conversion: samples outside [-1, 1] would
        # otherwise wrap around and produce loud clicks.
        wav_i16 = (np.clip(wav, -1.0, 1.0) * 32767).astype(np.int16)
        seg_aud = AudioSegment(wav_i16.tobytes(), frame_rate=sr, sample_width=2, channels=1)
        target_s = seg["duration"]
        actual_s = len(seg_aud) / 1000
        # Speed the speech up (at most 2x) when it overruns its time slot.
        if actual_s > target_s and target_s > 0.1:
            spd = min(actual_s / target_s, 2.0)
            seg_aud = speedup(seg_aud, spd, chunk_size=50, crossfade=25)
        track = track.overlay(seg_aud, position=int(seg["start"] * 1000))
    track.export(str(out), format="wav")
    return out
147
+
148
+
149
def mix_audio(orig: Path, dubbed: Path, segments: list, out: Path) -> Path:
    """Duck the original audio under each segment and overlay the dubbed track.

    Args:
        orig: WAV file with the original audio.
        dubbed: WAV file with the synthesized speech.
        segments: Dicts providing ``"start"`` and ``"end"`` in seconds;
            assumed to be in chronological order.
        out: Destination WAV path.

    Returns:
        *out*, after the mix has been written.
    """
    original = AudioSegment.from_wav(str(orig)).set_frame_rate(44100).set_channels(2)
    dub = AudioSegment.from_wav(str(dubbed)).set_frame_rate(44100).set_channels(2)
    total = len(original)
    # Pad or trim the dub so both tracks have exactly the same length.
    if len(dub) < total:
        dub = dub + AudioSegment.silent(duration=total - len(dub), frame_rate=44100)
    dub = dub[:total]

    parts, prev = [], 0
    for seg in segments:
        # Clamp to the part of the timeline not emitted yet: an overlapping
        # segment would otherwise re-append audio and lengthen the bed.
        s = max(int(seg["start"] * 1000), prev)
        e = min(int(seg["end"] * 1000), total)
        if e <= s:
            continue
        if s > prev:
            parts.append(original[prev:s])
        parts.append(original[s:e] + (-20))  # -20 dB, i.e. ~10% amplitude
        prev = e
    if prev < total:
        parts.append(original[prev:])

    ducked = reduce(lambda a, b: a + b, parts) if parts else original
    final = ducked.overlay(dub + (-0.9))  # -0.9 dB, i.e. dub at ~90%
    final.export(str(out), format="wav")
    return out
170
+
171
+
172
def burn_subtitles(video: Path, audio: Path, srt_path: Path, out: Path) -> Path:
    """Mux *audio* onto *video* and burn the subtitles from *srt_path* in.

    When the subtitle-burning ffmpeg run fails, the video stream is copied
    unchanged and only the audio is replaced (no burned-in subtitles).
    Returns *out*.
    """
    srt_esc = str(srt_path).replace("\\", "/")
    subtitle_filter = (
        f"subtitles={srt_esc}:force_style="
        "'FontSize=18,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,Outline=2,Bold=1'"
    )

    # Pieces shared by both ffmpeg invocations.
    head = ["ffmpeg", "-y", "-i", str(video), "-i", str(audio)]
    tail = [
        "-c:a", "aac", "-b:a", "128k",
        "-map", "0:v:0", "-map", "1:a:0", "-shortest", str(out),
    ]

    burn_cmd = head + [
        "-vf", subtitle_filter,
        "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
    ] + tail
    burned = subprocess.run(burn_cmd, capture_output=True, text=True)
    if burned.returncode == 0:
        return out

    # Fallback without burned-in subtitles.
    copy_cmd = head + ["-c:v", "copy"] + tail
    subprocess.run(copy_cmd, check=True, capture_output=True)
    return out
191
+
192
+
193
+ # ---- PIPELINE COMPLET ----
194
+
195
def run_pipeline(youtube_url: str, target_language: str, progress=gr.Progress()) -> str:
    """Run the whole dubbing pipeline for one request (Gradio callback).

    Steps: download the video, transcribe it, translate every segment,
    synthesize the dubbed track, mix it with the ducked original audio and
    produce the final video with subtitles.

    Args:
        youtube_url: URL of the video to dub.
        target_language: Key of ``SUPPORTED_LANGUAGES``.
        progress: Gradio progress tracker.

    Returns:
        Path of the final MP4, a uniquely named file in the system temporary
        directory.

    Raises:
        gr.Error: on invalid input or when any pipeline step fails.
    """
    url = (youtube_url or "").strip()
    if not url:
        raise gr.Error("Veuillez entrer un URL YouTube valide")
    # Fail fast instead of discovering an unknown language at the TTS step.
    if target_language not in SUPPORTED_LANGUAGES:
        raise gr.Error(f"Langue cible non supportée : {target_language}")

    work_dir = Path(tempfile.mkdtemp(prefix="dubbing_"))

    try:
        progress(0.05, desc="Téléchargement de la vidéo...")
        files = download_video(url, work_dir)

        progress(0.20, desc="Transcription Whisper...")
        segments, src_lang = transcribe(files["audio"])

        progress(0.40, desc="Traduction en cours...")
        translated = [
            {**seg, "translated_text": translate_segment(seg["text"], src_lang, target_language)}
            for seg in segments
        ]

        # Translated subtitles, burned into the final video further down.
        srt_file = work_dir / "translated.srt"
        subs_tr = [
            srt.Subtitle(
                i + 1,
                datetime.timedelta(seconds=s["start"]),
                datetime.timedelta(seconds=s["end"]),
                s["translated_text"],
            )
            for i, s in enumerate(translated)
        ]
        srt_file.write_text(srt.compose(subs_tr), encoding="utf-8")

        progress(0.60, desc="Génération audio (TTS)...")
        dubbed_wav = work_dir / "dubbed.wav"
        build_dubbed_audio(translated, target_language, files["duration"], dubbed_wav)

        progress(0.80, desc="Mixage audio...")
        mixed_wav = work_dir / "mixed.wav"
        mix_audio(files["audio"], dubbed_wav, translated, mixed_wav)

        progress(0.90, desc="Création vidéo finale...")
        final_video = work_dir / "final.mp4"
        burn_subtitles(files["video"], mixed_wav, srt_file, final_video)

        # Copy the result out of work_dir (removed in `finally`). A unique
        # name per request keeps concurrent runs from overwriting each other.
        fd, output_name = tempfile.mkstemp(prefix="output_dubbed_", suffix=".mp4")
        os.close(fd)
        shutil.copy(final_video, output_name)

        progress(1.0, desc="Terminé !")
        return output_name

    except Exception as e:
        raise gr.Error(f"Erreur pipeline : {str(e)[:300]}") from e

    finally:
        # Clean up the temporary working directory.
        shutil.rmtree(work_dir, ignore_errors=True)
256
+
257
+
258
+ # ---- INTERFACE GRADIO ----
259
+
260
# Gradio interface: URL + target-language inputs on the left, resulting video
# on the right; the button runs the whole pipeline.
with gr.Blocks(title="Video Dubbing Pipeline", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎬 Automated Video Dubbing Pipeline
    Entrez un lien YouTube (30s-1min) et choisissez la langue cible.
    Le pipeline transcrit, traduit, génère une voix et produit une vidéo doublée.

    > **Note :** Le traitement prend environ 5-15 minutes selon la durée de la vidéo.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                lines=1,
            )
            language_names = list(SUPPORTED_LANGUAGES.keys())
            lang_choice = gr.Dropdown(
                choices=language_names,
                value="French",
                label="Langue cible",
            )
            run_btn = gr.Button("🚀 Lancer le doublage", variant="primary")

        with gr.Column(scale=3):
            video_output = gr.Video(label="Vidéo doublée")

    run_btn.click(
        fn=run_pipeline,
        inputs=[url_input, lang_choice],
        outputs=video_output,
    )


if __name__ == "__main__":
    # Listen on all interfaces on the port the Space exposes.
    demo.launch(server_name="0.0.0.0", server_port=7860)
cookies.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements (1).txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML
2
+ # Note: torch est installé séparément dans le Dockerfile (version CPU)
3
+ transformers>=4.40.0
4
+ accelerate>=0.27.0
5
+
6
+ # ASR
7
+ openai-whisper>=20231117
8
+
9
+ # TTS
10
+ # TTS — MMS-TTS est inclus dans transformers, pas d'install séparée
11
+ # Voice cloning via kokoro (compatible Python 3.12+)
12
+ kokoro>=0.9.4
13
+ misaki[en]
14
+
15
+ # Audio
16
+ pydub>=0.25.1
17
+ soundfile>=0.12.1
18
+ librosa>=0.10.0
19
+ scipy>=1.11.0
20
+
21
+ # Vidéo / subtitles
22
+ srt>=3.5.3
23
+
24
+ # Téléchargement YouTube
25
+ yt-dlp>=2024.1.0
26
+
27
+ # Interface
28
+ gradio>=4.20.0
29
+
30
+ # Utils
31
+ numpy>=1.24.0
32
+ tqdm>=4.65.0