Spaces:

omarbajouk
/

CapsulesVideo

Sleeping

App Files Files Community

omarbajouk committed on Oct 26, 2025

Commit

ab9c83b

verified ·

1 Parent(s): 6f8b96c

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -119

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 # ============================================================
-# CPAS Bruxelles — Créateur de Capsules (Gradio + Edge-TTS)
-# Version "Space HF" optimisée (imports différés)
-# Mode présentateur = VIDÉO DIRECTE (pas de SadTalker)
 # ============================================================
-import os, json, re, uuid, shutil, traceback, gc
 from typing import Optional
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageOps
@@ -89,6 +89,13 @@ def _safe_name(stem, ext=".mp4"):
 import asyncio
 import edge_tts
 from pydub import AudioSegment
 EDGE_VOICES = {}
@@ -130,6 +137,8 @@ def get_edge_voices(lang="fr"):
         return [v for k, v in EDGE_VOICES.items() if k.startswith("nl-")]
     return list(EDGE_VOICES.values())
 async def _edge_tts_async(text, voice, outfile):
     """Build an ``edge_tts.Communicate`` for *text*/*voice* and await its save to *outfile*."""
     await edge_tts.Communicate(text, voice).save(outfile)
@@ -173,6 +182,7 @@ def tts_gtts(text: str, lang: str = "fr") -> str:
 def _normalize_audio_to_wav(in_path: str) -> str:
     # Convertit n'importe quel format (mp3/wav) en WAV standard (44.1kHz stéréo)
     wav_path = os.path.join(TMP_DIR, f"norm_{uuid.uuid4().hex}.wav")
     snd = AudioSegment.from_file(in_path)
     snd = snd.set_frame_rate(44100).set_channels(2).set_sample_width(2)
@@ -248,6 +258,56 @@ def make_background(titre, sous_titre, texte_ecran, theme, logo_path, logo_pos,
     bg.save(out)
     return out
 # ============================================================
 # SOUS-TITRES .SRT
 # ============================================================
@@ -302,114 +362,95 @@ def _write_video_with_fallback(final_clip, out_path_base, fps=25):
     raise RuntimeError(last_err or "FFmpeg a échoué")
 # ============================================================
-# BUILD CAPSULE — Pipeline vidéo direct
 # ============================================================
 def build_capsule(titre, sous_titre, texte_voix, texte_ecran, theme,
                   image_fond=None, logo_path=None, logo_pos="haut-gauche",
                   fond_mode="plein écran",
-                  video_presentateur=None,
-                  source_audio_option="Garder la voix originale de la vidéo",
                   position_presentateur="bottom-right", plein=False,
-                  langue="fr", speaker=None):
-    from moviepy.editor import ImageClip, AudioFileClip, CompositeVideoClip, VideoFileClip
-    import moviepy.video.fx.all as vfx
-    # 1) AUDIO PRINCIPAL
-    audio_wav = None
-    if video_presentateur and source_audio_option == "Garder la voix originale de la vidéo":
-        vclip = VideoFileClip(video_presentateur)
-        if vclip.audio is None:
-            # Pas d'audio dans la vidéo => on force TTS
-            print("[AUDIO] Vidéo sans audio, génération TTS.")
-            try:
-                audio_mp = tts_edge(texte_voix, voice=speaker or ("fr-FR-DeniseNeural" if langue == "fr" else "nl-NL-MaaikeNeural"))
-            except Exception as e:
-                print(f"[Capsule] Erreur TTS Edge ({e}), fallback gTTS.")
-                audio_mp = tts_gtts(texte_voix, lang=langue)
-            audio_wav = audio_mp if audio_mp.endswith(".wav") else _normalize_audio_to_wav(audio_mp)
-        else:
-            audio_wav = os.path.join(TMP_DIR, f"orig_{uuid.uuid4().hex}.wav")
-            vclip.audio.write_audiofile(audio_wav, fps=44100, logger=None)
-            vclip.close()
-    else:
-        # Remplacer par voix IA (Edge-TTS/gTTS)
         try:
-            audio_mp = tts_edge(texte_voix, voice=speaker or ("fr-FR-DeniseNeural" if langue == "fr" else "nl-NL-MaaikeNeural"))
         except Exception as e:
-            print(f"[Capsule] Erreur TTS Edge ({e}), fallback gTTS.")
-            audio_mp = tts_gtts(texte_voix, lang=langue)
-        audio_wav = audio_mp if audio_mp.endswith(".wav") else _normalize_audio_to_wav(audio_mp)
-    audio = AudioFileClip(audio_wav)
-    dur = float(audio.duration or 5.0)
-    target_fps = 25
-    # 2) FOND
     fond_path = make_background(titre, sous_titre, texte_ecran, theme,
                                 logo_path, logo_pos, image_fond, fond_mode)
-    # 3) CLIPS
-    bg = ImageClip(fond_path).set_duration(dur)
-    clips = [bg]
-    if video_presentateur and os.path.exists(video_presentateur):
-        v = VideoFileClip(video_presentateur)
-        if source_audio_option == "Remplacer par voix IA":
-            v = v.without_audio()
-        if plein:
-            v = v.resize((W, H)).set_position(("center", "center"))
-        else:
-            v = v.resize(width=520)
-            pos_map = {
-                "bottom-right": ("right", "bottom"),
-                "bottom-left": ("left", "bottom"),
-                "top-right": ("right", "top"),
-                "top-left": ("left", "top"),
-                "center": ("center", "center"),
-            }
-            v = v.set_position(pos_map.get(position_presentateur, ("right", "bottom")))
-        # Si la vidéo est plus courte, on boucle visuellement
-        if v.duration and v.duration < dur:
-            v = v.fx(vfx.loop, duration=dur)
-        else:
-            v = v.subclip(0, dur)
-        clips.append(v)
-    # 4) Composition + export
     final = CompositeVideoClip(clips).set_audio(audio.set_fps(44100))
     name = _safe_name(f"{titre}_{langue}")
     out_base = os.path.join(OUT_DIR, name)
     out = _write_video_with_fallback(final, out_base, fps=target_fps)
-    # 5) Sous-titres + manifest
-    srt_path = None
-    if texte_voix and texte_voix.strip():
-        # Génère SRT basé sur le texte voix (utile si on remplace l'audio)
-        # Si l'option garde voix originale est choisie, le SRT sera calé en durée
-        srt_path = write_srt(texte_voix, dur)
     capsules.append({
         "file": out,
         "title": titre,
         "langue": langue,
-        "voice": speaker or ("voix-originale" if source_audio_option.startswith("Garder") else "edge-tts"),
         "theme": theme,
         "duration": round(dur, 1)
     })
     _save_manifest()
-    # 6) Nettoyage
     try:
         audio.close()
         final.close()
         bg.close()
-        if os.path.exists(audio_wav): pass  # conservé si besoin par MoviePy jusqu'à la fin
     except Exception as e:
         print(f"[Clean] Erreur nettoyage : {e}")
     gc.collect()
-    return out, f"✅ Capsule {langue.upper()} créée ({dur:.1f}s, {('voix originale' if source_audio_option.startswith('Garder') else 'voix IA')})", srt_path
 # ============================================================
 # GESTION / ASSEMBLAGE
@@ -467,10 +508,11 @@ def deplacer_capsule(index, direction):
 # ============================================================
 print("[INIT] Lancement de Gradio...")
 init_edge_voices()
-with gr.Blocks(title="Créateur de Capsules CPAS – Vidéo directe",
                theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🎬 Créateur de Capsules CPAS — Présentateur vidéo direct (sans SadTalker)")
     with gr.Tab("Créer une capsule"):
         with gr.Row():
@@ -481,57 +523,48 @@ with gr.Blocks(title="Créateur de Capsules CPAS – Vidéo directe",
                 logo_path = gr.Image(label="🏛 Logo", type="filepath")
                 logo_pos = gr.Radio(["haut-gauche","haut-droite","centre"],
                                     label="Position logo", value="haut-gauche")
-                # 🎥 Vidéo du présentateur
-                video_presentateur = gr.Video(label="🎥 Vidéo du présentateur (mp4, mov, mkv…)", type="filepath")
-                # 🎧 Option audio
-                source_audio_option = gr.Radio(
-                    ["Garder la voix originale de la vidéo", "Remplacer par voix IA"],
-                    label="Option audio", value="Garder la voix originale de la vidéo"
-                )
                 position_presentateur = gr.Radio(["bottom-right","bottom-left","top-right","top-left","center"],
-                                                 label="Position du présentateur", value="bottom-right")
                 plein = gr.Checkbox(label="Plein écran présentateur", value=False)
             with gr.Column():
                 titre = gr.Textbox(label="Titre", value="Aide médicale urgente / Dringende medische hulp")
                 sous_titre = gr.Textbox(label="Sous-titre", value="Soins accessibles à tous / Toegankelijke zorg voor iedereen")
                 theme = gr.Radio(list(THEMES.keys()), label="Thème", value="Bleu Professionnel")
-                # Langue/voix pour Edge-TTS (si on remplace l'audio)
-                langue = gr.Radio(["fr", "nl"], label="Langue de la voix IA", value="fr")
                 def maj_voix(lang):
                     try:
                         voices = get_edge_voices(lang)
-                        # Valeur par défaut raisonnable
-                        default = "fr-FR-DeniseNeural" if (lang == "fr" and "fr-FR-DeniseNeural" in voices) else (voices[0] if voices else None)
-                        return gr.update(choices=voices, value=default)
-                    except Exception:
                         return gr.update(choices=[], value=None)
                 speaker_id = gr.Dropdown(
                     label="🎙 Voix Edge-TTS",
                     choices=get_edge_voices("fr"),
                     value="fr-FR-DeniseNeural",
                     info="Liste dynamique des voix Edge-TTS (FR & NL)"
                 )
                 langue.change(maj_voix, [langue], [speaker_id])
-                texte_voix = gr.Textbox(
-                    label="Texte voix off (utilisé si 'Remplacer par voix IA')",
-                    lines=4,
-                    value="Bonjour, le CPAS de Bruxelles vous aide pour vos soins de santé."
                 )
                 texte_ecran = gr.Textbox(label="Texte à l'écran", lines=4,
                                          value="💊 Aides médicales\n🏥 Soins urgents\n📋 Formalités simplifiées")
                 btn = gr.Button("🎬 Créer Capsule", variant="primary")
         sortie = gr.Video(label="Capsule générée")
-        srt_out = gr.File(label="Sous-titres .srt (si texte fourni)")
         statut = gr.Markdown()
     with gr.Tab("Gestion & Assemblage"):
@@ -557,19 +590,12 @@ with gr.Blocks(title="Créateur de Capsules CPAS – Vidéo directe",
         sortie_finale = gr.Video(label="Vidéo finale")
         btn_asm.click(lambda: assemble_final(), [], [sortie_finale, message])
-    # Callback création
-    def creer_capsule_ui(t, st, tv, te, th,
-                         img, fmode, logo, pos_logo,
-                         vid_pres, src_audio_opt,
-                         pos_p, plein_opt, lang, speaker):
         try:
-            vid, msg, srt = build_capsule(
-                t, st, tv, te, th,
-                image_fond=img, logo_path=logo, logo_pos=pos_logo, fond_mode=fmode,
-                video_presentateur=vid_pres, source_audio_option=src_audio_opt,
-                position_presentateur=pos_p, plein=plein_opt,
-                langue=lang, speaker=speaker
-            )
             return vid, srt, msg, table_capsules()
         except Exception as e:
             return None, None, f"❌ Erreur: {e}\n\n{traceback.format_exc()}", table_capsules()
@@ -578,11 +604,12 @@ with gr.Blocks(title="Créateur de Capsules CPAS – Vidéo directe",
         creer_capsule_ui,
         [titre, sous_titre, texte_voix, texte_ecran, theme,
          image_fond, fond_mode, logo_path, logo_pos,
-         video_presentateur, source_audio_option,
-         position_presentateur, plein,
-         langue, speaker_id],
         [sortie, srt_out, statut, liste]
     )
 if __name__ == "__main__":
     demo.launch()

+# app.py
 # ============================================================
+# CPAS Bruxelles — Créateur de Capsules (Gradio + Kokoro + SadTalker)
+# Version "Space HF" optimisée (chargement rapide, imports différés)
 # ============================================================
+import os, json, re, uuid, shutil, traceback, gc, subprocess
 from typing import Optional
 import gradio as gr
 from PIL import Image, ImageDraw, ImageFont, ImageFilter, ImageOps
 import asyncio
 import edge_tts
 from pydub import AudioSegment
+import soundfile as sf
+# ============================================================
+# 🔊 CHARGEMENT DYNAMIQUE DES VOIX EDGE-TTS (FR/NL)
+# ============================================================
 EDGE_VOICES = {}
         return [v for k, v in EDGE_VOICES.items() if k.startswith("nl-")]
     return list(EDGE_VOICES.values())
 async def _edge_tts_async(text, voice, outfile):
     """Build an ``edge_tts.Communicate`` for *text*/*voice* and await its save to *outfile*."""
     await edge_tts.Communicate(text, voice).save(outfile)
 def _normalize_audio_to_wav(in_path: str) -> str:
     # Convertit n'importe quel format (mp3/wav) en WAV standard (44.1kHz stéréo)
+    from pydub import AudioSegment
     wav_path = os.path.join(TMP_DIR, f"norm_{uuid.uuid4().hex}.wav")
     snd = AudioSegment.from_file(in_path)
     snd = snd.set_frame_rate(44100).set_channels(2).set_sample_width(2)
     bg.save(out)
     return out
+# ============================================================
+# SadTalker — appel subprocess (image -> visage animé)
+# ============================================================
def _check_sadtalker_ready() -> Optional[str]:
    """Report why SadTalker cannot be used, or ``None`` when nothing is missing.

    Looks for a ``SadTalker`` directory under ``ROOT`` and for the expected
    checkpoint files inside its ``checkpoints`` sub-directory.

    Returns:
        A user-facing message when the directory or at least one checkpoint
        file is absent, otherwise ``None``.
    """
    sadtalker_dir = os.path.join(ROOT, "SadTalker")
    if not os.path.isdir(sadtalker_dir):
        return "Dossier SadTalker manquant. Ajoutez 'SadTalker/' à la racine du Space (voir README)."

    checkpoint_dir = os.path.join(sadtalker_dir, "checkpoints")
    required = (
        "audio2exp.pt",
        "GFPGANv1.4.pth",
        "epoch_20.pth",
        "mapping_00229-model.pth.tar",
        "shape_predictor_68_face_landmarks.dat",
    )
    absent = []
    for filename in required:
        if not os.path.exists(os.path.join(checkpoint_dir, filename)):
            absent.append(filename)

    if not absent:
        return None
    return "Checkpoints SadTalker manquants: " + ", ".join(absent)
def generate_sadtalker_video(image_path, audio_path, output_dir=TMP_DIR, fps=25) -> Optional[str]:
    """Animate a portrait with SadTalker, driven by an audio file.

    Runs ``inference.py`` from the ``SadTalker`` directory under ``ROOT`` in a
    subprocess, then moves the MP4 it produced to a uniquely named
    ``sadtalker_<id>.mp4`` inside *output_dir*.

    Args:
        image_path: Path to the source portrait image.
        audio_path: Path to the driving audio file.
        output_dir: Directory passed to SadTalker as ``--result_dir`` and
            where the final video is stored.
        fps: Frame rate forwarded on the command line.

    Returns:
        Path of the generated video, or ``None`` when SadTalker is not
        installed, the subprocess fails, or no new MP4 appears. This function
        is best-effort and does not raise for those cases, so callers can fall
        back to a background-only video.
    """
    import sys  # local import: only needed to locate the running interpreter

    err = _check_sadtalker_ready()
    if err:
        # No hard failure: return None so the background alone is used.
        print(f"[SadTalker] {err}")
        return None
    try:
        # The subprocess runs with cwd=SadTalker, so relative paths would be
        # resolved against the wrong directory; make everything absolute.
        output_dir = os.path.abspath(output_dir)
        image_path = os.path.abspath(image_path)
        audio_path = os.path.abspath(audio_path)
        os.makedirs(output_dir, exist_ok=True)

        def _list_mp4():
            return {
                os.path.join(output_dir, f)
                for f in os.listdir(output_dir)
                if f.endswith(".mp4")
            }

        # Snapshot existing videos so an unrelated, older MP4 in output_dir
        # is never mistaken for (and renamed as) the SadTalker result.
        existing = _list_mp4()
        cmd = [
            # Use the interpreter running this app (same environment).
            sys.executable or "python", "inference.py",
            "--driven_audio", audio_path,
            "--source_image", image_path,
            "--result_dir", output_dir,
            "--still", "--enhancer", "gfpgan",
            # NOTE(review): confirm the bundled inference.py accepts --fps.
            "--fps", str(fps),
        ]
        subprocess.run(cmd, cwd=os.path.join(ROOT, "SadTalker"), check=True)

        produced = _list_mp4() - existing
        if not produced:
            print("[SadTalker] Aucune vidéo produite.")
            return None
        latest = max(produced, key=os.path.getctime)
        # Give the result a predictable, unique name.
        out_path = os.path.join(output_dir, f"sadtalker_{uuid.uuid4().hex[:6]}.mp4")
        shutil.move(latest, out_path)
        return out_path
    except Exception as e:
        print("[SadTalker] Erreur:", e)
        return None
 # ============================================================
 # SOUS-TITRES .SRT
 # ============================================================
     raise RuntimeError(last_err or "FFmpeg a échoué")
 # ============================================================
+# BUILD CAPSULE — Pipeline complet (corrigé)
 # ============================================================
 def build_capsule(titre, sous_titre, texte_voix, texte_ecran, theme,
                   image_fond=None, logo_path=None, logo_pos="haut-gauche",
                   fond_mode="plein écran",
                   image_presentateur=None, voix_type="Féminine",
                   position_presentateur="bottom-right", plein=False,
                   moteur_voix="Parler-TTS (offline)", langue="fr", speaker=None):
     """Build one capsule video: voice-over, background, optional presenter, subtitles.

     Steps: synthesize ``texte_voix`` with ``tts_edge`` (falling back to
     ``tts_gtts``), render the background with ``make_background``, optionally
     overlay a SadTalker animation of ``image_presentateur``, export the video,
     write the SRT file and append the capsule to the manifest.

     Args:
         titre, sous_titre, texte_ecran, theme, logo_path, logo_pos,
         image_fond, fond_mode: Forwarded to ``make_background``; ``titre``
             and ``theme`` are also stored in the manifest entry.
         texte_voix: Text given to the TTS engine and to ``write_srt``.
         image_presentateur: Optional portrait path; ignored when empty or
             missing on disk, or when SadTalker returns no video.
         voix_type: Label used in the manifest and status message when
             ``speaker`` is not set.
         position_presentateur: One of ``bottom-right``, ``bottom-left``,
             ``top-right``, ``top-left``, ``center``; unknown values fall back
             to bottom-right. Ignored when ``plein`` is true.
         plein: When true the presenter is resized to ``(W, H)`` and centered.
         moteur_voix: Accepted for interface compatibility; not read here.
         langue: ``"fr"`` selects the French default voice, anything else the
             Dutch one; also passed to the gTTS fallback and the output name.
         speaker: Explicit Edge-TTS voice; overrides the language default.

     Returns:
         Tuple ``(video_path, status_message, srt_path)``.

     Raises:
         Any exception raised by the gTTS fallback, background rendering,
         MoviePy or the export helper. Clips and temporary audio/video files
         are released before the exception propagates.
     """
     # 1) TTS (Edge multi-voice, gTTS as fallback)
     default_voice = "fr-FR-DeniseNeural" if langue == "fr" else "nl-NL-MaaikeNeural"
     try:
         audio_mp = tts_edge(texte_voix, voice=speaker or default_voice)
     except Exception as e:
         print(f"[Capsule] Erreur TTS Edge ({e}), fallback gTTS.")
         audio_mp = tts_gtts(texte_voix, lang=langue)
     # Make sure we work from a WAV; keep the original file if conversion fails.
     audio_wav = audio_mp
     if not audio_mp.lower().endswith(".wav"):
         try:
             audio_wav = _normalize_audio_to_wav(audio_mp)
         except Exception as e:
             print(f"[Audio] Normalisation échouée ({e}), on garde {audio_mp}")
     target_fps = 25
     opened = []            # every clip that must be closed, in creation order
     presenter_path = None  # temporary SadTalker video, removed once exported
     try:
         # 2) Background (PIL)
         fond_path = make_background(titre, sous_titre, texte_ecran, theme,
                                     logo_path, logo_pos, image_fond, fond_mode)
         # 3) MoviePy (slow imports kept local to this function)
         from moviepy.editor import ImageClip, AudioFileClip, CompositeVideoClip, VideoFileClip
         import moviepy.video.fx.all as vfx
         audio = AudioFileClip(audio_wav)
         opened.append(audio)
         dur = float(audio.duration or 5.0)
         bg = ImageClip(fond_path).set_duration(dur)
         opened.append(bg)
         # 4) SadTalker (optional)
         clips = [bg]
         if image_presentateur and os.path.exists(image_presentateur):
             presenter_path = generate_sadtalker_video(image_presentateur, audio_wav, fps=target_fps)
             if presenter_path and os.path.exists(presenter_path):
                 # Keep a handle on the file-backed clip so its reader can be closed.
                 presenter_src = VideoFileClip(presenter_path)
                 opened.append(presenter_src)
                 v = presenter_src.without_audio().fx(vfx.loop, duration=dur)
                 if plein:
                     v = v.resize((W, H)).set_position(("center", "center"))
                 else:
                     v = v.resize(width=520)
                     pos_map = {
                         "bottom-right": ("right", "bottom"),
                         "bottom-left": ("left", "bottom"),
                         "top-right": ("right", "top"),
                         "top-left": ("left", "top"),
                         "center": ("center", "center"),
                     }
                     v = v.set_position(pos_map.get(position_presentateur, ("right", "bottom")))
                 clips.append(v)
         # 5) Composition + export
         final = CompositeVideoClip(clips).set_audio(audio.set_fps(44100))
         opened.append(final)
         name = _safe_name(f"{titre}_{langue}")
         out_base = os.path.join(OUT_DIR, name)
         out = _write_video_with_fallback(final, out_base, fps=target_fps)
         # 6) Subtitles + manifest
         srt_path = write_srt(texte_voix, dur)
         capsules.append({
             "file": out,
             "title": titre,
             "langue": langue,
             "voice": speaker or voix_type,
             "theme": theme,
             "duration": round(dur, 1)
         })
         _save_manifest()
     finally:
         # 7) Cleanup runs on success and on failure; each step is best-effort
         # so one failing close/remove does not hide the others or the real error.
         for clip in reversed(opened):
             try:
                 clip.close()
             except Exception as e:
                 print(f"[Clean] Erreur nettoyage : {e}")
         for tmp_file in {audio_mp, audio_wav, presenter_path}:
             try:
                 if tmp_file and os.path.exists(tmp_file):
                     os.remove(tmp_file)
             except Exception as e:
                 print(f"[Clean] Erreur nettoyage : {e}")
         gc.collect()
     return out, f"✅ Capsule {langue.upper()} créée ({dur:.1f}s, voix {speaker or voix_type})", srt_path
 # ============================================================
 # GESTION / ASSEMBLAGE
 # ============================================================
 print("[INIT] Lancement de Gradio...")
 init_edge_voices()
+with gr.Blocks(title="Créateur de Capsules CPAS – SadTalker + Kokoro",
                theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🎬 Créateur de Capsules CPAS – Version complète (SadTalker + Kokoro)")
+    gr.Markdown("**Astuce** : pour un démarrage instantané, chargez le dossier `SadTalker/checkpoints/` dans le Space (voir README).")
     with gr.Tab("Créer une capsule"):
         with gr.Row():
                 logo_path = gr.Image(label="🏛 Logo", type="filepath")
                 logo_pos = gr.Radio(["haut-gauche","haut-droite","centre"],
                                     label="Position logo", value="haut-gauche")
+                image_presentateur = gr.Image(label="🧑‍🎨 Image du présentateur (portrait pour SadTalker)", type="filepath")
                 position_presentateur = gr.Radio(["bottom-right","bottom-left","top-right","top-left","center"],
+                                                 label="Position", value="bottom-right")
                 plein = gr.Checkbox(label="Plein écran présentateur", value=False)
             with gr.Column():
                 titre = gr.Textbox(label="Titre", value="Aide médicale urgente / Dringende medische hulp")
                 sous_titre = gr.Textbox(label="Sous-titre", value="Soins accessibles à tous / Toegankelijke zorg voor iedereen")
                 theme = gr.Radio(list(THEMES.keys()), label="Thème", value="Bleu Professionnel")
+                langue = gr.Radio(["fr", "nl"], label="Langue de la voix", value="fr")
                 def maj_voix(lang):
                     """Return a Gradio update that refreshes the voice dropdown for *lang*.

                     The choices come from ``get_edge_voices(lang)`` and the first
                     one (if any) becomes the selected value. On any error the
                     dropdown is emptied and the error is logged.
                     """
                     try:
                         voices = get_edge_voices(lang)
                         return gr.update(choices=voices, value=voices[0] if voices else None)
                     except Exception as e:
                         # Log the cause so an empty dropdown can be diagnosed.
                         print(f"[Voix] Erreur chargement des voix ({e})")
                         return gr.update(choices=[], value=None)
                 speaker_id = gr.Dropdown(
                     label="🎙 Voix Edge-TTS",
                     choices=get_edge_voices("fr"),
                     value="fr-FR-DeniseNeural",
                     info="Liste dynamique des voix Edge-TTS (FR & NL)"
                 )
                 langue.change(maj_voix, [langue], [speaker_id])
+                voix_type = gr.Radio(["Féminine","Masculine"], label="Voix IA", value="Féminine")
+                moteur_voix = gr.Radio(
+                    ["Kokoro (HuggingFace, offline)", "gTTS (en ligne)"],
+                    label="Moteur voix",
+                    value="Kokoro (HuggingFace, offline)"
                 )
+                texte_voix = gr.Textbox(label="Texte voix off", lines=4,
+                                        value="Bonjour, le CPAS de Bruxelles vous aide pour vos soins de santé.")
                 texte_ecran = gr.Textbox(label="Texte à l'écran", lines=4,
                                          value="💊 Aides médicales\n🏥 Soins urgents\n📋 Formalités simplifiées")
                 btn = gr.Button("🎬 Créer Capsule", variant="primary")
         sortie = gr.Video(label="Capsule générée")
+        srt_out = gr.File(label="Sous-titres .srt")
         statut = gr.Markdown()
     with gr.Tab("Gestion & Assemblage"):
         sortie_finale = gr.Video(label="Vidéo finale")
         btn_asm.click(lambda: assemble_final(), [], [sortie_finale, message])
+    def creer_capsule_ui(t, st, tv, te, th, img, fmode, logo, pos_logo, ip, vx, pos_p, plein, motor, lang, speaker):
         try:
+            vid, msg, srt = build_capsule(t, st, tv, te, th,
+                                          img, logo, pos_logo, fmode,
+                                          ip, vx, pos_p, plein,
+                                          motor, lang, speaker=speaker)
             return vid, srt, msg, table_capsules()
         except Exception as e:
             return None, None, f"❌ Erreur: {e}\n\n{traceback.format_exc()}", table_capsules()
         creer_capsule_ui,
         [titre, sous_titre, texte_voix, texte_ecran, theme,
          image_fond, fond_mode, logo_path, logo_pos,
+         image_presentateur, voix_type, position_presentateur,
+         plein, moteur_voix, langue, speaker_id],
         [sortie, srt_out, statut, liste]
     )
 if __name__ == "__main__":
     demo.launch()