Spaces:

RobotsMali
/

RobotsMali_ASR_DEMO

Runtime error

App Files Community

binaryMao committed on Mar 16

Commit

6a368d1

verified ·

1 Parent(s): 7b84d76

Update app.py

Browse files

Files changed (1) hide show

app.py +255 -38

app.py CHANGED Viewed

@@ -4,12 +4,16 @@ from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 from nemo.collections.asr.models import EncDecCTCModel, EncDecRNNTModel
 import gradio as gr
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 10.0
 print(f"✅ Démarrage sur device: {DEVICE}")
 print(f"✅ Gradio version: {gr.__version__}")
 # Dictionnaire des modèles RobotsMali
 MODELS = {
@@ -28,23 +32,28 @@ _cache = {}
 def get_model(name):
     if name in _cache:
         return _cache[name]
     print(f"📥 Chargement du modèle: {name}")
     # Gestion agressive de la mémoire
     if len(_cache) >= 1:
         _cache.clear()
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
     try:
         repo, arch_type = MODELS[name]
         print(f"📦 Téléchargement depuis {repo}...")
         folder = snapshot_download(repo, local_dir_use_symlinks=False)
-        print(f"📁 Dossier: {folder}")
         nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
@@ -52,17 +61,54 @@ def get_model(name):
             raise FileNotFoundError(f"Aucun fichier .nemo trouvé dans {folder}")
         print(f"🔧 Restauration du modèle depuis {nemo_file}")
-        if arch_type == "ctc":
-            model = EncDecCTCModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
-        else:
-            model = EncDecRNNTModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
         model.eval()
         if DEVICE == "cuda":
             model = model.half()
         print(f"✅ Modèle {name} chargé avec succès")
         _cache[name] = model
         return model
@@ -71,96 +117,267 @@ def get_model(name):
         print(traceback.format_exc())
         raise e
-def pipeline(audio_in, model_name):
     if not audio_in:
-        yield "❌ Erreur", "Aucun audio détecté."
         return
     tmp_dir = tempfile.mkdtemp()
     try:
-        yield "⏳ Traitement de l'audio...", ""
-        # Vérification que le fichier audio existe
         if not os.path.exists(audio_in):
-            yield "❌ Erreur", f"Fichier audio introuvable: {audio_in}"
             return
-        wav_path = os.path.join(tmp_dir, "input.wav")
-        # Conversion audio
         cmd = f"ffmpeg -y -i {shlex.quote(audio_in)} -ac 1 -ar 16000 {shlex.quote(wav_path)}"
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
         if result.returncode != 0:
-            yield "❌ Erreur", f"Erreur FFmpeg: {result.stderr}"
             return
         if not os.path.exists(wav_path) or os.path.getsize(wav_path) == 0:
-            yield "❌ Erreur", "Fichier audio converti vide"
             return
-        # Segmentation
         seg_pattern = os.path.join(tmp_dir, 'seg_%03d.wav')
         cmd = f"ffmpeg -i {shlex.quote(wav_path)} -f segment -segment_time {SEGMENT_DURATION} -c copy {shlex.quote(seg_pattern)}"
         subprocess.run(cmd, shell=True, capture_output=True)
         valid_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
         if not valid_segments:
-            yield "❌ Erreur", "Fichier audio vide ou incompatible."
             return
-        print(f"🔊 {len(valid_segments)} segments à transcrire")
         model = get_model(model_name)
         with torch.inference_mode():
-            batch_hyp = model.transcribe(valid_segments, batch_size=4, return_hypotheses=True)
-        results = [hyp.text if hasattr(hyp, 'text') else str(hyp) for hyp in batch_hyp]
-        yield "✅ Succès", " ".join(results)
     except Exception as e:
         print(traceback.format_exc())
-        yield "❌ Erreur", str(e)
     finally:
         if os.path.exists(tmp_dir):
             shutil.rmtree(tmp_dir)
 # Interface Gradio
 with gr.Blocks(title="RobotsMali ASR", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🤖 RobotsMali - Reconnaissance Vocale")
     with gr.Row():
-        with gr.Column():
             audio_input = gr.Audio(
-                label="Audio",
                 type="filepath",
-                sources=["upload", "microphone"]
             )
             model_input = gr.Dropdown(
                 choices=list(MODELS.keys()),
-                value="Soloni V3 (TDT-CTC)",
-                label="Modèle"
             )
-            run_btn = gr.Button("🚀 DÉMARRER", variant="primary")
         with gr.Column():
-            status = gr.Markdown("### État : En attente")
-            text_output = gr.Textbox(label="Transcription", lines=12)
     run_btn.click(
         fn=pipeline,
         inputs=[audio_input, model_input],
-        outputs=[status, text_output]
     )
 # Point d'entrée - Configuration pour Hugging Face Spaces
 if __name__ == "__main__":
     print("🚀 Lancement de l'application RobotsMali ASR...")
     # Configuration simple pour Spaces
     demo.queue().launch(
         server_name="0.0.0.0",
-        server_port=7860
-        # Pas de show_api, pas de share - juste l'essentiel
     )

 from nemo.collections import asr as nemo_asr
 from nemo.collections.asr.models import EncDecCTCModel, EncDecRNNTModel
 import gradio as gr
+import time
+import psutil
+import humanize
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 10.0
 print(f"✅ Démarrage sur device: {DEVICE}")
 print(f"✅ Gradio version: {gr.__version__}")
+print(f"✅ Mémoire disponible: {humanize.naturalsize(psutil.virtual_memory().available)}")
 # Dictionnaire des modèles RobotsMali
 MODELS = {
 def get_model(name):
     if name in _cache:
+        print(f"✅ Modèle {name} déjà en cache")
         return _cache[name]
     print(f"📥 Chargement du modèle: {name}")
     # Gestion agressive de la mémoire
     if len(_cache) >= 1:
+        print("🧹 Nettoyage du cache...")
         _cache.clear()
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+            print(f"🧹 Mémoire GPU libérée: {torch.cuda.memory_allocated()/1e9:.2f}GB")
     try:
         repo, arch_type = MODELS[name]
         print(f"📦 Téléchargement depuis {repo}...")
+        start_time = time.time()
         folder = snapshot_download(repo, local_dir_use_symlinks=False)
+        download_time = time.time() - start_time
+        print(f"📁 Dossier: {folder} (téléchargé en {download_time:.1f}s)")
         nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
             raise FileNotFoundError(f"Aucun fichier .nemo trouvé dans {folder}")
         print(f"🔧 Restauration du modèle depuis {nemo_file}")
+        print(f"📊 Taille du fichier: {humanize.naturalsize(os.path.getsize(nemo_file))}")
+        # Chargement avec gestion d'erreur
+        try:
+            if arch_type == "ctc":
+                model = EncDecCTCModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
+            else:
+                model = EncDecRNNTModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
+        except Exception as e:
+            print(f"⚠️ Erreur de chargement spécifique, tentative avec ASRModel: {e}")
+            model = nemo_asr.models.ASRModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
+        # === CORRECTION : Patch de la configuration pour éviter l'erreur key_phrase_items_list ===
+        try:
+            if hasattr(model, 'cfg'):
+                def remove_key_phrase_items_list(config):
+                    if isinstance(config, dict):
+                        if 'key_phrase_items_list' in config:
+                            del config['key_phrase_items_list']
+                            print("✅ Clé problématique key_phrase_items_list supprimée")
+                        for key, value in config.items():
+                            if isinstance(value, (dict, list)):
+                                remove_key_phrase_items_list(value)
+                    elif isinstance(config, list):
+                        for item in config:
+                            if isinstance(item, (dict, list)):
+                                remove_key_phrase_items_list(item)
+                remove_key_phrase_items_list(model.cfg)
+                # Désactiver boosting_tree si présent
+                if 'decoding' in model.cfg:
+                    if 'greedy' in model.cfg.decoding:
+                        if 'boosting_tree' in model.cfg.decoding.greedy:
+                            model.cfg.decoding.greedy.boosting_tree = None
+                            print("✅ Boosting_tree désactivé")
+        except Exception as e:
+            print(f"⚠️ Avertissement lors du patch de config: {e}")
         model.eval()
         if DEVICE == "cuda":
             model = model.half()
+            print(f"🎯 Modèle converti en half precision")
         print(f"✅ Modèle {name} chargé avec succès")
+        if DEVICE == "cuda":
+            print(f"📊 Mémoire GPU utilisée: {torch.cuda.memory_allocated()/1e9:.2f}GB")
         _cache[name] = model
         return model
         print(traceback.format_exc())
         raise e
+def get_audio_info(filepath):
+    """Récupère les informations d'un fichier audio"""
+    try:
+        cmd = f"ffprobe -v quiet -print_format json -show_format -show_streams {shlex.quote(filepath)}"
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+        if result.returncode == 0:
+            import json
+            info = json.loads(result.stdout)
+            streams = info.get('streams', [])
+            audio_stream = next((s for s in streams if s.get('codec_type') == 'audio'), None)
+            if audio_stream:
+                duration = float(info['format'].get('duration', 0))
+                return {
+                    'duration': duration,
+                    'sample_rate': audio_stream.get('sample_rate', '?'),
+                    'channels': audio_stream.get('channels', '?'),
+                    'codec': audio_stream.get('codec_name', '?'),
+                    'size': humanize.naturalsize(int(info['format'].get('size', 0)))
+                }
+    except:
+        pass
+    return None
+def format_time(seconds):
+    """Formate le temps en MM:SS"""
+    minutes = int(seconds // 60)
+    seconds = int(seconds % 60)
+    return f"{minutes:02d}:{seconds:02d}"
+def pipeline(audio_in, model_name, progress=gr.Progress()):
+    """Transcribe an audio file, streaming status updates as a generator.
+
+    Args:
+        audio_in: Path of the input audio file; a falsy value ends the run
+            with an error message.
+        model_name: Name passed to get_model() to obtain the ASR model.
+        progress: Progress callback, called with a fraction and a ``desc`` label.
+
+    Yields:
+        3-tuples ``(status, info, text_update)``: a status string, an
+        informational string (also used for error details), and a
+        ``gr.update(...)`` for the transcription box, which is only made
+        visible once the first batch has been transcribed.
+
+    Failures are reported through a yielded error tuple rather than raised,
+    and the temporary working directory is removed in the ``finally`` block.
+    """
     if not audio_in:
+        yield "❌ Erreur", "Aucun audio détecté.", gr.update(visible=False)
         return
+    # Scratch directory for the converted WAV and its segments.
     tmp_dir = tempfile.mkdtemp()
     try:
+        # === PHASE 1: Inspect the original audio ===
+        yield "🔍 Analyse du fichier audio...", "", gr.update(visible=False)
+        audio_info = get_audio_info(audio_in)
+        if audio_info:
+            duration = audio_info['duration']
+            duration_str = format_time(duration)
+            info_text = f"""
+            📊 **Informations audio :**
+            - Durée : {duration_str} ({duration:.1f} secondes)
+            - Taille : {audio_info['size']}
+            - Fréquence : {audio_info['sample_rate']} Hz
+            - Canaux : {audio_info['channels']}
+            - Codec : {audio_info['codec']}
+            """
+        else:
+            duration = 0
+            info_text = "ℹ️ Impossible de lire les métadonnées audio"
+        # duration_str is only bound when metadata was read; in the other branch
+        # duration is 0, so the conditional below never evaluates it.
+        yield f"⏳ Préparation... ({duration_str if duration > 0 else '??'})", info_text, gr.update(visible=False)
+        # === PHASE 2: Audio conversion ===
+        yield f"🔄 Conversion audio (étape 1/3)...", info_text, gr.update(visible=False)
+        progress(0.1, desc="Conversion audio...")
+        wav_path = os.path.join(tmp_dir, "input.wav")
+        # Check that the source file exists.
+        # NOTE(review): this runs after get_audio_info() has already been called on the same path.
         if not os.path.exists(audio_in):
+            yield "❌ Erreur", f"Fichier audio introuvable: {audio_in}", gr.update(visible=False)
             return
+        # Convert with FFmpeg to a mono (-ac 1), 16 kHz (-ar 16000) WAV file
         cmd = f"ffmpeg -y -i {shlex.quote(audio_in)} -ac 1 -ar 16000 {shlex.quote(wav_path)}"
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
         if result.returncode != 0:
+            # Only the first 200 characters of stderr are shown to the user.
+            error_msg = f"Erreur FFmpeg: {result.stderr[:200]}..."
+            yield "❌ Erreur", error_msg, gr.update(visible=False)
             return
         if not os.path.exists(wav_path) or os.path.getsize(wav_path) == 0:
+            yield "❌ Erreur", "Fichier audio converti vide", gr.update(visible=False)
             return
+        # Size of the converted audio
+        converted_size = humanize.naturalsize(os.path.getsize(wav_path))
+        info_text += f"\n- Après conversion : {converted_size}"
+        # === PHASE 3: Segmentation ===
+        yield f"✂️ Segmentation audio (étape 2/3)...", info_text, gr.update(visible=False)
+        progress(0.3, desc="Segmentation...")
+        # Split the WAV into SEGMENT_DURATION-second files named seg_000.wav, seg_001.wav, ...
+        # NOTE(review): the return code of this ffmpeg call is not checked; a failure
+        # is only noticed below when no segment file is found.
         seg_pattern = os.path.join(tmp_dir, 'seg_%03d.wav')
         cmd = f"ffmpeg -i {shlex.quote(wav_path)} -f segment -segment_time {SEGMENT_DURATION} -c copy {shlex.quote(seg_pattern)}"
         subprocess.run(cmd, shell=True, capture_output=True)
         valid_segments = sorted(glob.glob(os.path.join(tmp_dir, "seg_*.wav")))
         if not valid_segments:
+            yield "❌ Erreur", "Fichier audio vide ou incompatible après segmentation.", gr.update(visible=False)
             return
+        nb_segments = len(valid_segments)
+        # NOTE(review): estimated_time is assigned but never read afterwards.
+        estimated_time = nb_segments * 2  # Estimate: ~2 seconds per segment
+        info_text += f"\n📦 **Segments :** {nb_segments} (durée ~{format_time(duration)})"
+        yield f"🎯 Transcription en cours (étape 3/3)... {nb_segments} segments", info_text, gr.update(visible=False)
+        progress(0.5, desc=f"Transcription de {nb_segments} segments...")
+        print(f"🔊 {nb_segments} segments à transcrire")
+        # === PHASE 4: Model loading ===
+        yield f"🤖 Chargement du modèle {model_name}...", info_text, gr.update(visible=False)
         model = get_model(model_name)
+        # === PHASE 5: Transcription ===
+        yield f"📝 Transcription en cours... (0/{nb_segments})", info_text, gr.update(visible=False)
+        all_results = []
+        batch_size = 4
         with torch.inference_mode():
+            # Transcribe batch_size segments at a time so partial text can be shown.
+            for i in range(0, len(valid_segments), batch_size):
+                batch = valid_segments[i:i+batch_size]
+                batch_hyp = model.transcribe(batch, batch_size=len(batch), return_hypotheses=True)
+                # Use the .text attribute when present, otherwise the string form of the result.
+                batch_results = [hyp.text if hasattr(hyp, 'text') else str(hyp) for hyp in batch_hyp]
+                all_results.extend(batch_results)
+                # Progress update: transcription covers the 0.5 -> 1.0 range of the bar
+                processed = min(i + batch_size, nb_segments)
+                progress_val = 0.5 + (0.5 * processed / nb_segments)
+                progress(progress_val, desc=f"Transcription {processed}/{nb_segments}")
+                # Show the partial results
+                partial_text = " ".join(all_results)
+                yield f"📝 Transcription en cours... ({processed}/{nb_segments})", info_text, gr.update(value=partial_text, visible=True)
+        final_text = " ".join(all_results)
+        # === END ===
+        success_text = f"""
+        ✅ **Transcription terminée !**
+        - Modèle : {model_name}
+        - Durée audio : {format_time(duration) if duration > 0 else '?'}
+        - Segments : {nb_segments}
+        {info_text}
+        """
+        yield "✅ Succès", success_text, gr.update(value=final_text, visible=True)
     except Exception as e:
+        # Top-level boundary: log the traceback and report the error through a yield.
         print(traceback.format_exc())
+        error_msg = f"❌ Erreur: {str(e)}"
+        yield error_msg, "", gr.update(visible=False)
     finally:
+        # Always remove the scratch directory, whether the run succeeded or not.
         if os.path.exists(tmp_dir):
             shutil.rmtree(tmp_dir)
+            print(f"🧹 Nettoyage du répertoire temporaire: {tmp_dir}")
 # Interface Gradio
 with gr.Blocks(title="RobotsMali ASR", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 RobotsMali - Reconnaissance Vocale
+    **Transcription automatique de l'audio en texte** avec les modèles de RobotsMali.
+    Choisissez un modèle, uploadez un fichier audio ou enregistrez-vous, et lancez la transcription !
+    """)
     with gr.Row():
+        with gr.Column(scale=1):
             audio_input = gr.Audio(
+                label="🎤 Audio",
                 type="filepath",
+                sources=["upload", "microphone"],
+                waveform_options=gr.WaveformOptions(
+                    waveform_color="#3498db",
+                    waveform_progress_color="#2ecc71",
+                )
             )
             model_input = gr.Dropdown(
                 choices=list(MODELS.keys()),
+                value=list(MODELS.keys())[4],  # Soloni V3 par défaut
+                label="🧠 Modèle",
+                info="Sélectionnez le modèle de transcription"
             )
+            run_btn = gr.Button("🚀 DÉMARRER LA TRANSCRIPTION", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            status = gr.Markdown("### 📊 État : En attente")
+            audio_info = gr.Markdown("ℹ️ Chargez un audio pour voir ses informations")
+    with gr.Row():
         with gr.Column():
+            text_output = gr.Textbox(
+                label="📝 Transcription",
+                lines=8,
+                placeholder="La transcription apparaîtra ici...",
+                interactive=False,
+                visible=False
+            )
+    with gr.Row():
+        gr.Examples(
+            examples=[
+                ["exemples/audio1.wav"],
+                ["exemples/audio2.mp3"],
+            ],
+            inputs=audio_input,
+            label="📋 Exemples (si disponibles)"
+        )
+    # Footer
+    gr.Markdown("""
+    ---
+    ### 📌 Notes
+    - Les fichiers audio sont traités localement et supprimés après transcription
+    - Durée maximale recommandée : 5 minutes
+    - Modèles entraînés par [RobotsMali](https://huggingface.co/RobotsMali)
+    """)
+    # Gestionnaire d'événements
     run_btn.click(
         fn=pipeline,
         inputs=[audio_input, model_input],
+        outputs=[status, audio_info, text_output]
+    )
+    # Afficher les infos audio quand un fichier est chargé
+    def on_audio_upload(audio):
+        if audio:
+            info = get_audio_info(audio)
+            if info:
+                duration_str = format_time(info['duration'])
+                return f"""
+                ### ✅ Audio chargé
+                - Durée : {duration_str} ({info['duration']:.1f}s)
+                - Taille : {info['size']}
+                - Format : {info['sample_rate']}Hz, {info['channels']} canaux
+                Cliquez sur **DÉMARRER** pour transcrire !
+                """
+            else:
+                return "✅ Audio chargé. Prêt pour la transcription."
+        return "ℹ️ Chargez un audio pour voir ses informations"
+    audio_input.change(
+        fn=on_audio_upload,
+        inputs=audio_input,
+        outputs=audio_info
     )
 # Point d'entrée - Configuration pour Hugging Face Spaces
 if __name__ == "__main__":
     print("🚀 Lancement de l'application RobotsMali ASR...")
+    print(f"📊 Mémoire système: {humanize.naturalsize(psutil.virtual_memory().total)}")
+    print(f"🎯 Device: {DEVICE}")
     # Configuration simple pour Spaces
     demo.queue().launch(
         server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
     )