Spaces:

RobotsMali
/

RobotsMali_ASR_DEMO

Runtime error

App Files Community

binaryMao committed on Mar 2

Commit

697b4cb

verified ·

1 Parent(s): 5824c95

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -19

app.py CHANGED Viewed

@@ -3,11 +3,14 @@ import os, shlex, subprocess, tempfile, traceback, time, glob, gc, shutil
 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
 import gradio as gr
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 10.0
 MODELS = {
     "Soloba V3 (CTC)":           ("RobotsMali/soloba-ctc-0.6b-v3", "ctc"),
     "Soloba V2 (CTC)":           ("RobotsMali/soloba-ctc-0.6b-v2", "ctc"),
@@ -23,24 +26,33 @@ MODELS = {
 _cache = {}
 def get_model(name):
     if name in _cache:
         return _cache[name]
-    # NETTOYAGE MÉMOIRE AGRESSIF (Crucial pour les modèles 0.6b sur CPU Free)
     if len(_cache) >= 1:
         _cache.clear()
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
-    repo, _ = MODELS[name]
-    print(f"⏳ Chargement de {name}...")
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
-    model = nemo_asr.models.ASRModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
     model.eval()
     if DEVICE == "cuda":
         model = model.half()
@@ -49,15 +61,15 @@ def get_model(name):
 def pipeline(audio_in, model_name):
     if not audio_in:
-        yield "❌ Erreur", "Veuillez fournir un fichier audio."
         return
     tmp_dir = tempfile.mkdtemp()
     try:
-        yield "⏳ Préparation de l'audio...", ""
-        wav_path = os.path.join(tmp_dir, "clean.wav")
-        # Conversion simple et robuste
         subprocess.run(f"ffmpeg -y -i {shlex.quote(audio_in)} -ac 1 -ar 16000 {wav_path}", shell=True, check=True)
         # Segmentation
@@ -67,7 +79,7 @@ def pipeline(audio_in, model_name):
         valid_segments = [f for f in valid_segments if os.path.getsize(f) > 1000]
         if not valid_segments:
-            yield "❌ Erreur", "Audio trop court ou illisible."
             return
         yield f"🎙️ Transcription ({len(valid_segments)} segments)...", ""
@@ -75,6 +87,7 @@ def pipeline(audio_in, model_name):
         model = get_model(model_name)
         with torch.inference_mode():
             batch_hyp = model.transcribe(
                 valid_segments,
                 batch_size=4,
@@ -84,31 +97,32 @@ def pipeline(audio_in, model_name):
         results = []
         for hyp in batch_hyp:
             text = hyp.text if hasattr(hyp, 'text') else str(hyp)
             if text: results.append(text)
-        yield "✅ Terminé", " ".join(results)
     except Exception as e:
         yield "❌ Erreur", str(e)
     finally:
         if os.path.exists(tmp_dir):
             shutil.rmtree(tmp_dir)
-# --- INTERFACE (Argument show_copy_button supprimé) ---
-with gr.Blocks() as demo:
-    gr.Markdown("# 🤖 RobotsMali Speech-to-Text")
     with gr.Row():
         with gr.Column():
-            audio_input = gr.Audio(label="Audio", type="filepath")
             model_input = gr.Dropdown(choices=list(MODELS.keys()), value="Soloni V3 (TDT-CTC)", label="Modèle")
-            run_btn = gr.Button("🚀 TRANSCRIRE", variant="primary")
         with gr.Column():
-            status = gr.Markdown("### État : Prêt")
-            # Correction ici : show_copy_button est retiré pour compatibilité
-            text_output = gr.Textbox(label="Résultat", lines=15)
     run_btn.click(fn=pipeline, inputs=[audio_input, model_input], outputs=[status, text_output])

 import torch
 from huggingface_hub import snapshot_download
 from nemo.collections import asr as nemo_asr
+# Imports spécifiques pour éviter l'erreur "Abstract Class"
+from nemo.collections.asr.models import EncDecCTCModel, EncDecRNNTModel
 import gradio as gr
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SEGMENT_DURATION = 10.0
+# Dictionnaire complet (Nom: (Repo, Type))
 MODELS = {
     "Soloba V3 (CTC)":           ("RobotsMali/soloba-ctc-0.6b-v3", "ctc"),
     "Soloba V2 (CTC)":           ("RobotsMali/soloba-ctc-0.6b-v2", "ctc"),
 _cache = {}
 def get_model(name):
+    """Charge le modèle en forçant la classe concrète (CTC ou RNNT)."""
     if name in _cache:
         return _cache[name]
+    # Libération agressive de la RAM avant chargement
     if len(_cache) >= 1:
         _cache.clear()
         gc.collect()
         if torch.cuda.is_available(): torch.cuda.empty_cache()
+    repo, arch_type = MODELS[name]
+    print(f"⏳ Préparation du modèle {name} ({arch_type})...")
     folder = snapshot_download(repo, local_dir_use_symlinks=False)
     nemo_file = next((os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nemo")), None)
+    # Utilisation de la classe spécifique pour contourner l'erreur ASRModel
+    try:
+        if arch_type == "ctc":
+            model = EncDecCTCModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
+        else:
+            model = EncDecRNNTModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
+    except Exception as e:
+        print(f"⚠️ Erreur de chargement spécifique, tentative générique : {e}")
+        model = nemo_asr.models.ASRModel.restore_from(nemo_file, map_location=torch.device(DEVICE))
     model.eval()
     if DEVICE == "cuda":
         model = model.half()
 def pipeline(audio_in, model_name):
     if not audio_in:
+        yield "❌ Erreur", "Aucun audio détecté."
         return
     tmp_dir = tempfile.mkdtemp()
     try:
+        yield "⏳ Traitement de l'audio...", ""
+        # Normalisation FFmpeg
+        wav_path = os.path.join(tmp_dir, "input.wav")
         subprocess.run(f"ffmpeg -y -i {shlex.quote(audio_in)} -ac 1 -ar 16000 {wav_path}", shell=True, check=True)
         # Segmentation
         valid_segments = [f for f in valid_segments if os.path.getsize(f) > 1000]
         if not valid_segments:
+            yield "❌ Erreur", "Fichier audio vide ou incompatible."
             return
         yield f"🎙️ Transcription ({len(valid_segments)} segments)...", ""
         model = get_model(model_name)
         with torch.inference_mode():
+            # Mode stable sans Lhotse
             batch_hyp = model.transcribe(
                 valid_segments,
                 batch_size=4,
         results = []
         for hyp in batch_hyp:
+            # Gère les formats de sortie CTC et RNNT
             text = hyp.text if hasattr(hyp, 'text') else str(hyp)
             if text: results.append(text)
+        yield "✅ Succès", " ".join(results)
     except Exception as e:
+        print(traceback.format_exc())
         yield "❌ Erreur", str(e)
     finally:
         if os.path.exists(tmp_dir):
             shutil.rmtree(tmp_dir)
+# --- UI GRADIO ---
+with gr.Blocks(theme=gr.themes.Default()) as demo:
+    gr.Markdown("# 🤖 RobotsMali - Reconnaissance Vocale")
     with gr.Row():
         with gr.Column():
+            audio_input = gr.Audio(label="Audio", type="filepath", sources=["upload", "microphone"])
             model_input = gr.Dropdown(choices=list(MODELS.keys()), value="Soloni V3 (TDT-CTC)", label="Modèle")
+            run_btn = gr.Button("🚀 DÉMARRER", variant="primary")
         with gr.Column():
+            status = gr.Markdown("### État : En attente")
+            text_output = gr.Textbox(label="Transcription", lines=12)
     run_btn.click(fn=pipeline, inputs=[audio_input, model_input], outputs=[status, text_output])