Spaces:

jojonocode
/

yawostt

Sleeping

App Files Files Community

jojonocode committed on Feb 25

Commit

782ca36

verified ·

1 Parent(s): 7328021

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -54

app.py CHANGED Viewed

@@ -1,71 +1,72 @@
 import gradio as gr
-import torch
 import librosa
 import numpy as np
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
-# Configuration du modèle basée sur vos logs [cite: 53]
 MODEL_NAME = "abiyo27/whisper-small-ewe-2"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-# Chargement optimisé du processeur et du modèle [cite: 55, 57]
-processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-model = WhisperForConditionalGeneration.from_pretrained(
-    MODEL_NAME,
-    torch_dtype=torch_dtype
-).to(device)
-# Création d'une pipeline pour la rapidité et la gestion automatique du chunking
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    model_kwargs={"torch_dtype": torch_dtype},
-    device=device,
-)
-def transcribe(audio, state=""):
-    """
-    Fonction de transcription robuste gérant les fichiers et le microphone.
-    """
     if audio is None:
-        return state
-    # Chargement et rééchantillonnage à 16000Hz comme requis [cite: 59, 65]
     sr, y = audio
     y = y.astype(np.float32)
-    y /= np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1
-    # Transcription via pipeline (plus rapide pour les longs fichiers)
-    # On force la tâche de transcription comme spécifié dans vos logs [cite: 69]
-    result = pipe(
-        {"sampling_rate": sr, "raw": y},
-        generate_kwargs={"task": "transcribe", "max_new_tokens": 256}
-    )
-    return result["text"]
 def stream_transcribe(audio, state=""):
-    """
-    Version optimisée pour le streaming temps réel.
-    """
-    if audio is None:
         return state
-    # Traitement rapide par segments
-    sr, y = audio
-    y = y.astype(np.float32)
-    y /= np.max(np.abs(y)) if np.max(np.abs(y)) > 0 else 1
-    result = pipe({"sampling_rate": sr, "raw": y})
-    return result["text"]
-# Interface Gradio
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown(f"# 🎙️ Ewe STT - ")
-    gr.Markdown("Transcription automatique du français vers l'Ewe ou transcription directe de l'Ewe.")
     with gr.Tabs():
         # Onglet 1: Fichier et Enregistrement classique
@@ -75,17 +76,17 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             with gr.Row():
                 transcribe_btn = gr.Button("Transcrire", variant="primary")
             output_text = gr.Textbox(label="Transcription Ewe", placeholder="Le texte apparaîtra ici...")
             transcribe_btn.click(
                 fn=transcribe,
                 inputs=audio_input,
                 outputs=output_text,
-                api_name="predict" # Point d'accès API standard
             )
         # Onglet 2: Streaming temps réel
         with gr.TabItem("Temps Réel (Streaming)"):
-            gr.Markdown("*Note: Parle et regarde *")
             stream_input = gr.Audio(
                 label="Microphone",
                 sources=["microphone"],
@@ -93,7 +94,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 type="numpy"
             )
             stream_output = gr.Textbox(label="Flux de transcription direct")
             stream_input.stream(
                 fn=stream_transcribe,
                 inputs=stream_input,
@@ -102,10 +103,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             )
     gr.HTML("""
-        <div style="text-align: center; color: #666;">
-            Modèle utilisé : <b>yawo stt-ewe-2</b> | O(FP16)
         </div>
     """)
 if __name__ == "__main__":
     demo.queue().launch()

+import os
 import gradio as gr
 import librosa
 import numpy as np
+import ctranslate2
+from faster_whisper import WhisperModel
# --- 1. MODEL CONFIGURATION AND CONVERSION ---
# Hugging Face checkpoint to serve, and the local directory holding its
# CTranslate2 conversion.
MODEL_NAME = "abiyo27/whisper-small-ewe-2"
CT2_MODEL_DIR = "whisper-small-ewe-2-ct2"

# Convert at startup only when the converted directory is not there yet.
if not os.path.exists(CT2_MODEL_DIR):
    print(f"⏳ Conversion de {MODEL_NAME} au format CTranslate2 (int8)...")
    print("Cela prendra environ une minute au premier lancement.")
    # Fetch the Hugging Face model and write an int8-quantized copy
    # (int8 is the CPU-oriented setting chosen for this Space).
    converter = ctranslate2.converters.TransformersConverter(MODEL_NAME)
    converter.convert(
        output_dir=CT2_MODEL_DIR,
        quantization="int8",
    )
    print("✅ Conversion terminée !")

# --- 2. OPTIMIZED LOADING (FASTER-WHISPER) ---
print("🚀 Chargement du modèle faster-whisper en mémoire...")
# Load the converted model on CPU with int8 compute and two CPU threads.
model = WhisperModel(
    CT2_MODEL_DIR,
    device="cpu",
    compute_type="int8",
    cpu_threads=2,
)
+# --- 3. FONCTIONS DE TRAITEMENT ---
def preprocess_audio(audio):
    """Convert a Gradio ``(sample_rate, samples)`` pair to mono float32 at 16 kHz.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple where
            ``samples`` is a NumPy array, either 1-D (mono) or 2-D with
            shape ``(samples, channels)``.

    Returns:
        A 1-D float32 array, peak-normalized to [-1, 1] and resampled to
        16 kHz, or ``None`` when ``audio`` is ``None`` or holds no samples.
    """
    if audio is None:
        return None

    sr, y = audio
    y = y.astype(np.float32)

    # Multi-channel recordings arrive as (samples, channels). Downmix to
    # mono first; otherwise the resampler would run along the channel axis
    # (its default is the last axis) and the model would get 2-D input.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # An empty buffer has no peak (np.max would raise) and nothing to
    # transcribe, so report it the same way as missing audio.
    if y.size == 0:
        return None

    # Peak normalization; a silent buffer is left as-is to avoid dividing by 0.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # The model is fed 16 kHz audio, so resample anything else.
    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    return y
def _run_transcription(audio, state, beam_size):
    """Shared implementation behind ``transcribe`` and ``stream_transcribe``.

    Args:
        audio: Gradio ``(sample_rate, samples)`` pair, or ``None``.
        state: Value returned unchanged when there is no usable audio.
        beam_size: Beam width passed to the model (larger is slower).

    Returns:
        The transcribed text, or ``state`` when preprocessing yields no audio.
    """
    samples = preprocess_audio(audio)
    if samples is None:
        return state

    # task="transcribe" is forced so the model never switches to translation.
    segments, _ = model.transcribe(samples, beam_size=beam_size, task="transcribe")

    # Segment texts typically carry their own leading space; strip each one
    # and drop empty pieces so the joined result has single spaces only.
    pieces = (segment.text.strip() for segment in segments)
    return " ".join(piece for piece in pieces if piece)


def transcribe(audio, state=""):
    """Transcribe a complete file or microphone recording.

    Uses ``beam_size=5``, favouring accuracy over speed. Returns ``state``
    when there is no audio to process.
    """
    return _run_transcription(audio, state, beam_size=5)


def stream_transcribe(audio, state=""):
    """Transcribe a streamed microphone chunk.

    Uses ``beam_size=1`` (greedy decoding), favouring speed over accuracy.
    Returns ``state`` when there is no audio to process.
    """
    return _run_transcription(audio, state, beam_size=1)
+# --- 4. INTERFACE GRADIO ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"# 🎙️ Ewe STT - Faster Whisper CPU")
+    gr.Markdown("Transcription ultra-rapide optimisée pour processeur. Traduction automatique du français vers l'Ewe ou transcription directe.")
     with gr.Tabs():
         # Onglet 1: Fichier et Enregistrement classique
             with gr.Row():
                 transcribe_btn = gr.Button("Transcrire", variant="primary")
             output_text = gr.Textbox(label="Transcription Ewe", placeholder="Le texte apparaîtra ici...")
             transcribe_btn.click(
                 fn=transcribe,
                 inputs=audio_input,
                 outputs=output_text,
+                api_name="predict"
             )
         # Onglet 2: Streaming temps réel
         with gr.TabItem("Temps Réel (Streaming)"):
+            gr.Markdown("*Note : Le streaming sur CPU gratuit reste expérimental, parlez clairement.*")
             stream_input = gr.Audio(
                 label="Microphone",
                 sources=["microphone"],
                 type="numpy"
             )
             stream_output = gr.Textbox(label="Flux de transcription direct")
             stream_input.stream(
                 fn=stream_transcribe,
                 inputs=stream_input,
             )
     gr.HTML("""
+        <div style="text-align: center; color: #666; margin-top: 20px;">
+            Modèle utilisé : <b>yawo stt-ewe-2</b> | Optimisation : <b>CTranslate2 (INT8)</b>
         </div>
     """)
 if __name__ == "__main__":
+    # La queue est importante pour gérer plusieurs requêtes sans planter le CPU
     demo.queue().launch()