Gamortsey committed on
Commit
e6584a7
·
verified ·
1 Parent(s): f6b0045

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -73
app.py CHANGED
@@ -1,10 +1,13 @@
1
  # app.py
2
  import os
3
  import tempfile
4
- from flask import Flask, request, Response, jsonify
 
5
  from flask_cors import CORS
6
  import torch
7
  import torchaudio
 
 
8
  from transformers import (
9
  AutoProcessor,
10
  AutoModelForSpeechSeq2Seq,
@@ -13,27 +16,28 @@ from transformers import (
13
  )
14
 
15
  # ---------- Configuration ----------
16
- # Use small CPU-friendly models for free HF Spaces
17
- WHISPER_MODEL = "openai/whisper-small"
18
- NLLB_MODEL = "facebook/nllb-200-distilled-600M"
19
 
20
- # Map frontend language names -> (whisper_lang_code, nllb_src_code)
 
21
  LANG_MAP = {
22
- # language_key: (whisper_language_arg, nllb_src_lang_tag)
23
- "akan": (None, "aka_Latn"), # if you have a specialized Akan whisper model, change whisper arg
24
  "hausa": ("ha", "hau_Latn"),
25
  "swahili": ("sw", "swh_Latn"),
26
  "french": ("fr", "fra_Latn"),
27
- "arabic": ("ar", "arb_Arab"), # nllb code for Arabic variants may vary (this is illustrative)
28
  "english": ("en", None),
29
  }
30
 
31
- DEVICE = torch.device("cpu") # Free Spaces = CPU-only
 
32
 
33
  app = Flask(__name__)
34
  CORS(app)
35
 
36
- # ---------- Lazy model manager ----------
37
  class ModelManager:
38
  def __init__(self):
39
  self.whisper_processor = None
@@ -45,26 +49,28 @@ class ModelManager:
45
  def load(self):
46
  if self._loaded:
47
  return
48
- # Whisper processor & model
49
- print("Loading Whisper processor/model (small)...")
50
- self.whisper_processor = AutoProcessor.from_pretrained(WHISPER_MODEL)
51
- self.whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
52
- WHISPER_MODEL
53
- ).to(DEVICE)
54
 
55
- # NLLB tokenizer & model (600M)
56
- print("Loading NLLB tokenizer/model (600M)...")
57
- self.nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
58
- self.nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL).to(DEVICE)
 
 
59
 
60
  self._loaded = True
61
- print("Models loaded.")
62
 
63
  def transcribe(self, audio_path, whisper_language_arg=None):
64
- # loads and runs whisper-small to produce transcription string
65
  if self.whisper_processor is None or self.whisper_model is None:
66
- raise RuntimeError("Whisper not loaded")
67
 
 
68
  waveform, sr = torchaudio.load(audio_path)
69
  if sr != 16000:
70
  waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
@@ -88,33 +94,44 @@ class ModelManager:
88
  return decoded[0].strip()
89
 
90
  def translate_to_english(self, src_text, nllb_src_lang_tag):
91
- # returns english translation string using nllb
 
92
  if not nllb_src_lang_tag:
93
- # if src already english or no mapping, return original
94
  return src_text
 
95
  if self.nllb_tokenizer is None or self.nllb_model is None:
96
- raise RuntimeError("NLLB not loaded")
97
 
98
- # set src_lang on tokenizer (some NLLB tokenizers use this attribute)
99
  try:
100
  self.nllb_tokenizer.src_lang = nllb_src_lang_tag
101
  except Exception:
102
  pass
103
 
104
  inputs = self.nllb_tokenizer(src_text, return_tensors="pt").to(DEVICE)
105
- forced_bos_token_id = self.nllb_tokenizer.convert_tokens_to_ids("eng_Latn")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  with torch.no_grad():
108
- translated_tokens = self.nllb_model.generate(
109
- **inputs,
110
- forced_bos_token_id=forced_bos_token_id,
111
- max_length=512,
112
- num_beams=4,
113
- no_repeat_ngram_size=2,
114
- early_stopping=True
115
- )
116
- out = self.nllb_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
117
- return out.strip()
118
 
119
  model_manager = ModelManager()
120
 
@@ -122,10 +139,10 @@ model_manager = ModelManager()
122
  @app.route("/transcribe", methods=["POST"])
123
  def transcribe_endpoint():
124
  """
125
- Accepts:
126
- - multipart form file field: 'audio' (wav/mp3 etc.)
127
- - form field 'language' (one of keys in LANG_MAP: akan, hausa, swahili, french, arabic, english)
128
- Returns:
129
  - Plain text body with the translated text (Content-Type: text/plain)
130
  """
131
  if "audio" not in request.files:
@@ -133,32 +150,30 @@ def transcribe_endpoint():
133
 
134
  audio_file = request.files["audio"]
135
  language = (request.form.get("language") or request.args.get("language") or "english").lower()
 
136
  if language not in LANG_MAP:
137
  return Response(f"Unsupported language: {language}", status=400, mimetype="text/plain")
138
 
139
  whisper_lang_arg, nllb_src_tag = LANG_MAP[language]
140
 
141
- # Ensure models loaded lazily (first request)
142
  try:
143
  model_manager.load()
144
  except Exception as e:
145
  return Response(f"Model loading failed: {e}", status=500, mimetype="text/plain")
146
 
147
- # Save audio to temp file
148
- tmp_fd, tmp_path = tempfile.mkstemp(suffix=os.path.splitext(audio_file.filename)[1] or ".wav")
149
  os.close(tmp_fd)
150
  audio_file.save(tmp_path)
151
 
152
  try:
153
- # Transcribe (may be slow on CPU)
154
  transcription = model_manager.transcribe(tmp_path, whisper_language_arg=whisper_lang_arg)
155
  if not transcription:
156
- return Response("", status=204, mimetype="text/plain") # empty body if nothing transcribed
 
157
 
158
- # Translate to English (if applicable)
159
  translation = model_manager.translate_to_english(transcription, nllb_src_tag)
160
-
161
- # Return only the translated text (plain text)
162
  return Response(translation, status=200, mimetype="text/plain")
163
  except Exception as e:
164
  return Response(f"Processing failed: {e}", status=500, mimetype="text/plain")
@@ -168,37 +183,108 @@ def transcribe_endpoint():
168
  except Exception:
169
  pass
170
 
171
- # Optional: a tiny Gradio UI for testing (mounted)
 
172
  try:
173
  import gradio as gr
 
 
174
 
175
  def _ui_transcribe(audio, language):
176
- # audio comes as file path from gradio
 
 
 
 
 
 
177
  if audio is None:
178
  return "No audio", ""
179
- # call local endpoint function for consistent behavior
180
- whisper_lang, nllb_tag = LANG_MAP.get(language.lower(), (None, None))
181
- model_manager.load()
182
- trans = model_manager.transcribe(audio, whisper_lang)
183
- trans_en = model_manager.translate_to_english(trans, nllb_tag)
184
- return trans, trans_en
185
-
186
- demo = gr.Interface(
187
- fn=_ui_transcribe,
188
- inputs=[
189
- gr.Audio(source="microphone", type="filepath"),
190
- gr.Dropdown(choices=list(LANG_MAP.keys()), label="Language", value="english")
191
- ],
192
- outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Translation (English)")],
193
- title="Multilingual Transcriber (server)"
194
- )
195
- # mount gradio app under /ui so the REST API remains at /transcribe
196
- from gradio.routes import MountableApp
197
- app = gr.mount_gradio_app(app, demo, path="/ui")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  except Exception as e:
199
  print("Gradio UI unavailable or failed to mount:", e)
 
200
 
 
 
 
 
 
 
201
 
202
  if __name__ == "__main__":
203
- # For local debug only
204
- app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)), debug=False)
 
1
  # app.py
2
  import os
3
  import tempfile
4
+ from pathlib import Path
5
+ from flask import Flask, request, Response, redirect
6
  from flask_cors import CORS
7
  import torch
8
  import torchaudio
9
+
10
+ # Transformers imports (lazy loaded in ModelManager.load to reduce startup overhead)
11
  from transformers import (
12
  AutoProcessor,
13
  AutoModelForSpeechSeq2Seq,
 
16
  )
17
 
18
  # ---------- Configuration ----------
19
+ # Use smaller models suitable for CPU-only Hugging Face Spaces (free tier)
20
+ WHISPER_MODEL = os.environ.get("WHISPER_MODEL", "openai/whisper-small")
21
+ NLLB_MODEL = os.environ.get("NLLB_MODEL", "facebook/nllb-200-distilled-600M")
22
 
23
+ # Map frontend language names -> (whisper_lang_arg, nllb_src_lang_tag)
24
+ # Adjust tags if you have different NLLB language tags for specific dialects
25
  LANG_MAP = {
26
+ "akan": (None, "aka_Latn"),
 
27
  "hausa": ("ha", "hau_Latn"),
28
  "swahili": ("sw", "swh_Latn"),
29
  "french": ("fr", "fra_Latn"),
30
+ "arabic": ("ar", "arb_Arab"),
31
  "english": ("en", None),
32
  }
33
 
34
+ # Force CPU for free Spaces
35
+ DEVICE = torch.device("cpu")
36
 
37
  app = Flask(__name__)
38
  CORS(app)
39
 
40
+ # ---------- Model manager (lazy load) ----------
41
  class ModelManager:
42
  def __init__(self):
43
  self.whisper_processor = None
 
49
  def load(self):
50
  if self._loaded:
51
  return
52
+ print(f"Loading Whisper model: {WHISPER_MODEL}")
53
+ try:
54
+ self.whisper_processor = AutoProcessor.from_pretrained(WHISPER_MODEL)
55
+ self.whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL).to(DEVICE)
56
+ except Exception as e:
57
+ raise RuntimeError(f"Failed to load Whisper model ({WHISPER_MODEL}): {e}")
58
 
59
+ print(f"Loading NLLB tokenizer/model: {NLLB_MODEL}")
60
+ try:
61
+ self.nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
62
+ self.nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL).to(DEVICE)
63
+ except Exception as e:
64
+ raise RuntimeError(f"Failed to load NLLB model ({NLLB_MODEL}): {e}")
65
 
66
  self._loaded = True
67
+ print("Models loaded successfully.")
68
 
69
  def transcribe(self, audio_path, whisper_language_arg=None):
 
70
  if self.whisper_processor is None or self.whisper_model is None:
71
+ raise RuntimeError("Whisper model not loaded")
72
 
73
+ # Load audio and resample if needed
74
  waveform, sr = torchaudio.load(audio_path)
75
  if sr != 16000:
76
  waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
 
94
  return decoded[0].strip()
95
 
96
  def translate_to_english(self, src_text, nllb_src_lang_tag):
97
+ if not src_text:
98
+ return ""
99
  if not nllb_src_lang_tag:
100
+ # Already English or no NLLB mapping return source
101
  return src_text
102
+
103
  if self.nllb_tokenizer is None or self.nllb_model is None:
104
+ raise RuntimeError("NLLB model not loaded")
105
 
106
+ # Set tokenizer source lang if supported
107
  try:
108
  self.nllb_tokenizer.src_lang = nllb_src_lang_tag
109
  except Exception:
110
  pass
111
 
112
  inputs = self.nllb_tokenizer(src_text, return_tensors="pt").to(DEVICE)
113
+
114
+ # Attempt to get forced BOS token id for English; fallback to no forced token
115
+ forced_bos = None
116
+ try:
117
+ forced_bos = self.nllb_tokenizer.convert_tokens_to_ids("eng_Latn")
118
+ except Exception:
119
+ forced_bos = None
120
+
121
+ gen_kwargs = {
122
+ "max_length": 512,
123
+ "num_beams": 4,
124
+ "no_repeat_ngram_size": 2,
125
+ "early_stopping": True
126
+ }
127
+ if forced_bos is not None:
128
+ gen_kwargs["forced_bos_token_id"] = forced_bos
129
 
130
  with torch.no_grad():
131
+ translated_tokens = self.nllb_model.generate(**inputs, **gen_kwargs)
132
+
133
+ translated = self.nllb_tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
134
+ return translated.strip()
 
 
 
 
 
 
135
 
136
  model_manager = ModelManager()
137
 
 
139
  @app.route("/transcribe", methods=["POST"])
140
  def transcribe_endpoint():
141
  """
142
+ POST multipart/form-data:
143
+ - field 'audio': file (wav/mp3/ogg etc.)
144
+ - field 'language': string key (akan, hausa, swahili, french, arabic, english)
145
+ Response:
146
  - Plain text body with the translated text (Content-Type: text/plain)
147
  """
148
  if "audio" not in request.files:
 
150
 
151
  audio_file = request.files["audio"]
152
  language = (request.form.get("language") or request.args.get("language") or "english").lower()
153
+
154
  if language not in LANG_MAP:
155
  return Response(f"Unsupported language: {language}", status=400, mimetype="text/plain")
156
 
157
  whisper_lang_arg, nllb_src_tag = LANG_MAP[language]
158
 
159
+ # Load models (lazy)
160
  try:
161
  model_manager.load()
162
  except Exception as e:
163
  return Response(f"Model loading failed: {e}", status=500, mimetype="text/plain")
164
 
165
+ # Save audio to a temp file
166
+ tmp_fd, tmp_path = tempfile.mkstemp(suffix=Path(audio_file.filename).suffix or ".wav")
167
  os.close(tmp_fd)
168
  audio_file.save(tmp_path)
169
 
170
  try:
 
171
  transcription = model_manager.transcribe(tmp_path, whisper_language_arg=whisper_lang_arg)
172
  if not transcription:
173
+ # nothing transcribed -> return empty body (204)
174
+ return Response("", status=204, mimetype="text/plain")
175
 
 
176
  translation = model_manager.translate_to_english(transcription, nllb_src_tag)
 
 
177
  return Response(translation, status=200, mimetype="text/plain")
178
  except Exception as e:
179
  return Response(f"Processing failed: {e}", status=500, mimetype="text/plain")
 
183
  except Exception:
184
  pass
185
 
186
# ---------- Robust Gradio UI mount (optional) ----------
gradio_mounted = False
try:
    import gradio as gr
    import soundfile as sf
    import numpy as np

    def _ui_transcribe(audio, language):
        """Normalize the audio Gradio hands us, then transcribe + translate.

        Different Gradio versions return different shapes from the Audio
        component:
          - filepath (str)
          - tuple (sample_rate, ndarray)
          - bare ndarray (sample rate unknown)
          - file-like object with a .name attribute
        Everything is normalized to a wav file on disk.

        Returns (transcription, translation) strings.
        """
        if audio is None:
            return "No audio", ""

        audio_path = None
        created_tmp = False  # only delete files that *we* created below
        if isinstance(audio, str) and Path(audio).exists():
            audio_path = audio
        elif isinstance(audio, (tuple, list)) and len(audio) >= 2:
            sr, data = audio[0], audio[1]
            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            tmp.close()  # close before reopening by name (required on Windows)
            sf.write(tmp.name, data, sr)
            audio_path = tmp.name
            created_tmp = True
        elif isinstance(audio, np.ndarray) or hasattr(audio, "shape"):
            # Bare array: sample rate unknown -- assume 16 kHz (Whisper's
            # rate). TODO confirm against the Gradio version in use.
            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            tmp.close()
            sf.write(tmp.name, audio, 16000)
            audio_path = tmp.name
            created_tmp = True
        else:
            # getattr with a default never raises; no try/except needed.
            audio_path = getattr(audio, "name", None)

        if not audio_path:
            return "Unsupported audio format from Gradio", ""

        try:
            model_manager.load()
            whisper_lang, nllb_tag = LANG_MAP.get(language.lower(), (None, None))
            transcription = model_manager.transcribe(audio_path, whisper_language_arg=whisper_lang)
            translation = model_manager.translate_to_english(transcription, nllb_tag)
            return transcription, translation
        finally:
            # Remove only temp files created above. (The old "/tmp" substring
            # check could delete Gradio-owned or user-supplied files and
            # skipped cleanup entirely on non-/tmp platforms.)
            if created_tmp:
                try:
                    os.remove(audio_path)
                except OSError:
                    pass

    demo = None
    try:
        # Modern Gradio API.
        audio_component = gr.Audio(source="microphone", type="filepath")
        dropdown = gr.Dropdown(choices=list(LANG_MAP.keys()), value="english", label="Language")
        demo = gr.Interface(
            fn=_ui_transcribe,
            inputs=[audio_component, dropdown],
            outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Translation (English)")],
            title="Multilingual Transcriber (server)"
        )
    except TypeError:
        # Fallback for older Gradio versions that used gr.inputs/gr.outputs.
        try:
            audio_component = gr.inputs.Audio(source="microphone", type="filepath")
            dropdown = gr.inputs.Dropdown(choices=list(LANG_MAP.keys()), default="english")
            outputs = [gr.outputs.Textbox(), gr.outputs.Textbox()]
            demo = gr.Interface(fn=_ui_transcribe, inputs=[audio_component, dropdown], outputs=outputs,
                                title="Multilingual Transcriber (server)")
        except Exception as e:
            print("Gradio fallback constructor failed:", e)
            demo = None
    except Exception as e:
        print("Gradio constructor failed:", e)
        demo = None

    if demo is not None:
        try:
            # NOTE(review): gr.mount_gradio_app documents a FastAPI/Starlette
            # app; with a Flask app this likely raises and is caught here,
            # leaving only the REST API -- confirm against the gradio version.
            app = gr.mount_gradio_app(app, demo, path="/ui")
            gradio_mounted = True
            print("Gradio mounted at /ui")
        except Exception as e:
            print("Failed to mount Gradio app:", e)
            gradio_mounted = False
    else:
        print("Gradio demo not created; continuing without mounted UI.")

except Exception as e:
    print("Gradio UI unavailable or failed to mount:", e)
    gradio_mounted = False
280
 
281
# Root endpoint: send browsers to the Gradio UI when it mounted,
# otherwise answer with a plain-text status line.
@app.route("/")
def index():
    """Landing page for the service root."""
    if not gradio_mounted:
        return Response("Server running. REST endpoint available at /transcribe", status=200, mimetype="text/plain")
    return redirect("/ui")
287
 
288
if __name__ == "__main__":
    # Dev entry point; Hugging Face Spaces supplies PORT via the environment
    # (default 7860). Debug stays off for production parity.
    app.run(
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
        debug=False,
    )