Spaces:

TGPro1
/

STTR

Sleeping

App Files Files Community

STTR committed on Jan 4

Commit

df4ae9b

1 Parent(s): 87733fb

Add SeamlessM4T v2 Large STT + NLLB-200 with T4 GPU

Browse files

Files changed (3) hide show

README.md +19 -7
app.py +95 -94
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -1,12 +1,24 @@
 ---
-title: STTR
-emoji: 👁
-colorFrom: gray
-colorTo: blue
 sdk: gradio
-sdk_version: 6.2.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: STTR - Speech Translation
+emoji: 🌍
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: "4.44.0"
 app_file: app.py
+pinned: true
+license: mit
+hardware: t4-small
 ---
+# 🌍 STTR - Speech-to-Text & Translation API
+**Meta AI Models:**
+- 🎤 **SeamlessM4T v2 Large** - STT (101 languages)
+- 🌍 **NLLB-200** - Translation (200 languages + Darija!)
+- 🎭 **SeamlessExpressive** - Expressive Speech Translation (planned; not loaded by app.py in this commit)
+**API Endpoints:**
+- `/stt` - Speech-to-Text
+- `/translate` - Text Translation
+- `/expressive` - Expressive Speech Translation (planned; app.py in this commit exposes only `/stt` and `/translate`)

app.py CHANGED Viewed

@@ -1,28 +1,41 @@
 import gradio as gr
-from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText, AutoModelForSeq2SeqLM, AutoTokenizer
 import torch
 import numpy as np
 # ============================================================
-# 🚀 Load Models
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Device: {device}")
 # SeamlessM4T v2 Large for STT
 print("📥 Loading SeamlessM4T v2 Large...")
-stt_model_name = "facebook/seamless-m4t-v2-large"
-stt_processor = AutoProcessor.from_pretrained(stt_model_name)
-stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(stt_model_name).to(device)
-print("✅ SeamlessM4T v2 Large loaded")
 # NLLB-200 for Translation
 print("📥 Loading NLLB-200...")
-nllb_model_name = "facebook/nllb-200-distilled-600M"
-nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_name)
-nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_name).to(device)
-print("✅ NLLB-200 loaded")
 print("🎉 All models ready!")
@@ -31,122 +44,110 @@ print("🎉 All models ready!")
 # ============================================================
 NLLB_LANGS = {
-    "English": "eng_Latn",
-    "French": "fra_Latn",
-    "Arabic": "arb_Arab",
-    "Moroccan Arabic": "ary_Arab",
-    "Spanish": "spa_Latn",
-    "German": "deu_Latn",
-    "Italian": "ita_Latn",
-    "Portuguese": "por_Latn",
-    "Chinese": "zho_Hans",
-    "Japanese": "jpn_Jpan",
-    "Korean": "kor_Hang",
-    "Russian": "rus_Cyrl",
-    "Turkish": "tur_Latn",
-    "Dutch": "nld_Latn",
-    "Hindi": "hin_Deva",
 }
 STT_LANGS = {
-    "English": "eng",
-    "French": "fra",
-    "Arabic": "arb",
-    "Spanish": "spa",
-    "German": "deu",
-    "Italian": "ita",
-    "Portuguese": "por",
-    "Chinese": "cmn",
-    "Japanese": "jpn",
-    "Korean": "kor",
-    "Russian": "rus",
-    "Turkish": "tur",
-    "Dutch": "nld",
-    "Hindi": "hin",
 }
 # ============================================================
-# STT Function (SeamlessM4T v2 Large)
 # ============================================================
 def stt(audio, src_lang):
     """Speech-to-Text using SeamlessM4T v2 Large"""
     if audio is None:
-        return ""
-    # Handle tuple input from Gradio
-    if isinstance(audio, tuple):
-        sample_rate, audio_data = audio
-        audio_data = audio_data.astype(np.float32)
-        if audio_data.max() > 1.0:
-            audio_data = audio_data / 32768.0
-    else:
-        return "Error: Invalid audio format"
-    src_code = STT_LANGS.get(src_lang, "eng")
-    inputs = stt_processor(
-        audios=audio_data,
-        sampling_rate=sample_rate,
-        return_tensors="pt"
-    ).to(device)
-    with torch.no_grad():
-        output_tokens = stt_model.generate(
-            **inputs,
-            tgt_lang=src_code,
-            generate_speech=False
-        )
-    text = stt_processor.decode(output_tokens[0], skip_special_tokens=True)
-    return text
 # ============================================================
-# Translation Function (NLLB-200)
 # ============================================================
 def translate(text, src_lang, tgt_lang):
     """Translation using NLLB-200"""
-    if not text.strip():
         return ""
-    src_code = NLLB_LANGS.get(src_lang, "eng_Latn")
-    tgt_code = NLLB_LANGS.get(tgt_lang, "fra_Latn")
-    nllb_tokenizer.src_lang = src_code
-    inputs = nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
-    forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_code)
-    with torch.no_grad():
-        outputs = nllb_model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=512, num_beams=5)
-    return nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
 # ============================================================
 # Gradio Interface
 # ============================================================
-with gr.Blocks(title="STTR - Speech & Translation API") as demo:
-    gr.Markdown("# 🌍 STTR - Speech-to-Text & Translation API")
-    gr.Markdown("**SeamlessM4T v2 Large** for STT + **NLLB-200** for Translation")
-    with gr.Tab("🎤 STT (Speech-to-Text)"):
-        with gr.Row():
-            stt_audio = gr.Audio(label="Record/Upload Audio", type="numpy")
-            stt_lang = gr.Dropdown(list(STT_LANGS.keys()), label="Language", value="English")
         stt_output = gr.Textbox(label="Transcription", lines=3)
         stt_btn = gr.Button("🎤 Transcribe", variant="primary")
-        stt_btn.click(stt, inputs=[stt_audio, stt_lang], outputs=stt_output, api_name="stt")
     with gr.Tab("🌍 Translation"):
         with gr.Row():
-            trans_text = gr.Textbox(label="Text to translate", lines=3)
-        with gr.Row():
-            trans_src = gr.Dropdown(list(NLLB_LANGS.keys()), label="Source", value="English")
-            trans_tgt = gr.Dropdown(list(NLLB_LANGS.keys()), label="Target", value="French")
         trans_output = gr.Textbox(label="Translation", lines=3)
         trans_btn = gr.Button("🌍 Translate", variant="primary")
-        trans_btn.click(translate, inputs=[trans_text, trans_src, trans_tgt], outputs=trans_output, api_name="translate")
 demo.launch()

 import gradio as gr
+from transformers import (
+    AutoProcessor,
+    SeamlessM4Tv2ForSpeechToText,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer
+)
 import torch
 import numpy as np
 # ============================================================
+# 🚀 Device Setup
 # ============================================================
+# Run on the GPU when PyTorch can see one, otherwise fall back to CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Device: {device}")
+# ============================================================
+# 📥 Load Models
+# ============================================================
+# Both models are loaded once at import time and reused by every request.
 # SeamlessM4T v2 Large for STT
 print("📥 Loading SeamlessM4T v2 Large...")
+STT_MODEL = "facebook/seamless-m4t-v2-large"
+stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
+stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
+stt_model = stt_model.to(device)
+# Inference only: eval() switches off training-time behaviour such as dropout.
+stt_model.eval()
+print("✅ SeamlessM4T v2 Large loaded!")
 # NLLB-200 for Translation
 print("📥 Loading NLLB-200...")
+NLLB_MODEL = "facebook/nllb-200-distilled-600M"
+nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
+nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
+nllb_model = nllb_model.to(device)
+# Inference only, same as the STT model above.
+nllb_model.eval()
+print("✅ NLLB-200 loaded!")
 print("🎉 All models ready!")
 # ============================================================
+# UI language name -> NLLB-200 language code (used by translate()).
 NLLB_LANGS = {
+    "English": "eng_Latn", "French": "fra_Latn", "Arabic": "arb_Arab",
+    "Moroccan Arabic": "ary_Arab", "Spanish": "spa_Latn", "German": "deu_Latn",
+    "Italian": "ita_Latn", "Portuguese": "por_Latn", "Chinese": "zho_Hans",
+    "Japanese": "jpn_Jpan", "Korean": "kor_Hang", "Russian": "rus_Cyrl",
+    "Turkish": "tur_Latn", "Dutch": "nld_Latn", "Hindi": "hin_Deva",
 }
+# UI language name -> SeamlessM4T language code (passed as tgt_lang in stt()).
+# Unlike NLLB_LANGS, this table has no "Moroccan Arabic" entry.
 STT_LANGS = {
+    "English": "eng", "French": "fra", "Arabic": "arb", "Spanish": "spa",
+    "German": "deu", "Italian": "ita", "Portuguese": "por", "Chinese": "cmn",
+    "Japanese": "jpn", "Korean": "kor", "Russian": "rus", "Turkish": "tur",
+    "Dutch": "nld", "Hindi": "hin",
 }
 # ============================================================
+# STT Function
 # ============================================================
 def stt(audio, src_lang):
     """Speech-to-Text using SeamlessM4T v2 Large"""
     if audio is None:
+        return "No audio provided"
+    try:
+        if isinstance(audio, tuple):
+            sample_rate, audio_data = audio
+            audio_data = audio_data.astype(np.float32)
+            if np.abs(audio_data).max() > 1.0:
+                audio_data = audio_data / 32768.0
+        else:
+            return "Invalid audio format"
+        src_code = STT_LANGS.get(src_lang, "eng")
+        inputs = stt_processor(
+            audios=audio_data,
+            sampling_rate=sample_rate,
+            return_tensors="pt"
+        ).to(device)
+        with torch.no_grad():
+            output_tokens = stt_model.generate(
+                **inputs,
+                tgt_lang=src_code,
+                generate_speech=False
+            )
+        text = stt_processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
+        return text
+    except Exception as e:
+        return f"Error: {str(e)}"
 # ============================================================
+# Translation Function
 # ============================================================
 def translate(text, src_lang, tgt_lang):
+    """Translate text between two languages with NLLB-200.
+
+    Args:
+        text: Text to translate; ``None`` or whitespace-only input returns ``""``.
+        src_lang: Source language display name (a key of ``NLLB_LANGS``);
+            unknown names fall back to ``"eng_Latn"``.
+        tgt_lang: Target language display name (a key of ``NLLB_LANGS``);
+            unknown names fall back to ``"fra_Latn"``.
+
+    Returns:
+        The translated string, or ``"Error: <message>"`` if anything raises.
+    """
+    if not text or not text.strip():
         return ""
+    try:
+        src_code = NLLB_LANGS.get(src_lang, "eng_Latn")
+        tgt_code = NLLB_LANGS.get(tgt_lang, "fra_Latn")
+        # The tokenizer is told which source language it is encoding.
+        nllb_tokenizer.src_lang = src_code
+        # Input longer than 512 tokens is truncated rather than rejected.
+        inputs = nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+        # The target language is selected by forcing its language-code token
+        # as the first generated token.
+        forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_code)
+        with torch.no_grad():
+            outputs = nllb_model.generate(
+                **inputs,
+                forced_bos_token_id=forced_bos_token_id,
+                max_length=512,
+                num_beams=5
+            )
+        return nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
+    except Exception as e:
+        # UI/API boundary: report the failure as text instead of raising.
+        return f"Error: {str(e)}"
 # ============================================================
 # Gradio Interface
 # ============================================================
+# Two-tab UI; each button's api_name ("stt", "translate") names its handler's API endpoint.
+with gr.Blocks(title="STTR API", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🌍 STTR - Speech & Translation API")
+    gr.Markdown("**SeamlessM4T v2 Large** + **NLLB-200** (200 languages + Darija!)")
+    with gr.Tab("🎤 Speech-to-Text"):
+        # type="numpy" hands stt() a (sample_rate, ndarray) tuple.
+        stt_audio = gr.Audio(label="Audio", type="numpy")
+        stt_lang = gr.Dropdown(list(STT_LANGS.keys()), label="Language", value="English")
         stt_output = gr.Textbox(label="Transcription", lines=3)
         stt_btn = gr.Button("🎤 Transcribe", variant="primary")
+        stt_btn.click(stt, [stt_audio, stt_lang], stt_output, api_name="stt")
     with gr.Tab("🌍 Translation"):
+        trans_text = gr.Textbox(label="Text", lines=3)
         with gr.Row():
+            # Both dropdowns list the NLLB_LANGS display names that translate() looks up.
+            trans_src = gr.Dropdown(list(NLLB_LANGS.keys()), label="From", value="English")
+            trans_tgt = gr.Dropdown(list(NLLB_LANGS.keys()), label="To", value="French")
         trans_output = gr.Textbox(label="Translation", lines=3)
         trans_btn = gr.Button("🌍 Translate", variant="primary")
+        trans_btn.click(translate, [trans_text, trans_src, trans_tgt], trans_output, api_name="translate")
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,8 +1,9 @@
 transformers>=4.40.0
 torch>=2.0.0
 sentencepiece
 protobuf
 gradio>=4.0.0
 numpy
 scipy
-torchaudio

 transformers>=4.40.0
 torch>=2.0.0
+torchaudio
 sentencepiece
 protobuf
 gradio>=4.0.0
 numpy
 scipy
+accelerate