Spaces:

TGPro1
/

STTR

Sleeping

App Files Files Community

STTR commited on Jan 4

Commit

40e1a06

1 Parent(s): 30d00e8

Add complete Gradio UI with voice translation

Browse files

Files changed (2) hide show

README.md +70 -12
app.py +196 -128

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: STTR - Speech Translation
 emoji: 🌍
-colorFrom: blue
-colorTo: purple
 sdk: gradio
 sdk_version: "4.44.0"
 app_file: app.py
@@ -11,14 +11,72 @@ license: mit
 hardware: t4-small
 ---
-# 🌍 STTR - Speech & Translation API
-## Meta AI Models:
-- 🎤 **SeamlessM4T v2 Large** - STT (101 languages)
-- 🌍 **NLLB-200** - Translation (200 languages + Darija!)
-- 🎭 **SeamlessExpressive** - Expressive Speech Translation (preserves tone!)
-## API Endpoints:
-- `/stt` - Speech-to-Text
-- `/translate` - Text Translation
-- `/expressive` - Expressive Speech-to-Speech Translation

 ---
+title: Instant Translat - AI Voice Translation
 emoji: 🌍
+colorFrom: purple
+colorTo: blue
 sdk: gradio
 sdk_version: "4.44.0"
 app_file: app.py
 hardware: t4-small
 ---
+# 🌍 Instant Translat - AI Voice Translation
+**Real-time voice translation with AI - 200+ languages including Moroccan Darija**
+## ✨ Features
+- 🎤 **Speech-to-Text** - SeamlessM4T v2 Large (101 languages)
+- 🌍 **Translation** - NLLB-200 (200 languages + Moroccan Darija)
+- 🔊 **Text-to-Speech** - Fish Audio S1 (Natural voice)
+- 🎭 **Voice Cloning** - Hear translation in your own voice!
+- 🧠 **Smart Mode** - Auto language detection
+## 🌍 Supported Languages
+- 🇲🇦 **Moroccan Arabic (Darija)** - الدارجة المغربية
+- 🇸🇦 Arabic (MSA)
+- 🇫🇷 French
+- 🇬🇧 English
+- 🇪🇸 Spanish
+- 🇩🇪 German
+- 🇮🇹 Italian
+- 🇵🇹 Portuguese
+- 🇨🇳 Chinese
+- 🇯🇵 Japanese
+- 🇰🇷 Korean
+- 🇷🇺 Russian
+- And 190+ more languages!
+## 🎯 How to Use
+1. **Select Languages**: Choose your source and target languages
+2. **Record**: Click the microphone button and speak clearly
+3. **Translate**: Click "Translate" button
+4. **Listen**: Hear the translation with natural voice
+5. **Voice Clone**: Enable to hear translation in your own voice!
+## 🔧 Technology
+- **STT**: Meta's SeamlessM4T v2 Large
+- **Translation**: Meta's NLLB-200
+- **TTS**: Fish Audio S1
+- **Voice Cloning**: Fish Audio API
+- **Framework**: Gradio + PyTorch
+## 🔒 Privacy & Security
+- ✅ No data stored
+- ✅ Real-time processing
+- ✅ Secure API calls
+- ✅ Open source
+## 📱 Use Cases
+- 🗣️ Real-time conversations
+- 📚 Language learning
+- 🌐 Travel assistance
+- 💼 Business meetings
+- 🎓 Education
+## 🚀 Coming Soon
+- 💳 Premium features with Apple Pay & Google Pay
+- 📱 Mobile app (iOS & Android)
+- 🎯 More languages
+- 🔊 More voice options
+---
+**Made with ❤️ using Meta AI models**

app.py CHANGED Viewed

@@ -4,32 +4,31 @@ from transformers import (
     SeamlessM4Tv2ForSpeechToText,
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
-    SeamlessM4Tv2Model,
 )
 import torch
 import numpy as np
-import torchaudio
 # ============================================================
-# 🚀 Device Setup
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Device: {device}")
 # ============================================================
-# 📥 Load Models
 # ============================================================
-# 1. SeamlessM4T v2 Large for STT
-print("📥 Loading SeamlessM4T v2 Large (STT)...")
 STT_MODEL = "facebook/seamless-m4t-v2-large"
 stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
 stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
 stt_model = stt_model.to(device).eval()
 print("✅ SeamlessM4T v2 Large loaded!")
-# 2. NLLB-200 for Translation
 print("📥 Loading NLLB-200...")
 NLLB_MODEL = "facebook/nllb-200-distilled-600M"
 nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
@@ -37,60 +36,67 @@ nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
 nllb_model = nllb_model.to(device).eval()
 print("✅ NLLB-200 loaded!")
-# 3. SeamlessExpressive for Expressive Speech Translation
-print("📥 Loading SeamlessExpressive...")
-EXPRESSIVE_MODEL = "facebook/seamless-expressive"
-try:
-    exp_processor = AutoProcessor.from_pretrained(EXPRESSIVE_MODEL)
-    exp_model = SeamlessM4Tv2Model.from_pretrained(EXPRESSIVE_MODEL)
-    exp_model = exp_model.to(device).eval()
-    EXPRESSIVE_AVAILABLE = True
-    print("✅ SeamlessExpressive loaded!")
-except Exception as e:
-    EXPRESSIVE_AVAILABLE = False
-    print(f"⚠️ SeamlessExpressive not available: {e}")
 print("🎉 All models ready!")
 # ============================================================
 # Language Codes
 # ============================================================
 NLLB_LANGS = {
-    "English": "eng_Latn", "French": "fra_Latn", "Arabic": "arb_Arab",
-    "Moroccan Arabic": "ary_Arab", "Spanish": "spa_Latn", "German": "deu_Latn",
-    "Italian": "ita_Latn", "Portuguese": "por_Latn", "Chinese": "zho_Hans",
-    "Japanese": "jpn_Jpan", "Korean": "kor_Hang", "Russian": "rus_Cyrl",
-    "Turkish": "tur_Latn", "Dutch": "nld_Latn", "Hindi": "hin_Deva",
 }
 STT_LANGS = {
-    "English": "eng", "French": "fra", "Arabic": "arb", "Spanish": "spa",
-    "German": "deu", "Italian": "ita", "Portuguese": "por", "Chinese": "cmn",
-    "Japanese": "jpn", "Korean": "kor", "Russian": "rus", "Turkish": "tur",
 }
-EXPRESSIVE_LANGS = ["English", "French", "German", "Spanish", "Italian", "Chinese"]
 # ============================================================
-# STT Function (SeamlessM4T v2 Large)
 # ============================================================
-def stt(audio, src_lang):
-    """Speech-to-Text using SeamlessM4T v2 Large"""
     if audio is None:
-        return "No audio provided"
     try:
         if isinstance(audio, tuple):
             sample_rate, audio_data = audio
             audio_data = audio_data.astype(np.float32)
             if np.abs(audio_data).max() > 1.0:
                 audio_data = audio_data / 32768.0
         else:
-            return "Invalid audio format"
-        src_code = STT_LANGS.get(src_lang, "eng")
         inputs = stt_processor(
             audios=audio_data,
@@ -105,28 +111,16 @@ def stt(audio, src_lang):
                 generate_speech=False
             )
-        text = stt_processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
-        return text
-    except Exception as e:
-        return f"Error: {str(e)}"
-# ============================================================
-# Translation Function (NLLB-200)
-# ============================================================
-def translate(text, src_lang, tgt_lang):
-    """Translation using NLLB-200"""
-    if not text or not text.strip():
-        return ""
-    try:
-        src_code = NLLB_LANGS.get(src_lang, "eng_Latn")
-        tgt_code = NLLB_LANGS.get(tgt_lang, "fra_Latn")
-        nllb_tokenizer.src_lang = src_code
-        inputs = nllb_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
-        forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_code)
         with torch.no_grad():
             outputs = nllb_model.generate(
@@ -136,91 +130,165 @@ def translate(text, src_lang, tgt_lang):
                 num_beams=5
             )
-        return nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    except Exception as e:
-        return f"Error: {str(e)}"
-# ============================================================
-# Expressive Speech Translation (SeamlessExpressive)
-# ============================================================
-def expressive_translate(audio, src_lang, tgt_lang):
-    """Expressive Speech-to-Speech Translation"""
-    if not EXPRESSIVE_AVAILABLE:
-        return None, "SeamlessExpressive not available"
-    if audio is None:
-        return None, "No audio provided"
     try:
-        if isinstance(audio, tuple):
-            sample_rate, audio_data = audio
-            audio_data = audio_data.astype(np.float32)
-            if np.abs(audio_data).max() > 1.0:
-                audio_data = audio_data / 32768.0
-        else:
-            return None, "Invalid audio format"
-        src_code = STT_LANGS.get(src_lang, "eng")
-        tgt_code = STT_LANGS.get(tgt_lang, "fra")
-        inputs = exp_processor(
-            audios=audio_data,
-            sampling_rate=sample_rate,
-            return_tensors="pt"
-        ).to(device)
-        with torch.no_grad():
-            output = exp_model.generate(
-                **inputs,
-                tgt_lang=tgt_code,
-                return_intermediate_token_ids=True
             )
-        # Get audio output
-        audio_output = output.audio_sequences[0].cpu().numpy()
-        # Get text
-        text = exp_processor.decode(output.sequences[0].tolist(), skip_special_tokens=True)
-        return (16000, audio_output), text
-    except Exception as e:
-        return None, f"Error: {str(e)}"
 # ============================================================
 # Gradio Interface
 # ============================================================
-with gr.Blocks(title="STTR API", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🌍 STTR - Speech & Translation API")
-    gr.Markdown("**Meta AI Models:** SeamlessM4T v2 Large + NLLB-200 + SeamlessExpressive")
-    with gr.Tab("🎤 Speech-to-Text"):
-        stt_audio = gr.Audio(label="Audio", type="numpy")
-        stt_lang = gr.Dropdown(list(STT_LANGS.keys()), label="Language", value="English")
-        stt_output = gr.Textbox(label="Transcription", lines=3)
-        stt_btn = gr.Button("🎤 Transcribe", variant="primary")
-        stt_btn.click(stt, [stt_audio, stt_lang], stt_output, api_name="stt")
-    with gr.Tab("🌍 Translation"):
-        trans_text = gr.Textbox(label="Text", lines=3)
-        with gr.Row():
-            trans_src = gr.Dropdown(list(NLLB_LANGS.keys()), label="From", value="English")
-            trans_tgt = gr.Dropdown(list(NLLB_LANGS.keys()), label="To", value="French")
-        trans_output = gr.Textbox(label="Translation", lines=3)
-        trans_btn = gr.Button("🌍 Translate", variant="primary")
-        trans_btn.click(translate, [trans_text, trans_src, trans_tgt], trans_output, api_name="translate")
-    with gr.Tab("🎭 Expressive (S2S)"):
-        gr.Markdown("**SeamlessExpressive** - Preserves tone, emotion & style!")
-        exp_audio = gr.Audio(label="Input Audio", type="numpy")
-        with gr.Row():
-            exp_src = gr.Dropdown(EXPRESSIVE_LANGS, label="From", value="English")
-            exp_tgt = gr.Dropdown(EXPRESSIVE_LANGS, label="To", value="French")
-        exp_output_audio = gr.Audio(label="Translated Audio")
-        exp_output_text = gr.Textbox(label="Translated Text")
-        exp_btn = gr.Button("🎭 Translate with Expression", variant="primary")
-        exp_btn.click(expressive_translate, [exp_audio, exp_src, exp_tgt], [exp_output_audio, exp_output_text], api_name="expressive")
-demo.launch()

     SeamlessM4Tv2ForSpeechToText,
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
 )
 import torch
 import numpy as np
+import requests
+import os
 # ============================================================
+# Device Setup
 # ============================================================
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🖥️ Device: {device}")
 # ============================================================
+# Load Models
 # ============================================================
+# SeamlessM4T v2 Large for STT
+print("📥 Loading SeamlessM4T v2 Large...")
 STT_MODEL = "facebook/seamless-m4t-v2-large"
 stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
 stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
 stt_model = stt_model.to(device).eval()
 print("✅ SeamlessM4T v2 Large loaded!")
+# NLLB-200 for Translation
 print("📥 Loading NLLB-200...")
 NLLB_MODEL = "facebook/nllb-200-distilled-600M"
 nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
 nllb_model = nllb_model.to(device).eval()
 print("✅ NLLB-200 loaded!")
 print("🎉 All models ready!")
 # ============================================================
 # Language Codes
 # ============================================================
 NLLB_LANGS = {
+    "🇲🇦 Moroccan Arabic (Darija)": "ary_Arab",
+    "🇸🇦 Arabic": "arb_Arab",
+    "🇫🇷 French": "fra_Latn",
+    "🇬🇧 English": "eng_Latn",
+    "🇪🇸 Spanish": "spa_Latn",
+    "🇩🇪 German": "deu_Latn",
+    "🇮🇹 Italian": "ita_Latn",
+    "🇵🇹 Portuguese": "por_Latn",
+    "🇨🇳 Chinese": "zho_Hans",
+    "🇯🇵 Japanese": "jpn_Jpan",
+    "🇰🇷 Korean": "kor_Hang",
+    "🇷🇺 Russian": "rus_Cyrl",
+    "🇹🇷 Turkish": "tur_Latn",
+    "🇳🇱 Dutch": "nld_Latn",
+    "🇮🇳 Hindi": "hin_Deva",
 }
 STT_LANGS = {
+    "🇲🇦 Moroccan Arabic (Darija)": "arb",
+    "🇸🇦 Arabic": "arb",
+    "🇫🇷 French": "fra",
+    "🇬🇧 English": "eng",
+    "🇪🇸 Spanish": "spa",
+    "🇩🇪 German": "deu",
+    "🇮🇹 Italian": "ita",
+    "🇵🇹 Portuguese": "por",
+    "🇨🇳 Chinese": "cmn",
+    "🇯🇵 Japanese": "jpn",
+    "🇰🇷 Korean": "kor",
+    "🇷🇺 Russian": "rus",
 }
+# Fish Audio API
+FISH_AUDIO_API_KEY = os.environ.get('FISH_AUDIO_API_KEY', '')
 # ============================================================
+# Functions
 # ============================================================
+def translate_audio(audio, source_lang, target_lang, enable_voice_clone):
+    """Complete translation pipeline"""
     if audio is None:
+        return None, "❌ Please record audio first"
     try:
+        # 1. STT
         if isinstance(audio, tuple):
             sample_rate, audio_data = audio
             audio_data = audio_data.astype(np.float32)
             if np.abs(audio_data).max() > 1.0:
                 audio_data = audio_data / 32768.0
         else:
+            return None, "❌ Invalid audio format"
+        src_code = STT_LANGS.get(source_lang, "eng")
         inputs = stt_processor(
             audios=audio_data,
                 generate_speech=False
             )
+        transcript = stt_processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)
+        # 2. Translation
+        src_nllb = NLLB_LANGS.get(source_lang, "eng_Latn")
+        tgt_nllb = NLLB_LANGS.get(target_lang, "fra_Latn")
+        nllb_tokenizer.src_lang = src_nllb
+        inputs = nllb_tokenizer(transcript, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
+        forced_bos_token_id = nllb_tokenizer.convert_tokens_to_ids(tgt_nllb)
         with torch.no_grad():
             outputs = nllb_model.generate(
                 num_beams=5
             )
+        translation = nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # 3. TTS with Fish Audio
+        tts_audio = None
+        if FISH_AUDIO_API_KEY:
+            tts_audio = generate_tts(translation, enable_voice_clone, audio if enable_voice_clone else None)
+        result_text = f"""
+### 🎤 {source_lang}
+{transcript}
+### 🌍 {target_lang}
+{translation}
+"""
+        return tts_audio, result_text
+    except Exception as e:
+        return None, f"❌ Error: {str(e)}"
+def generate_tts(text, clone_voice=False, reference_audio=None):
+    """Generate TTS using Fish Audio"""
+    if not FISH_AUDIO_API_KEY:
+        return None
     try:
+        headers = {'Authorization': f'Bearer {FISH_AUDIO_API_KEY}'}
+        if clone_voice and reference_audio:
+            # Voice cloning
+            import tempfile
+            import scipy.io.wavfile as wavfile
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as f:
+                wavfile.write(f.name, reference_audio[0], reference_audio[1])
+                audio_path = f.name
+            with open(audio_path, 'rb') as f:
+                files = {'reference_audio': ('ref.wav', f.read(), 'audio/wav')}
+            data = {
+                'text': text,
+                'format': 'mp3',
+                'mp3_bitrate': '192',
+                'latency': 'balanced',
+                'normalize': 'true',
+            }
+            response = requests.post(
+                'https://api.fish.audio/v1/tts',
+                headers=headers,
+                files=files,
+                data=data,
+                timeout=120
+            )
+            os.remove(audio_path)
+        else:
+            # Standard TTS
+            payload = {
+                'text': text,
+                'format': 'mp3',
+                'mp3_bitrate': 192,
+            }
+            response = requests.post(
+                'https://api.fish.audio/v1/tts',
+                headers=headers,
+                json=payload,
+                timeout=60
             )
+        if response.status_code == 200:
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as f:
+                f.write(response.content)
+                return f.name
+        return None
+    except:
+        return None
 # ============================================================
 # Gradio Interface
 # ============================================================
+with gr.Blocks(theme=gr.themes.Soft(), title="Instant Translat") as demo:
+    gr.Markdown("""
+    # 🌍 Instant Translat - AI Voice Translation
+    **Real-time voice translation powered by Meta AI**
+    - 🎤 **STT**: SeamlessM4T v2 Large (101 languages)
+    - 🌍 **Translation**: NLLB-200 (200 languages + Darija)
+    - 🔊 **TTS**: Fish Audio S1 (Natural voice)
+    - 🎭 **Voice Cloning**: Your voice in any language
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                label="🎤 Record Your Voice",
+                type="numpy",
+                sources=["microphone"]
+            )
+            source_lang = gr.Dropdown(
+                choices=list(NLLB_LANGS.keys()),
+                value="🇲🇦 Moroccan Arabic (Darija)",
+                label="🗣️ Source Language"
+            )
+            target_lang = gr.Dropdown(
+                choices=list(NLLB_LANGS.keys()),
+                value="🇬🇧 English",
+                label="🎯 Target Language"
+            )
+            voice_clone = gr.Checkbox(
+                label="🎭 Clone Voice (Use your voice for translation)",
+                value=True
+            )
+            translate_btn = gr.Button(
+                "🌍 Translate",
+                variant="primary",
+                size="lg"
+            )
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="🔊 Translation Audio")
+            text_output = gr.Markdown(label="📝 Translation Text")
+    translate_btn.click(
+        translate_audio,
+        inputs=[audio_input, source_lang, target_lang, voice_clone],
+        outputs=[audio_output, text_output]
+    )
+    gr.Markdown("""
+    ## 🎯 How to Use
+    1. **Select Languages**: Choose your source and target languages
+    2. **Record**: Click the microphone and speak clearly
+    3. **Translate**: Click the translate button
+    4. **Listen**: Hear the translation in natural voice (or your cloned voice!)
+    ## 🌍 Supported Languages
+    - 🇲🇦 **Moroccan Darija** (Moroccan Arabic)
+    - 🇸🇦 Arabic (MSA)
+    - 🇫🇷 French
+    - 🇬🇧 English
+    - 🇪🇸 Spanish
+    - 🇩🇪 German
+    - And 190+ more languages!
+    ## 🔒 Privacy
+    - No data is stored
+    - Real-time processing
+    - Secure API calls
+    """)
+if __name__ == "__main__":
+    demo.launch()