STTR committed on
Commit
30d00e8
·
1 Parent(s): df4ae9b

Add SeamlessExpressive + SeamlessM4T v2 Large + NLLB-200

Browse files
Files changed (2) hide show
  1. README.md +5 -5
  2. app.py +85 -12
README.md CHANGED
@@ -11,14 +11,14 @@ license: mit
11
  hardware: t4-small
12
  ---
13
 
14
- # 🌍 STTR - Speech-to-Text & Translation API
15
 
16
- **Meta AI Models:**
17
  - 🎤 **SeamlessM4T v2 Large** - STT (101 languages)
18
  - 🌍 **NLLB-200** - Translation (200 languages + Darija!)
19
- - 🎭 **SeamlessExpressive** - Expressive Speech Translation
20
 
21
- **API Endpoints:**
22
  - `/stt` - Speech-to-Text
23
  - `/translate` - Text Translation
24
- - `/expressive` - Expressive Speech Translation
 
11
  hardware: t4-small
12
  ---
13
 
14
+ # 🌍 STTR - Speech & Translation API
15
 
16
+ ## Meta AI Models:
17
  - 🎤 **SeamlessM4T v2 Large** - STT (101 languages)
18
  - 🌍 **NLLB-200** - Translation (200 languages + Darija!)
19
+ - 🎭 **SeamlessExpressive** - Expressive Speech Translation (preserves tone!)
20
 
21
+ ## API Endpoints:
22
  - `/stt` - Speech-to-Text
23
  - `/translate` - Text Translation
24
+ - `/expressive` - Expressive Speech-to-Speech Translation
app.py CHANGED
@@ -3,10 +3,12 @@ from transformers import (
3
  AutoProcessor,
4
  SeamlessM4Tv2ForSpeechToText,
5
  AutoModelForSeq2SeqLM,
6
- AutoTokenizer
 
7
  )
8
  import torch
9
  import numpy as np
 
10
 
11
  # ============================================================
12
  # 🚀 Device Setup
@@ -19,24 +21,35 @@ print(f"🖥️ Device: {device}")
19
  # 📥 Load Models
20
  # ============================================================
21
 
22
- # SeamlessM4T v2 Large for STT
23
- print("📥 Loading SeamlessM4T v2 Large...")
24
  STT_MODEL = "facebook/seamless-m4t-v2-large"
25
  stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
26
  stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
27
- stt_model = stt_model.to(device)
28
- stt_model.eval()
29
  print("✅ SeamlessM4T v2 Large loaded!")
30
 
31
- # NLLB-200 for Translation
32
  print("📥 Loading NLLB-200...")
33
  NLLB_MODEL = "facebook/nllb-200-distilled-600M"
34
  nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
35
  nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
36
- nllb_model = nllb_model.to(device)
37
- nllb_model.eval()
38
  print("✅ NLLB-200 loaded!")
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  print("🎉 All models ready!")
41
 
42
  # ============================================================
@@ -55,11 +68,12 @@ STT_LANGS = {
55
  "English": "eng", "French": "fra", "Arabic": "arb", "Spanish": "spa",
56
  "German": "deu", "Italian": "ita", "Portuguese": "por", "Chinese": "cmn",
57
  "Japanese": "jpn", "Korean": "kor", "Russian": "rus", "Turkish": "tur",
58
- "Dutch": "nld", "Hindi": "hin",
59
  }
60
 
 
 
61
  # ============================================================
62
- # STT Function
63
  # ============================================================
64
 
65
  def stt(audio, src_lang):
@@ -97,7 +111,7 @@ def stt(audio, src_lang):
97
  return f"Error: {str(e)}"
98
 
99
  # ============================================================
100
- # Translation Function
101
  # ============================================================
102
 
103
  def translate(text, src_lang, tgt_lang):
@@ -126,13 +140,61 @@ def translate(text, src_lang, tgt_lang):
126
  except Exception as e:
127
  return f"Error: {str(e)}"
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  # ============================================================
130
  # Gradio Interface
131
  # ============================================================
132
 
133
  with gr.Blocks(title="STTR API", theme=gr.themes.Soft()) as demo:
134
  gr.Markdown("# 🌍 STTR - Speech & Translation API")
135
- gr.Markdown("**SeamlessM4T v2 Large** + **NLLB-200** (200 languages + Darija!)")
136
 
137
  with gr.Tab("🎤 Speech-to-Text"):
138
  stt_audio = gr.Audio(label="Audio", type="numpy")
@@ -149,5 +211,16 @@ with gr.Blocks(title="STTR API", theme=gr.themes.Soft()) as demo:
149
  trans_output = gr.Textbox(label="Translation", lines=3)
150
  trans_btn = gr.Button("🌍 Translate", variant="primary")
151
  trans_btn.click(translate, [trans_text, trans_src, trans_tgt], trans_output, api_name="translate")
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  demo.launch()
 
3
  AutoProcessor,
4
  SeamlessM4Tv2ForSpeechToText,
5
  AutoModelForSeq2SeqLM,
6
+ AutoTokenizer,
7
+ SeamlessM4Tv2Model,
8
  )
9
  import torch
10
  import numpy as np
11
+ import torchaudio
12
 
13
  # ============================================================
14
  # 🚀 Device Setup
 
21
  # 📥 Load Models
22
  # ============================================================
23
 
24
+ # 1. SeamlessM4T v2 Large for STT
25
+ print("📥 Loading SeamlessM4T v2 Large (STT)...")
26
  STT_MODEL = "facebook/seamless-m4t-v2-large"
27
  stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
28
  stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
29
+ stt_model = stt_model.to(device).eval()
 
30
  print("✅ SeamlessM4T v2 Large loaded!")
31
 
32
+ # 2. NLLB-200 for Translation
33
  print("📥 Loading NLLB-200...")
34
  NLLB_MODEL = "facebook/nllb-200-distilled-600M"
35
  nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
36
  nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
37
+ nllb_model = nllb_model.to(device).eval()
 
38
  print("✅ NLLB-200 loaded!")
39
 
40
+ # 3. SeamlessExpressive for Expressive Speech Translation
41
+ print("📥 Loading SeamlessExpressive...")
42
+ EXPRESSIVE_MODEL = "facebook/seamless-expressive"
43
+ try:
44
+ exp_processor = AutoProcessor.from_pretrained(EXPRESSIVE_MODEL)
45
+ exp_model = SeamlessM4Tv2Model.from_pretrained(EXPRESSIVE_MODEL)
46
+ exp_model = exp_model.to(device).eval()
47
+ EXPRESSIVE_AVAILABLE = True
48
+ print("✅ SeamlessExpressive loaded!")
49
+ except Exception as e:
50
+ EXPRESSIVE_AVAILABLE = False
51
+ print(f"⚠️ SeamlessExpressive not available: {e}")
52
+
53
  print("🎉 All models ready!")
54
 
55
  # ============================================================
 
68
  "English": "eng", "French": "fra", "Arabic": "arb", "Spanish": "spa",
69
  "German": "deu", "Italian": "ita", "Portuguese": "por", "Chinese": "cmn",
70
  "Japanese": "jpn", "Korean": "kor", "Russian": "rus", "Turkish": "tur",
 
71
  }
72
 
73
+ EXPRESSIVE_LANGS = ["English", "French", "German", "Spanish", "Italian", "Chinese"]
74
+
75
  # ============================================================
76
+ # STT Function (SeamlessM4T v2 Large)
77
  # ============================================================
78
 
79
  def stt(audio, src_lang):
 
111
  return f"Error: {str(e)}"
112
 
113
  # ============================================================
114
+ # Translation Function (NLLB-200)
115
  # ============================================================
116
 
117
  def translate(text, src_lang, tgt_lang):
 
140
  except Exception as e:
141
  return f"Error: {str(e)}"
142
 
143
+ # ============================================================
144
+ # Expressive Speech Translation (SeamlessExpressive)
145
+ # ============================================================
146
+
147
+ def expressive_translate(audio, src_lang, tgt_lang):
148
+ """Expressive Speech-to-Speech Translation"""
149
+ if not EXPRESSIVE_AVAILABLE:
150
+ return None, "SeamlessExpressive not available"
151
+
152
+ if audio is None:
153
+ return None, "No audio provided"
154
+
155
+ try:
156
+ if isinstance(audio, tuple):
157
+ sample_rate, audio_data = audio
158
+ audio_data = audio_data.astype(np.float32)
159
+ if np.abs(audio_data).max() > 1.0:
160
+ audio_data = audio_data / 32768.0
161
+ else:
162
+ return None, "Invalid audio format"
163
+
164
+ src_code = STT_LANGS.get(src_lang, "eng")
165
+ tgt_code = STT_LANGS.get(tgt_lang, "fra")
166
+
167
+ inputs = exp_processor(
168
+ audios=audio_data,
169
+ sampling_rate=sample_rate,
170
+ return_tensors="pt"
171
+ ).to(device)
172
+
173
+ with torch.no_grad():
174
+ output = exp_model.generate(
175
+ **inputs,
176
+ tgt_lang=tgt_code,
177
+ return_intermediate_token_ids=True
178
+ )
179
+
180
+ # Get audio output
181
+ audio_output = output.audio_sequences[0].cpu().numpy()
182
+
183
+ # Get text
184
+ text = exp_processor.decode(output.sequences[0].tolist(), skip_special_tokens=True)
185
+
186
+ return (16000, audio_output), text
187
+
188
+ except Exception as e:
189
+ return None, f"Error: {str(e)}"
190
+
191
  # ============================================================
192
  # Gradio Interface
193
  # ============================================================
194
 
195
  with gr.Blocks(title="STTR API", theme=gr.themes.Soft()) as demo:
196
  gr.Markdown("# 🌍 STTR - Speech & Translation API")
197
+ gr.Markdown("**Meta AI Models:** SeamlessM4T v2 Large + NLLB-200 + SeamlessExpressive")
198
 
199
  with gr.Tab("🎤 Speech-to-Text"):
200
  stt_audio = gr.Audio(label="Audio", type="numpy")
 
211
  trans_output = gr.Textbox(label="Translation", lines=3)
212
  trans_btn = gr.Button("🌍 Translate", variant="primary")
213
  trans_btn.click(translate, [trans_text, trans_src, trans_tgt], trans_output, api_name="translate")
214
+
215
+ with gr.Tab("🎭 Expressive (S2S)"):
216
+ gr.Markdown("**SeamlessExpressive** - Preserves tone, emotion & style!")
217
+ exp_audio = gr.Audio(label="Input Audio", type="numpy")
218
+ with gr.Row():
219
+ exp_src = gr.Dropdown(EXPRESSIVE_LANGS, label="From", value="English")
220
+ exp_tgt = gr.Dropdown(EXPRESSIVE_LANGS, label="To", value="French")
221
+ exp_output_audio = gr.Audio(label="Translated Audio")
222
+ exp_output_text = gr.Textbox(label="Translated Text")
223
+ exp_btn = gr.Button("🎭 Translate with Expression", variant="primary")
224
+ exp_btn.click(expressive_translate, [exp_audio, exp_src, exp_tgt], [exp_output_audio, exp_output_text], api_name="expressive")
225
 
226
  demo.launch()