STTR
committed on
Commit
·
2ceddcf
1
Parent(s):
40e1a06
Add beautiful custom theme and CSS
Browse files
app.py
CHANGED
|
@@ -20,7 +20,6 @@ print(f"🖥️ Device: {device}")
|
|
| 20 |
# Load Models
|
| 21 |
# ============================================================
|
| 22 |
|
| 23 |
-
# SeamlessM4T v2 Large for STT
|
| 24 |
print("๐ฅ Loading SeamlessM4T v2 Large...")
|
| 25 |
STT_MODEL = "facebook/seamless-m4t-v2-large"
|
| 26 |
stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
|
|
@@ -28,7 +27,6 @@ stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(STT_MODEL)
|
|
| 28 |
stt_model = stt_model.to(device).eval()
|
| 29 |
print("✅ SeamlessM4T v2 Large loaded!")
|
| 30 |
|
| 31 |
-
# NLLB-200 for Translation
|
| 32 |
print("๐ฅ Loading NLLB-200...")
|
| 33 |
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
|
| 34 |
nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
|
|
@@ -36,8 +34,6 @@ nllb_model = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
|
|
| 36 |
nllb_model = nllb_model.to(device).eval()
|
| 37 |
print("✅ NLLB-200 loaded!")
|
| 38 |
|
| 39 |
-
print("๐ All models ready!")
|
| 40 |
-
|
| 41 |
# ============================================================
|
| 42 |
# Language Codes
|
| 43 |
# ============================================================
|
|
@@ -54,9 +50,6 @@ NLLB_LANGS = {
|
|
| 54 |
"๐ฏ๐ต Japanese": "jpn_Jpan",
|
| 55 |
"๐ฐ๐ท Korean": "kor_Hang",
|
| 56 |
"๐ท๐บ Russian": "rus_Cyrl",
|
| 57 |
-
"๐น๐ท Turkish": "tur_Latn",
|
| 58 |
-
"๐ณ๐ฑ Dutch": "nld_Latn",
|
| 59 |
-
"๐ฎ๐ณ Hindi": "hin_Deva",
|
| 60 |
}
|
| 61 |
|
| 62 |
STT_LANGS = {
|
|
@@ -74,7 +67,6 @@ STT_LANGS = {
|
|
| 74 |
"๐ท๐บ Russian": "rus",
|
| 75 |
}
|
| 76 |
|
| 77 |
-
# Fish Audio API
|
| 78 |
FISH_AUDIO_API_KEY = os.environ.get('FISH_AUDIO_API_KEY', '')
|
| 79 |
|
| 80 |
# ============================================================
|
|
@@ -132,17 +124,21 @@ def translate_audio(audio, source_lang, target_lang, enable_voice_clone):
|
|
| 132 |
|
| 133 |
translation = nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 134 |
|
| 135 |
-
# 3. TTS
|
| 136 |
tts_audio = None
|
| 137 |
if FISH_AUDIO_API_KEY:
|
| 138 |
tts_audio = generate_tts(translation, enable_voice_clone, audio if enable_voice_clone else None)
|
| 139 |
|
| 140 |
result_text = f"""
|
| 141 |
-
|
| 142 |
-
{
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
|
| 145 |
-
{
|
|
|
|
|
|
|
| 146 |
"""
|
| 147 |
|
| 148 |
return tts_audio, result_text
|
|
@@ -159,7 +155,6 @@ def generate_tts(text, clone_voice=False, reference_audio=None):
|
|
| 159 |
headers = {'Authorization': f'Bearer {FISH_AUDIO_API_KEY}'}
|
| 160 |
|
| 161 |
if clone_voice and reference_audio:
|
| 162 |
-
# Voice cloning
|
| 163 |
import tempfile
|
| 164 |
import scipy.io.wavfile as wavfile
|
| 165 |
|
|
@@ -188,7 +183,6 @@ def generate_tts(text, clone_voice=False, reference_audio=None):
|
|
| 188 |
|
| 189 |
os.remove(audio_path)
|
| 190 |
else:
|
| 191 |
-
# Standard TTS
|
| 192 |
payload = {
|
| 193 |
'text': text,
|
| 194 |
'format': 'mp3',
|
|
@@ -213,54 +207,132 @@ def generate_tts(text, clone_voice=False, reference_audio=None):
|
|
| 213 |
return None
|
| 214 |
|
| 215 |
# ============================================================
|
| 216 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
# ============================================================
|
| 218 |
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
gr.Markdown("""
|
| 221 |
-
# ๐ Instant Translat
|
| 222 |
-
|
| 223 |
|
| 224 |
-
-
|
| 225 |
-
- ๐ **Translation**: NLLB-200 (200 languages + Darija)
|
| 226 |
-
- ๐ **TTS**: Fish Audio S1 (Natural voice)
|
| 227 |
-
- ๐ญ **Voice Cloning**: Your voice in any language
|
| 228 |
""")
|
| 229 |
|
| 230 |
with gr.Row():
|
| 231 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
| 232 |
audio_input = gr.Audio(
|
| 233 |
-
label="
|
| 234 |
type="numpy",
|
| 235 |
-
sources=["microphone"]
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
source_lang = gr.Dropdown(
|
| 239 |
-
choices=list(NLLB_LANGS.keys()),
|
| 240 |
-
value="๐ฒ๐ฆ Moroccan Arabic (Darija)",
|
| 241 |
-
label="๐ฃ๏ธ Source Language"
|
| 242 |
)
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
voice_clone = gr.Checkbox(
|
| 251 |
-
label="๐ญ Clone Voice
|
| 252 |
-
value=True
|
|
|
|
| 253 |
)
|
| 254 |
|
| 255 |
translate_btn = gr.Button(
|
| 256 |
-
"๐ Translate",
|
| 257 |
variant="primary",
|
| 258 |
-
size="lg"
|
|
|
|
| 259 |
)
|
| 260 |
|
| 261 |
with gr.Column(scale=1):
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
translate_btn.click(
|
| 266 |
translate_audio,
|
|
@@ -269,25 +341,24 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Instant Translat") as demo:
|
|
| 269 |
)
|
| 270 |
|
| 271 |
gr.Markdown("""
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
-
|
| 279 |
-
- ๐ฒ๐ฆ **Moroccan Darija** (Moroccan Arabic)
|
| 280 |
-
- ๐ธ๐ฆ Arabic (MSA)
|
| 281 |
-
- ๐ซ๐ท French
|
| 282 |
-
- ๐ฌ๐ง English
|
| 283 |
-
- ๐ช๐ธ Spanish
|
| 284 |
-
- ๐ฉ๐ช German
|
| 285 |
-
- And 190+ more languages!
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
- Secure API calls
|
| 291 |
""")
|
| 292 |
|
| 293 |
if __name__ == "__main__":
|
|
|
|
| 20 |
# Load Models
|
| 21 |
# ============================================================
|
| 22 |
|
|
|
|
| 23 |
print("๐ฅ Loading SeamlessM4T v2 Large...")
|
| 24 |
STT_MODEL = "facebook/seamless-m4t-v2-large"
|
| 25 |
stt_processor = AutoProcessor.from_pretrained(STT_MODEL)
|
|
|
|
| 27 |
stt_model = stt_model.to(device).eval()
|
| 28 |
print("✅ SeamlessM4T v2 Large loaded!")
|
| 29 |
|
|
|
|
| 30 |
print("๐ฅ Loading NLLB-200...")
|
| 31 |
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
|
| 32 |
nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
|
|
|
|
| 34 |
nllb_model = nllb_model.to(device).eval()
|
| 35 |
print("✅ NLLB-200 loaded!")
|
| 36 |
|
|
|
|
|
|
|
| 37 |
# ============================================================
|
| 38 |
# Language Codes
|
| 39 |
# ============================================================
|
|
|
|
| 50 |
"๐ฏ๐ต Japanese": "jpn_Jpan",
|
| 51 |
"๐ฐ๐ท Korean": "kor_Hang",
|
| 52 |
"๐ท๐บ Russian": "rus_Cyrl",
|
|
|
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
|
| 55 |
STT_LANGS = {
|
|
|
|
| 67 |
"๐ท๐บ Russian": "rus",
|
| 68 |
}
|
| 69 |
|
|
|
|
| 70 |
FISH_AUDIO_API_KEY = os.environ.get('FISH_AUDIO_API_KEY', '')
|
| 71 |
|
| 72 |
# ============================================================
|
|
|
|
| 124 |
|
| 125 |
translation = nllb_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 126 |
|
| 127 |
+
# 3. TTS
|
| 128 |
tts_audio = None
|
| 129 |
if FISH_AUDIO_API_KEY:
|
| 130 |
tts_audio = generate_tts(translation, enable_voice_clone, audio if enable_voice_clone else None)
|
| 131 |
|
| 132 |
result_text = f"""
|
| 133 |
+
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 12px; margin: 10px 0;">
|
| 134 |
+
<h3 style="color: white; margin: 0 0 10px 0;">๐ค {source_lang}</h3>
|
| 135 |
+
<p style="color: white; font-size: 1.1em; margin: 0;">{transcript}</p>
|
| 136 |
+
</div>
|
| 137 |
|
| 138 |
+
<div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 20px; border-radius: 12px; margin: 10px 0;">
|
| 139 |
+
<h3 style="color: white; margin: 0 0 10px 0;">๐ {target_lang}</h3>
|
| 140 |
+
<p style="color: white; font-size: 1.1em; margin: 0;">{translation}</p>
|
| 141 |
+
</div>
|
| 142 |
"""
|
| 143 |
|
| 144 |
return tts_audio, result_text
|
|
|
|
| 155 |
headers = {'Authorization': f'Bearer {FISH_AUDIO_API_KEY}'}
|
| 156 |
|
| 157 |
if clone_voice and reference_audio:
|
|
|
|
| 158 |
import tempfile
|
| 159 |
import scipy.io.wavfile as wavfile
|
| 160 |
|
|
|
|
| 183 |
|
| 184 |
os.remove(audio_path)
|
| 185 |
else:
|
|
|
|
| 186 |
payload = {
|
| 187 |
'text': text,
|
| 188 |
'format': 'mp3',
|
|
|
|
| 207 |
return None
|
| 208 |
|
| 209 |
# ============================================================
|
| 210 |
+
# Custom CSS
|
| 211 |
+
# ============================================================
|
| 212 |
+
|
| 213 |
+
custom_css = """
|
| 214 |
+
/* Modern Gradient Background */
|
| 215 |
+
.gradio-container {
|
| 216 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 217 |
+
font-family: 'Inter', sans-serif;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
/* Card Style */
|
| 221 |
+
.contain {
|
| 222 |
+
background: rgba(255, 255, 255, 0.95) !important;
|
| 223 |
+
border-radius: 20px !important;
|
| 224 |
+
box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3) !important;
|
| 225 |
+
padding: 30px !important;
|
| 226 |
+
backdrop-filter: blur(10px) !important;
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
/* Buttons */
|
| 230 |
+
.primary {
|
| 231 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
|
| 232 |
+
border: none !important;
|
| 233 |
+
border-radius: 12px !important;
|
| 234 |
+
padding: 15px 30px !important;
|
| 235 |
+
font-weight: 600 !important;
|
| 236 |
+
font-size: 1.1em !important;
|
| 237 |
+
transition: all 0.3s ease !important;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.primary:hover {
|
| 241 |
+
transform: translateY(-2px) !important;
|
| 242 |
+
box-shadow: 0 10px 25px rgba(102, 126, 234, 0.4) !important;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
/* Input Fields */
|
| 246 |
+
.input-audio, .dropdown {
|
| 247 |
+
border-radius: 12px !important;
|
| 248 |
+
border: 2px solid #e0e0e0 !important;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
/* Headers */
|
| 252 |
+
h1, h2, h3 {
|
| 253 |
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
| 254 |
+
-webkit-background-clip: text;
|
| 255 |
+
-webkit-text-fill-color: transparent;
|
| 256 |
+
font-weight: 700;
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
/* Markdown Content */
|
| 260 |
+
.markdown-text {
|
| 261 |
+
line-height: 1.8;
|
| 262 |
+
}
|
| 263 |
+
"""
|
| 264 |
+
|
| 265 |
+
# ============================================================
|
| 266 |
+
# Gradio Interface with Custom Theme
|
| 267 |
# ============================================================
|
| 268 |
|
| 269 |
+
theme = gr.themes.Soft(
|
| 270 |
+
primary_hue="purple",
|
| 271 |
+
secondary_hue="pink",
|
| 272 |
+
neutral_hue="slate",
|
| 273 |
+
font=gr.themes.GoogleFont("Inter"),
|
| 274 |
+
).set(
|
| 275 |
+
button_primary_background_fill="*primary_500",
|
| 276 |
+
button_primary_background_fill_hover="*primary_600",
|
| 277 |
+
button_primary_text_color="white",
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
with gr.Blocks(theme=theme, css=custom_css, title="Instant Translat") as demo:
|
| 281 |
gr.Markdown("""
|
| 282 |
+
# ๐ Instant Translat
|
| 283 |
+
### AI-Powered Voice Translation in 200+ Languages
|
| 284 |
|
| 285 |
+
Translate your voice instantly with cutting-edge AI. Supports Moroccan Darija and 200+ languages!
|
|
|
|
|
|
|
|
|
|
| 286 |
""")
|
| 287 |
|
| 288 |
with gr.Row():
|
| 289 |
with gr.Column(scale=1):
|
| 290 |
+
gr.Markdown("### ๐ค Input")
|
| 291 |
+
|
| 292 |
audio_input = gr.Audio(
|
| 293 |
+
label="Record Your Voice",
|
| 294 |
type="numpy",
|
| 295 |
+
sources=["microphone"],
|
| 296 |
+
elem_classes="input-audio"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
)
|
| 298 |
|
| 299 |
+
with gr.Row():
|
| 300 |
+
source_lang = gr.Dropdown(
|
| 301 |
+
choices=list(NLLB_LANGS.keys()),
|
| 302 |
+
value="๐ฒ๐ฆ Moroccan Arabic (Darija)",
|
| 303 |
+
label="๐ฃ๏ธ From",
|
| 304 |
+
elem_classes="dropdown"
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
target_lang = gr.Dropdown(
|
| 308 |
+
choices=list(NLLB_LANGS.keys()),
|
| 309 |
+
value="๐ฌ๐ง English",
|
| 310 |
+
label="๐ฏ To",
|
| 311 |
+
elem_classes="dropdown"
|
| 312 |
+
)
|
| 313 |
|
| 314 |
voice_clone = gr.Checkbox(
|
| 315 |
+
label="๐ญ Clone My Voice",
|
| 316 |
+
value=True,
|
| 317 |
+
info="Hear translation in your own voice"
|
| 318 |
)
|
| 319 |
|
| 320 |
translate_btn = gr.Button(
|
| 321 |
+
"๐ Translate Now",
|
| 322 |
variant="primary",
|
| 323 |
+
size="lg",
|
| 324 |
+
elem_classes="primary"
|
| 325 |
)
|
| 326 |
|
| 327 |
with gr.Column(scale=1):
|
| 328 |
+
gr.Markdown("### ๐ Output")
|
| 329 |
+
|
| 330 |
+
audio_output = gr.Audio(
|
| 331 |
+
label="Translation Audio",
|
| 332 |
+
type="filepath"
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
text_output = gr.HTML(label="Translation Text")
|
| 336 |
|
| 337 |
translate_btn.click(
|
| 338 |
translate_audio,
|
|
|
|
| 341 |
)
|
| 342 |
|
| 343 |
gr.Markdown("""
|
| 344 |
+
---
|
| 345 |
+
|
| 346 |
+
## ✨ Features
|
| 347 |
+
|
| 348 |
+
- ๐ค **Speech Recognition** - Powered by Meta's SeamlessM4T v2 Large
|
| 349 |
+
- ๐ **Translation** - 200+ languages with NLLB-200
|
| 350 |
+
- ๐ **Natural Voice** - Fish Audio S1 TTS
|
| 351 |
+
- ๐ญ **Voice Cloning** - Hear translation in your voice
|
| 352 |
+
|
| 353 |
+
## ๐ Popular Languages
|
| 354 |
+
|
| 355 |
+
🇲🇦 Moroccan Darija • 🇸🇦 Arabic • 🇫🇷 French • 🇬🇧 English • 🇪🇸 Spanish • 🇩🇪 German • 🇮🇹 Italian • 🇵🇹 Portuguese • 🇨🇳 Chinese • 🇯🇵 Japanese • 🇰🇷 Korean • 🇷🇺 Russian
|
| 356 |
|
| 357 |
+
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
+
<div style="text-align: center; padding: 20px;">
|
| 360 |
+
<p style="color: #666;">Made with ❤️ using Meta AI • Powered by HuggingFace</p>
|
| 361 |
+
</div>
|
|
|
|
| 362 |
""")
|
| 363 |
|
| 364 |
if __name__ == "__main__":
|