# nirukti / app.py
# Anonym-050326's picture
# Upload folder using huggingface_hub
# 82c88f2 verified
"""
Nirukti — Speech AI for Indian Languages
ASR (Speech to Text) + TTS (Text to Speech) + Translation
Set the API_URL (ASR), TTS_API_URL and TRANSLATE_API_URL environment
variables to point to your backends.
"""
import os
import tempfile
import gradio as gr
import numpy as np
import requests
import soundfile as sf
# ── API URLs ─────────────────────────────────────────────────────────────────
# Backend base URLs; each default can be overridden through the environment.
ASR_API_URL = os.getenv("API_URL", "https://genv3.uwc.world")
TTS_API_URL = os.getenv("TTS_API_URL", "http://15.206.159.28:9051")
TRANSLATE_API_URL = os.getenv(
    "TRANSLATE_API_URL", "https://rakuten-d1.shunyalabs.ai"
)
# Bundled TTS sample clips are looked up in a "samples" folder beside this file.
TTS_SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "samples")
# ── ASR: Fallback language families ──────────────────────────────────────────
# Built-in language table (family name -> language names), used when the ASR
# backend does not supply its own grouping.
FALLBACK_LANGUAGE_FAMILIES = {
    "Indo-Aryan": [
        "Hindi", "Bengali", "Marathi", "Gujarati",
        "Punjabi", "Odia", "Assamese", "Maithili",
        "Bhojpuri", "Rajasthani", "Magahi", "Chhattisgarhi",
        "Urdu", "Kashmiri", "Nepali", "Sindhi",
        "Dogri", "Konkani", "Sanskrit", "Marwadi",
        "Mewari", "Wagdi", "Harouti", "Bagri",
        "Banjari", "Awadhi", "Bundeli", "Braj",
        "Haryanvi", "Bagheli", "Garhwali", "Kumaoni",
        "Kangri", "Pahari Mahasui", "Nimadi", "Bhili",
        "Lambadi", "Khortha", "Sambalpuri", "Kachchhi",
        "Ahirani", "Surgujia",
    ],
    "Dravidian": [
        "Telugu",
        "Tamil",
        "Kannada",
        "Malayalam",
        "Tulu",
        "Kodava",
        "Kurukh",
    ],
    "Sino-Tibetan": [
        "Manipuri",
        "Meitei",
        "Bodo",
        "Garo",
    ],
    "Austroasiatic": [
        "Santali",
    ],
    "Indo-European": [
        "English",
    ],
}
# ── TTS: Sample voices config ───────────────────────────────────────────────
# Language -> speaker name whose bundled sample clips are shown for it.
TTS_SAMPLE_LANGUAGES = dict(
    Hindi="rajesh",
    English="varun",
    Bengali="arjun",
    Santali="chandu",
    Tamil="murugan",
    Telugu="vishnu",
    Kannada="kiran",
    Malayalam="krishnan",
    Gujarati="rakesh",
    Marathi="siddharth",
)
# Expression names, in display order; each one is a component of the sample
# clip file name.
TTS_EXPRESSIONS = [
    "neutral",
    "happy",
    "angry",
    "sad",
    "news",
    "conversational",
    "narrative",
    "enthusiastic",
    "fearful",
    "disgust",
    "surprised",
]
# Style tags prepended to text for expressive generation
# Every style name maps to that same name wrapped in angle brackets.
TTS_STYLE_TAGS = {
    style: f"<{style}>"
    for style in (
        "Neutral",
        "Happy",
        "Angry",
        "Sad",
        "News",
        "Fearful",
        "Surprised",
        "Disgust",
        "Conversational",
        "Narrative",
        "Enthusiastic",
    )
}
# ── TTS: Sample generation texts per language ────────────────────────────────
# Demo sentence per sample language; loaded into the TTS text box when the
# matching language button is clicked.
TTS_SAMPLE_TEXTS = {
    "Hindi": (
        "आज की बैठक में हम तीसरी तिमाही की उपलब्धियों की समीक्षा करेंगे "
        "और चौथी तिमाही के लिए एक ठोस कार्ययोजना तैयार करेंगे। "
        "टीम का सहयोग और प्रतिबद्धता इस लक्ष्य को पूरा करने में अत्यंत महत्वपूर्ण है।"
    ),
    "English": (
        "The quarterly performance review revealed significant improvements "
        "across all key metrics, underscoring our team's commitment to "
        "excellence. We must continue this momentum into the next quarter."
    ),
    "Bengali": (
        "আজকের সভায় আমরা তৃতীয় প্রান্তিকের ফলাফল পর্যালোচনা করব এবং "
        "চতুর্থ প্রান্তিকের জন্য একটি কার্যকর পরিকল্পনা তৈরি করব। "
        "দলের প্রত্যেক সদস্যের সক্রিয় অংশগ্রহণ অপরিহার্য।"
    ),
    # Sentences end with the Ol Chiki mucaad (U+1C7E). The text previously
    # used U+ABEB, which is a Meetei Mayek punctuation mark from another script.
    # NOTE(review): only the punctuation was corrected; the wording itself
    # should be checked by a Santali speaker.
    "Santali": (
        "ᱟᱡᱤ ᱵᱟᱭᱥᱤ ᱨᱮ ᱑᱓ᱟᱢ ᱠᱳᱣᱟᱨᱴᱟᱨ ᱒ᱟᱲᱟ ᱢᱮᱱᱟᱜ ᱠᱟᱱᱟ ᱟᱨ "
        "᱑᱔ᱟᱢ ᱠᱳᱣᱟᱨᱴᱟᱨ ᱒ᱟᱲᱟ ᱡᱚᱠᱷᱟᱨ ᱴᱷᱤᱠ ᱠᱟᱱᱟ᱾ "
        "ᱴᱤᱢ ᱒ᱟᱲᱟ ᱦᱚᱲ ᱠᱚ ᱥᱟᱦᱟᱡᱚᱜ ᱠᱚ ᱢᱮᱱᱟᱭ ᱠᱟᱱᱟ᱾"
    ),
    "Tamil": (
        "இன்றைய கூட்டத்தில் மூன்றாவது காலாண்டின் முடிவுகளை மதிப்பாய்வு "
        "செய்வோம் மற்றும் நான்காவது காலாண்டிற்கான திட்டத்தை உருவாக்குவோம். "
        "அணியின் ஒவ்வொரு உறுப்பினரின் அர்ப்பணிப்பும் மிகவும் அவசியம்."
    ),
    "Telugu": (
        "నేటి సమావేశంలో మనం మూడవ త్రైమాసికం ఫలితాలను సమీక్షిస్తాం "
        "మరియు నాల్గవ త్రైమాసికం కోసం ఒక బలమైన ప్రణాళికను రూపొందిస్తాం. "
        "బృందంలో ప్రతి సభ్యుని నిబద్ధత ఈ లక్ష్యసాధనకు చాలా అవసరం."
    ),
    "Kannada": (
        "ಇಂದಿನ ಸಭೆಯಲ್ಲಿ ನಾವು ಮೂರನೇ ತ್ರೈಮಾಸಿಕದ ಫಲಿತಾಂಶಗಳನ್ನು ಪರಿಶೀಲಿಸುತ್ತೇವೆ "
        "ಮತ್ತು ನಾಲ್ಕನೇ ತ್ರೈಮಾಸಿಕದ ಕಾರ್ಯಯೋಜನೆಯನ್ನು ಸಿದ್ಧಪಡಿಸುತ್ತೇವೆ. "
        "ತಂಡದ ಪ್ರತಿಯೊಬ್ಬ ಸದಸ್ಯರ ಪ್ರತಿಬದ್ಧತೆ ಅತ್ಯಂತ ಅವಶ್ಯಕ."
    ),
    "Malayalam": (
        "ഇന്നത്തെ യോഗത്തിൽ നാം മൂന്നാം പാദത്തിലെ ഫലങ്ങൾ അവലോകനം "
        "ചെയ്യുകയും നാലാം പാദത്തിനായി ഒരു ഉറച്ച പദ്ധതി തയ്യാറാക്കുകയും "
        "ചെയ്യും. ടീമിലെ ഓരോ അംഗത്തിന്റെ പ്രതിബദ്ധത അത്യന്താപേക്ഷിതമാണ്."
    ),
    "Gujarati": (
        "આજની બેઠકમાં આપણે ત્રીજા ત્રિમાસિક ગાળાના પરિણામોની સમીક્ષા "
        "કરીશું અને ચોથા ત્રિમાસિક ગાળા માટે એક મજબૂત કાર્યયોજના "
        "તૈયાર કરીશું। ટીમ ના દરેક સભ્ય નો સહયોગ અત્યંત મહત્વ નો છે।"
    ),
    "Marathi": (
        "आजच्या सभेत आपण तिसऱ्या तिमाहीच्या निकालांचा आढावा घेऊ "
        "आणि चौथ्या तिमाहीसाठी एक भक्कम कार्ययोजना आखू. "
        "टीमच्या प्रत्येक सदस्याचे सहकार्य या उद्दिष्टपूर्तीसाठी अत्यावश्यक आहे."
    ),
}
def _sample_path(lang: str, speaker: str, expression: str) -> str | None:
    """Locate the bundled sample clip for a language/speaker/expression.

    The clip is expected at
    ``<TTS_SAMPLES_DIR>/<lang lower-cased>/<speaker>_<expression>_pure.wav``.

    Returns:
        That path when it exists on disk, otherwise ``None``.
    """
    filename = f"{speaker}_{expression}_pure.wav"
    candidate = os.path.join(TTS_SAMPLES_DIR, lang.lower(), filename)
    if not os.path.exists(candidate):
        return None
    return candidate
# ── ASR: Fetch languages ────────────────────────────────────────────────────
def fetch_languages(api_url: str) -> dict[str, list[str]]:
    """Ask the ASR backend which languages it supports, grouped by family.

    Performs ``GET <api_url>/languages`` with a 10 second timeout.

    Resolution order:
      1. A non-empty ``"families"`` value in the response is returned as is.
      2. Otherwise the built-in fallback table is narrowed to the names listed
         under ``"all_languages"``.
      3. If nothing is listed, or nothing survives the narrowing, the full
         fallback table is returned.

    Raises:
        requests.RequestException: On connection problems or an HTTP error
            status (via ``raise_for_status``).
    """
    endpoint = api_url.rstrip("/") + "/languages"
    response = requests.get(endpoint, timeout=10)
    response.raise_for_status()
    payload = response.json()

    server_families = payload.get("families")
    if server_families:
        return server_families

    advertised = set(payload.get("all_languages", []))
    if not advertised:
        return FALLBACK_LANGUAGE_FAMILIES

    narrowed = {}
    for family, members in FALLBACK_LANGUAGE_FAMILIES.items():
        kept = [name for name in members if name in advertised]
        if kept:
            narrowed[family] = kept
    return narrowed or FALLBACK_LANGUAGE_FAMILIES
# ── ASR: Audio helper ───────────────────────────────────────────────────────
def _audio_to_tmp_wav(audio_input) -> str:
    """Write a ``(sample_rate, samples)`` pair to a temporary mono WAV file.

    Args:
        audio_input: Two-item sequence ``(sample_rate, samples)`` as delivered
            by the ``type="numpy"`` audio widget.
            NOTE(review): multi-dimensional input is averaged over its last
            axis, i.e. it is assumed to be ``(frames, channels)`` — confirm.

    Returns:
        Path of the WAV file. The file is not deleted automatically; the
        caller owns it.

    Raises:
        Exception: Whatever ``soundfile.write`` raises; the temporary file is
            removed before the error propagates.
    """
    sr, wav = audio_input
    wav = np.asarray(wav)
    if np.issubdtype(wav.dtype, np.integer):
        # Scale integer PCM by the largest magnitude the dtype can hold.
        info = np.iinfo(wav.dtype)
        wav = wav.astype(np.float32) / max(abs(info.min), info.max)
    else:
        wav = wav.astype(np.float32)
    if wav.ndim > 1:
        wav = np.mean(wav, axis=-1)

    # Reserve a unique path and release our handle before soundfile opens the
    # file by name (writing to a path that is still open fails on some
    # platforms).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp_path = tmp.name
    try:
        sf.write(tmp_path, wav, sr)
    except Exception:
        # Do not leave an orphaned temp file behind when encoding fails.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path
# ── ASR: Transcribe ─────────────────────────────────────────────────────────
def transcribe_fn(audio_input, language, api_url):
    """Send audio to the ASR backend and return the transcription.

    Args:
        audio_input: ``(sample_rate, samples)`` pair from the audio widget, or
            ``None`` when nothing was uploaded or recorded.
        language: Display name of the language. Spaces are replaced with
            underscores before it is sent to the API.
        api_url: Base URL of the ASR backend.

    Returns:
        ``(text, detected_language, info)``. On any failure the first two
        items are empty strings and ``info`` carries the error message, so the
        UI shows a message instead of a traceback.
    """
    if audio_input is None:
        return "", "", "No audio"
    if not language:
        # The language picker can be empty (no choices / nothing selected).
        return "", "", "No language selected"

    api_lang = language.replace(" ", "_")

    try:
        tmp_path = _audio_to_tmp_wav(audio_input)
    except Exception as e:
        # Unreadable or malformed audio is reported like any other failure.
        return "", "", f"Error: {e}"

    try:
        url = f"{api_url.rstrip('/')}/transcribe"
        with open(tmp_path, "rb") as f:
            files = {"audio": (os.path.basename(tmp_path), f)}
            data = {"language": api_lang}
            resp = requests.post(url, files=files, data=data, timeout=300)
        resp.raise_for_status()
        result = resp.json()
    except requests.ConnectionError:
        return "", "", f"Cannot connect to {api_url}"
    except requests.HTTPError as e:
        try:
            detail = e.response.json().get("detail", "")
        except Exception:
            # Error body is not a JSON object; show it raw.
            detail = e.response.text
        # Fall back to the exception text so the message is never blank.
        return "", "", f"API error: {detail or e}"
    except Exception as e:
        return "", "", f"Error: {e}"
    finally:
        # Best-effort cleanup of the temporary upload file.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass

    if not isinstance(result, dict):
        return "", "", "API error: unexpected response format"

    # Mention the language the backend actually used when it differs from the
    # one that was requested.
    proxy = result.get("asr_language", "")
    proxy_note = f" (via {proxy})" if proxy and proxy != api_lang else ""
    info = f"{language}{proxy_note} | {result.get('inference_time_s', '?')}s"
    return (
        result.get("text", ""),
        result.get("detected_language", result.get("asr_language", "")),
        info,
    )
# ── TTS: Fetch speakers ─────────────────────────────────────────────────────
def fetch_speakers():
    """Fetch the speaker catalogue from the TTS backend.

    Performs ``GET <TTS_API_URL>/speakers`` with a 10 second timeout.

    Returns:
        The ``"speakers"`` mapping from the response, or an empty dict when
        the request fails or the payload is not a mapping. Failures are
        printed and never raised.
    """
    try:
        # Tolerate a trailing slash in the configured base URL.
        resp = requests.get(f"{TTS_API_URL.rstrip('/')}/speakers", timeout=10)
        resp.raise_for_status()
        speakers = resp.json()["speakers"]
    except Exception as e:
        print(f"Failed to fetch speakers: {e}")
        return {}
    if not isinstance(speakers, dict):
        # The UI builder iterates this with .items(); anything else would
        # crash it at startup.
        print(
            "Failed to fetch speakers: expected a mapping, "
            f"got {type(speakers).__name__}"
        )
        return {}
    return speakers
# ── TTS: Generate speech ────────────────────────────────────────────────────
def generate_speech(speaker_selection, text):
    """Synthesize ``text`` with the chosen speaker and save the audio to disk.

    Args:
        speaker_selection: Dropdown entry of the form
            ``"<name> - <description>"``; only the name part is sent to the
            API.
        text: Text to synthesize; surrounding whitespace is stripped.

    Returns:
        ``(path, path)``: the same file path twice, one for the audio player
        output and one for the download output.

    Raises:
        gr.Error: When no speaker or no text is given, or when the TTS request
            fails.
    """
    if not speaker_selection or " - " not in speaker_selection:
        raise gr.Error("Please select a speaker.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    speaker_name = speaker_selection.split(" - ")[0].strip()

    try:
        resp = requests.post(
            # Tolerate a trailing slash in the configured base URL.
            f"{TTS_API_URL.rstrip('/')}/tts",
            json={"text": text.strip(), "speaker": speaker_name},
            timeout=120,
        )
        resp.raise_for_status()
    except requests.HTTPError as e:
        detail = ""
        try:
            detail = e.response.json().get("detail", "")
        except Exception:
            # Error body is not a JSON object; fall back to the exception text.
            pass
        raise gr.Error(f"API error: {detail or str(e)}") from e
    except requests.ConnectionError as e:
        raise gr.Error("Cannot connect to TTS API.") from e
    except requests.Timeout as e:
        raise gr.Error("Request timed out. Try shorter text.") from e
    except requests.RequestException as e:
        # Any other transport-level failure (bad URL, too many redirects, ...).
        raise gr.Error(f"Request failed: {e}") from e

    # Each request gets its own directory: requests are processed concurrently,
    # and a path shared per speaker would let one request overwrite another's
    # audio. The file name still carries the speaker name for the download.
    base_dir = os.path.join(tempfile.gettempdir(), "nirukti-tts")
    os.makedirs(base_dir, exist_ok=True)
    out_dir = tempfile.mkdtemp(dir=base_dir)
    # Keep only filename-safe characters so the name cannot escape out_dir.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in speaker_name
    ) or "speech"
    out_path = os.path.join(out_dir, f"{safe_name}.wav")
    with open(out_path, "wb") as f:
        f.write(resp.content)
    return out_path, out_path
# ── TTS: Update sample audios + text on language click ───────────────────────
def _make_sample_updater(lang: str):
    """Build the click handler that switches the sample panel to ``lang``.

    Args:
        lang: A key of ``TTS_SAMPLE_LANGUAGES``.

    Returns:
        A zero-argument callable returning a list with one ``gr.Audio`` per
        entry of ``TTS_EXPRESSIONS`` (in order), followed by the Markdown
        caption and the sample text for the language.

    Raises:
        KeyError: If ``lang`` has no configured sample speaker.
    """
    speaker = TTS_SAMPLE_LANGUAGES[lang]
    sample_text = TTS_SAMPLE_TEXTS.get(lang, "")
    speaker_title = speaker.title()

    def updater():
        results = [
            gr.Audio(
                # Path is None when the clip is not bundled.
                value=_sample_path(lang, speaker, expr),
                label=f"{speaker_title}{expr.title()}",
            )
            for expr in TTS_EXPRESSIONS
        ]
        # Same "Language — Speaker" format as the caption the panel starts
        # with; the separator was previously missing here.
        results.append(f"**{lang} — {speaker_title}**")
        results.append(sample_text)
        return results

    return updater
# ── Translation: Fetch supported languages ───────────────────────────────────
def fetch_translate_languages():
    """Fetch the translation backend's languages as a name -> code mapping.

    Performs ``GET <TRANSLATE_API_URL>/api/v1/supported-languages`` with a
    10 second timeout and merges the ``"supported"``, ``"fine_tuned"`` and
    ``"proxied"`` lists, in that order (later entries win on duplicate names).

    Returns:
        ``{language name: language code}``, or an empty dict when the request
        fails. Failures are printed and never raised.
    """
    try:
        resp = requests.get(
            # Tolerate a trailing slash in the configured base URL.
            f"{TRANSLATE_API_URL.rstrip('/')}/api/v1/supported-languages",
            timeout=10,
        )
        resp.raise_for_status()
        data = resp.json()
        lang_map = {}
        for category in ("supported", "fine_tuned", "proxied"):
            # A missing or null category contributes nothing.
            for entry in data.get(category) or []:
                # Skip malformed entries instead of discarding the whole list.
                if isinstance(entry, dict) and "name" in entry and "code" in entry:
                    lang_map[entry["name"]] = entry["code"]
        return lang_map
    except Exception as e:
        print(f"Failed to fetch translate languages: {e}")
        return {}
# ── Translation: Translate text ──────────────────────────────────────────────
def translate_fn(src_lang, tgt_lang, text, lang_map):
    """Translate ``text`` through the translation backend.

    Args:
        src_lang: Source language name as shown in the dropdown.
        tgt_lang: Target language name as shown in the dropdown.
        text: Text to translate; surrounding whitespace is stripped.
        lang_map: Language name -> code mapping. For a name missing from the
            map, the first two lower-cased letters of the name are sent.

    Returns:
        ``(translated_text, info)``. With empty input both are empty strings.
        On failure the first item holds the error message and ``info`` is
        empty; errors are never raised.
    """
    if not text or not text.strip():
        return "", ""
    if not src_lang or not tgt_lang:
        # A dropdown can be cleared; report it rather than crash on None.
        return "Please select both a source and a target language.", ""

    lang_map = lang_map or {}
    src_code = lang_map.get(src_lang, src_lang.lower()[:2])
    tgt_code = lang_map.get(tgt_lang, tgt_lang.lower()[:2])

    try:
        resp = requests.post(
            # Tolerate a trailing slash in the configured base URL.
            f"{TRANSLATE_API_URL.rstrip('/')}/api/v1/translate-text",
            json={
                "text": text.strip(),
                "source_lang": src_code,
                "target_lang": tgt_code,
            },
            timeout=60,
        )
        resp.raise_for_status()
        result = resp.json()
    except requests.ConnectionError:
        return "Cannot connect to translation API", ""
    except requests.HTTPError as e:
        try:
            detail = e.response.json().get("detail", "")
        except Exception:
            # Error body is not a JSON object; show it raw.
            detail = e.response.text
        # Fall back to the exception text so the message is never blank.
        return f"API error: {detail or e}", ""
    except Exception as e:
        return f"Error: {e}", ""

    if not isinstance(result, dict):
        return "API error: unexpected response format", ""

    translated = result.get("translated_text", "")
    # Info line: processing time plus any proxy languages the backend reports.
    parts = [f"{result.get('processing_time_ms', '?')}ms"]
    if result.get("source_proxy"):
        parts.append(f"src proxy: {result['source_proxy']}")
    if result.get("target_proxy"):
        parts.append(f"tgt proxy: {result['target_proxy']}")
    info = " | ".join(parts)
    return translated, info
# ── Build the UI ─────────────────────────────────────────────────────────────
def build_app(
    asr_api_url: str,
    families: dict[str, list[str]],
    tts_speakers: dict,
    translate_langs: dict,
) -> gr.Blocks:
    """Assemble the three-tab Gradio interface (ASR, TTS, Translation).

    Args:
        asr_api_url: Base URL of the ASR backend; handed to ``transcribe_fn``
            through a ``gr.State``.
        families: Language family -> language names. Every language is
            offered in the ASR language picker and counted in the header.
        tts_speakers: Speaker name -> description, rendered as
            ``"<name> - <description>"`` dropdown entries. When empty, a
            single "No speakers available" entry is shown.
        translate_langs: Language name -> code for the translation backend.
            When empty, "English" and "Hindi" are offered.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` app.

    Raises:
        RuntimeError: If ``families`` is empty.
    """
    if not families:
        raise RuntimeError("No language families available.")

    total = sum(len(v) for v in families.values())
    # Flat list of every ASR language, in family order.
    all_asr_langs = [lang for fam_langs in families.values() for lang in fam_langs]
    default_asr_lang = all_asr_langs[0] if all_asr_langs else None

    speaker_choices = (
        [f"{name} - {desc}" for name, desc in tts_speakers.items()]
        if tts_speakers
        else ["No speakers available"]
    )

    # Language whose samples and demo text the TTS tab shows before any
    # language button is clicked.
    default_lang = "Hindi"
    default_speaker = TTS_SAMPLE_LANGUAGES[default_lang]

    header_html = f"""\
<div class="header-block">
<h1>Nirukti</h1>
<p>Speech AI for <b>{total} Indian Languages</b></p>
<span class="badge">{total} Languages &middot; ASR &middot; TTS &middot; Translation</span>
</div>"""

    with gr.Blocks() as demo:
        asr_api_state = gr.State(asr_api_url)
        gr.HTML(header_html)

        # ━━━━━━━━━━━━━━━━━━━ ASR TAB ━━━━━━━━━━━━━━━━━━━
        with gr.Tab("Speech to Text (ASR)"):
            lang_radio = gr.Radio(
                choices=all_asr_langs,
                label="Language",
                value=default_asr_lang,
                interactive=True,
            )
            with gr.Row(equal_height=True):
                with gr.Column(scale=2):
                    audio_in = gr.Audio(
                        label="Upload or Record",
                        type="numpy",
                        sources=["upload", "microphone"],
                    )
                    with gr.Row():
                        transcribe_btn = gr.Button("Transcribe", variant="primary", scale=2)
                        clear_btn = gr.Button("Clear", variant="secondary", scale=1)
                with gr.Column(scale=3):
                    out_text = gr.Textbox(
                        label="Transcription", lines=10,
                        interactive=False,
                    )
                    with gr.Row():
                        out_lang = gr.Textbox(label="Detected Language", lines=1, interactive=False, scale=1)
                        out_info = gr.Textbox(label="Info", lines=1, interactive=False, scale=2)
            transcribe_btn.click(
                transcribe_fn,
                inputs=[audio_in, lang_radio, asr_api_state],
                outputs=[out_text, out_lang, out_info],
            )
            # Clear resets the audio, re-selects the first language and empties
            # the three output boxes.
            clear_btn.click(
                lambda: (None, default_asr_lang, "", "", ""),
                outputs=[audio_in, lang_radio, out_text, out_lang, out_info],
            )

        # ━━━━━━━━━━━━━━━━━━━ TTS TAB ━━━━━━━━━━━━━━━━━━━
        with gr.Tab("Text to Speech (TTS)"):
            # -- Generation controls --
            tts_text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Enter text to synthesize...",
                value=TTS_SAMPLE_TEXTS.get(default_lang, ""),
            )
            tts_speaker = gr.Dropdown(
                choices=speaker_choices,
                label="Speaker",
                # speaker_choices always holds at least the placeholder entry.
                value=speaker_choices[0],
            )
            tts_btn = gr.Button(
                "Generate", variant="primary", elem_classes=["generate-btn"], size="lg",
            )
            with gr.Row():
                tts_audio = gr.Audio(label="Generated Audio", type="filepath")
                tts_download = gr.File(label="Download WAV")
            tts_btn.click(
                generate_speech,
                inputs=[tts_speaker, tts_text],
                outputs=[tts_audio, tts_download],
            )
            gr.Markdown("---")

            # -- Voice samples grid --
            gr.Markdown("### Voice Samples")
            gr.Markdown("Click a language to preview voice samples across expressions.")
            with gr.Row():
                lang_btns = {
                    lang_name: gr.Button(
                        lang_name, elem_classes=["lang-btn"], size="sm",
                    )
                    for lang_name in TTS_SAMPLE_LANGUAGES
                }
            sample_label = gr.Markdown(
                value=f"**{default_lang} — {default_speaker.title()}**",
                elem_classes=["sample-label"],
            )

            # Two rows of players: the first 6 expressions, then the rest.
            # sample_audios keeps TTS_EXPRESSIONS order, which is the order the
            # language-button handlers return their values in.
            sample_audios = []
            for row_expressions in (TTS_EXPRESSIONS[:6], TTS_EXPRESSIONS[6:]):
                with gr.Row():
                    for expr in row_expressions:
                        sample_audios.append(
                            gr.Audio(
                                value=_sample_path(default_lang, default_speaker, expr),
                                label=f"{default_speaker.title()}{expr.title()}",
                                interactive=False,
                            )
                        )

            # Wire language buttons to update sample audios + text
            for lang_name, btn in lang_btns.items():
                btn.click(
                    _make_sample_updater(lang_name),
                    outputs=sample_audios + [sample_label, tts_text],
                )

        # ━━━━━━━━━━━━━━━━━━━ TRANSLATION TAB ━━━━━━━━━━━━━━━━━━━
        trans_lang_names = sorted(translate_langs.keys()) if translate_langs else ["English", "Hindi"]
        trans_lang_state = gr.State(translate_langs)
        with gr.Tab("Translation"):
            gr.Markdown("### Text Translation")
            with gr.Row():
                trans_src_lang = gr.Dropdown(
                    choices=trans_lang_names,
                    label="Source Language",
                    value="English" if "English" in trans_lang_names else trans_lang_names[0],
                    scale=1,
                )
                trans_tgt_lang = gr.Dropdown(
                    choices=trans_lang_names,
                    label="Target Language",
                    value="Hindi" if "Hindi" in trans_lang_names else trans_lang_names[-1],
                    scale=1,
                )
            with gr.Row(equal_height=True):
                trans_input = gr.Textbox(
                    label="Source Text", lines=6,
                    placeholder="Enter text to translate...",
                )
                trans_output = gr.Textbox(
                    label="Translated Text", lines=6,
                    interactive=False,
                )
            trans_info = gr.Textbox(label="Info", lines=1, interactive=False)
            trans_btn = gr.Button("Translate", variant="primary")
            trans_btn.click(
                translate_fn,
                inputs=[trans_src_lang, trans_tgt_lang, trans_input, trans_lang_state],
                outputs=[trans_output, trans_info],
            )

        # -- Footer --
        gr.HTML(
            '<div style="text-align:center;color:#aaa;font-size:0.78em;padding:8px 0 0;'
            'border-top:1px solid #eee;margin-top:12px;">'
            "Nirukti</div>"
        )
    return demo
# ── Main ─────────────────────────────────────────────────────────────────────
# Startup: ask each backend what it offers, then build the UI. The ASR list
# falls back to the built-in table when the request fails or returns nothing.
print(f"Fetching ASR languages from {ASR_API_URL}/languages ...")
try:
    families = fetch_languages(ASR_API_URL)
    total = sum(map(len, families.values()))
    if not total:
        families = FALLBACK_LANGUAGE_FAMILIES
    print(f" Got {sum(len(v) for v in families.values())} languages across {len(families)} families")
except Exception as e:
    print(f" Failed: {e}. Using fallback.")
    families = FALLBACK_LANGUAGE_FAMILIES

# These two helpers report their own failures and return an empty dict.
print(f"Fetching TTS speakers from {TTS_API_URL}/speakers ...")
tts_speakers = fetch_speakers()
print(f" Got {len(tts_speakers)} speakers")

print(f"Fetching translation languages from {TRANSLATE_API_URL} ...")
translate_langs = fetch_translate_languages()
print(f" Got {len(translate_langs)} languages")

demo = build_app(
    asr_api_url=ASR_API_URL,
    families=families,
    tts_speakers=tts_speakers,
    translate_langs=translate_langs,
)
# Look and feel: Soft theme with a blue accent, Inter first in the font stack.
theme = gr.themes.Soft(
    primary_hue="blue",
    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
)
# Custom stylesheet: compact buttons, the gradient header block, the generate
# button, and a hidden footer.
css = """\
.family-btn { min-width: 90px !important; font-size: 0.82em !important; padding: 4px 8px !important; }
.lang-btn { min-width: 80px !important; font-size: 0.82em !important; padding: 4px 8px !important; }
.sample-label { text-align: center; font-size: 0.78em; color: #666; margin-top: 2px; }
footer { display: none !important; }
.header-block {
background: linear-gradient(135deg, #0F2850 0%, #1A4080 100%);
border-radius: 12px; padding: 24px 20px 18px; margin-bottom: 16px; text-align: center;
}
.header-block h1 { margin: 0 !important; font-size: 1.8em !important; color: #fff !important; font-weight: 700 !important; }
.header-block p { margin: 4px 0 0 !important; font-size: 0.9em !important; color: #ccc !important; }
.header-block .badge {
display: inline-block; margin-top: 10px; padding: 3px 12px;
background: rgba(255,255,255,0.12); border: 1px solid rgba(255,255,255,0.3);
border-radius: 16px; font-size: 0.75em !important; color: #fff !important;
}
.generate-btn {
background: linear-gradient(135deg, #0F2850, #1A4080) !important;
border: none !important; color: #fff !important; font-weight: 600 !important;
}
"""
# Enable the request queue (default concurrency limit 4) on the app, then
# start serving it.
queued_demo = demo.queue(default_concurrency_limit=4)
queued_demo.launch(theme=theme, css=css)