""" Nirukti — Speech AI for Indian Languages ASR (Speech to Text) + TTS (Text to Speech) + Translation Set API_URL, TTS_API_URL environment variables to point to your backends. """ import os import tempfile import gradio as gr import numpy as np import requests import soundfile as sf # ── API URLs ───────────────────────────────────────────────────────────────── ASR_API_URL = os.environ.get("API_URL", "https://genv3.uwc.world") TTS_API_URL = os.environ.get("TTS_API_URL", "http://15.206.159.28:9051") TRANSLATE_API_URL = os.environ.get("TRANSLATE_API_URL", "https://rakuten-d1.shunyalabs.ai") TTS_SAMPLES_DIR = os.path.join(os.path.dirname(__file__), "samples") # ── ASR: Fallback language families ────────────────────────────────────────── FALLBACK_LANGUAGE_FAMILIES = { "Indo-Aryan": [ "Hindi", "Bengali", "Marathi", "Gujarati", "Punjabi", "Odia", "Assamese", "Maithili", "Bhojpuri", "Rajasthani", "Magahi", "Chhattisgarhi", "Urdu", "Kashmiri", "Nepali", "Sindhi", "Dogri", "Konkani", "Sanskrit", "Marwadi", "Mewari", "Wagdi", "Harouti", "Bagri", "Banjari", "Awadhi", "Bundeli", "Braj", "Haryanvi", "Bagheli", "Garhwali", "Kumaoni", "Kangri", "Pahari Mahasui", "Nimadi", "Bhili", "Lambadi", "Khortha", "Sambalpuri", "Kachchhi", "Ahirani", "Surgujia", ], "Dravidian": [ "Telugu", "Tamil", "Kannada", "Malayalam", "Tulu", "Kodava", "Kurukh", ], "Sino-Tibetan": ["Manipuri", "Meitei", "Bodo", "Garo"], "Austroasiatic": ["Santali"], "Indo-European": ["English"], } # ── TTS: Sample voices config ─────────────────────────────────────────────── TTS_SAMPLE_LANGUAGES = { "Hindi": "rajesh", "English": "varun", "Bengali": "arjun", "Santali": "chandu", "Tamil": "murugan", "Telugu": "vishnu", "Kannada": "kiran", "Malayalam": "krishnan", "Gujarati": "rakesh", "Marathi": "siddharth", } TTS_EXPRESSIONS = [ "neutral", "happy", "angry", "sad", "news", "conversational", "narrative", "enthusiastic", "fearful", "disgust", "surprised", ] # Style tags prepended to text for expressive generation TTS_STYLE_TAGS = { "Neutral": "", "Happy": "", "Angry": "", "Sad": "", "News": "", "Fearful": "", "Surprised": "", "Disgust": "", "Conversational": "", "Narrative": "", "Enthusiastic": "", } # ── TTS: Sample generation texts per language ──────────────────────────────── TTS_SAMPLE_TEXTS = { "Hindi": ( "आज की बैठक में हम तीसरी तिमाही की उपलब्धियों की समीक्षा करेंगे " "और चौथी तिमाही के लिए एक ठोस कार्ययोजना तैयार करेंगे। " "टीम का सहयोग और प्रतिबद्धता इस लक्ष्य को पूरा करने में अत्यंत महत्वपूर्ण है।" ), "English": ( "The quarterly performance review revealed significant improvements " "across all key metrics, underscoring our team's commitment to " "excellence. We must continue this momentum into the next quarter." ), "Bengali": ( "আজকের সভায় আমরা তৃতীয় প্রান্তিকের ফলাফল পর্যালোচনা করব এবং " "চতুর্থ প্রান্তিকের জন্য একটি কার্যকর পরিকল্পনা তৈরি করব। " "দলের প্রত্যেক সদস্যের সক্রিয় অংশগ্রহণ অপরিহার্য।" ), "Santali": ( "ᱟᱡᱤ ᱵᱟᱭᱥᱤ ᱨᱮ ᱑᱓ᱟᱢ ᱠᱳᱣᱟᱨᱴᱟᱨ ᱒ᱟᱲᱟ ᱢᱮᱱᱟᱜ ᱠᱟᱱᱟ ᱟᱨ " "᱑᱔ᱟᱢ ᱠᱳᱣᱟᱨᱴᱟᱨ ᱒ᱟᱲᱟ ᱡᱚᱠᱷᱟᱨ ᱴᱷᱤᱠ ᱠᱟᱱᱟ꯫ " "ᱴᱤᱢ ᱒ᱟᱲᱟ ᱦᱚᱲ ᱠᱚ ᱥᱟᱦᱟᱡᱚᱜ ᱠᱚ ᱢᱮᱱᱟᱭ ᱠᱟᱱᱟ꯫" ), "Tamil": ( "இன்றைய கூட்டத்தில் மூன்றாவது காலாண்டின் முடிவுகளை மதிப்பாய்வு " "செய்வோம் மற்றும் நான்காவது காலாண்டிற்கான திட்டத்தை உருவாக்குவோம். " "அணியின் ஒவ்வொரு உறுப்பினரின் அர்ப்பணிப்பும் மிகவும் அவசியம்." ), "Telugu": ( "నేటి సమావేశంలో మనం మూడవ త్రైమాసికం ఫలితాలను సమీక్షిస్తాం " "మరియు నాల్గవ త్రైమాసికం కోసం ఒక బలమైన ప్రణాళికను రూపొందిస్తాం. " "బృందంలో ప్రతి సభ్యుని నిబద్ధత ఈ లక్ష్యసాధనకు చాలా అవసరం." ), "Kannada": ( "ಇಂದಿನ ಸಭೆಯಲ್ಲಿ ನಾವು ಮೂರನೇ ತ್ರೈಮಾಸಿಕದ ಫಲಿತಾಂಶಗಳನ್ನು ಪರಿಶೀಲಿಸುತ್ತೇವೆ " "ಮತ್ತು ನಾಲ್ಕನೇ ತ್ರೈಮಾಸಿಕದ ಕಾರ್ಯಯೋಜನೆಯನ್ನು ಸಿದ್ಧಪಡಿಸುತ್ತೇವೆ. " "ತಂಡದ ಪ್ರತಿಯೊಬ್ಬ ಸದಸ್ಯರ ಪ್ರತಿಬದ್ಧತೆ ಅತ್ಯಂತ ಅವಶ್ಯಕ." ), "Malayalam": ( "ഇന്നത്തെ യോഗത്തിൽ നാം മൂന്നാം പാദത്തിലെ ഫലങ്ങൾ അവലോകനം " "ചെയ്യുകയും നാലാം പാദത്തിനായി ഒരു ഉറച്ച പദ്ധതി തയ്യാറാക്കുകയും " "ചെയ്യും. ടീമിലെ ഓരോ അംഗത്തിന്റെ പ്രതിബദ്ധത അത്യന്താപേക്ഷിതമാണ്." ), "Gujarati": ( "આજની બેઠકમાં આપણે ત્રીજા ત્રિમાસિક ગાળાના પરિણામોની સમીક્ષા " "કરીશું અને ચોથા ત્રિમાસિક ગાળા માટે એક મજબૂત કાર્યયોજના " "તૈયાર કરીશું। ટીમ ના દરેક સભ્ય નો સહયોગ અત્યંત મહત્વ નો છે।" ), "Marathi": ( "आजच्या सभेत आपण तिसऱ्या तिमाहीच्या निकालांचा आढावा घेऊ " "आणि चौथ्या तिमाहीसाठी एक भक्कम कार्ययोजना आखू. " "टीमच्या प्रत्येक सदस्याचे सहकार्य या उद्दिष्टपूर्तीसाठी अत्यावश्यक आहे." ), } def _sample_path(lang: str, speaker: str, expression: str) -> str | None: path = os.path.join( TTS_SAMPLES_DIR, lang.lower(), f"{speaker}_{expression}_pure.wav" ) return path if os.path.exists(path) else None # ── ASR: Fetch languages ──────────────────────────────────────────────────── def fetch_languages(api_url: str) -> dict[str, list[str]]: url = f"{api_url.rstrip('/')}/languages" resp = requests.get(url, timeout=10) resp.raise_for_status() data = resp.json() families = data.get("families") if families: return families all_langs = set(data.get("all_languages", [])) if not all_langs: return FALLBACK_LANGUAGE_FAMILIES filtered = {} for fam, langs in FALLBACK_LANGUAGE_FAMILIES.items(): supported = [l for l in langs if l in all_langs] if supported: filtered[fam] = supported return filtered if filtered else FALLBACK_LANGUAGE_FAMILIES # ── ASR: Audio helper ─────────────────────────────────────────────────────── def _audio_to_tmp_wav(audio_input) -> str: sr, wav = audio_input wav = np.asarray(wav) if np.issubdtype(wav.dtype, np.integer): info = np.iinfo(wav.dtype) wav = wav.astype(np.float32) / max(abs(info.min), info.max) else: wav = wav.astype(np.float32) if wav.ndim > 1: wav = np.mean(wav, axis=-1) tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") sf.write(tmp.name, wav, sr) tmp.close() return tmp.name # ── ASR: Transcribe ───────────────────────────────────────────────────────── def transcribe_fn(audio_input, language, api_url): if audio_input is None: return "", "", "No audio" tmp_path = _audio_to_tmp_wav(audio_input) api_lang = language.replace(" ", "_") try: url = f"{api_url.rstrip('/')}/transcribe" with open(tmp_path, "rb") as f: files = {"audio": (os.path.basename(tmp_path), f)} data = {"language": api_lang} resp = requests.post(url, files=files, data=data, timeout=300) resp.raise_for_status() result = resp.json() except requests.ConnectionError: return "", "", f"Cannot connect to {api_url}" except requests.HTTPError as e: try: detail = e.response.json().get("detail", "") except Exception: detail = e.response.text return "", "", f"API error: {detail}" except Exception as e: return "", "", f"Error: {e}" finally: try: os.unlink(tmp_path) except OSError: pass proxy = result.get("asr_language", "") proxy_note = f" (via {proxy})" if proxy and proxy != api_lang else "" info = f"{language}{proxy_note} | {result.get('inference_time_s', '?')}s" return ( result.get("text", ""), result.get("detected_language", result.get("asr_language", "")), info, ) # ── TTS: Fetch speakers ───────────────────────────────────────────────────── def fetch_speakers(): try: resp = requests.get(f"{TTS_API_URL}/speakers", timeout=10) resp.raise_for_status() return resp.json()["speakers"] except Exception as e: print(f"Failed to fetch speakers: {e}") return {} # ── TTS: Generate speech ──────────────────────────────────────────────────── def generate_speech(speaker_selection, text): if not speaker_selection or " - " not in speaker_selection: raise gr.Error("Please select a speaker.") if not text or not text.strip(): raise gr.Error("Please enter some text.") speaker_name = speaker_selection.split(" - ")[0].strip() try: resp = requests.post( f"{TTS_API_URL}/tts", json={"text": text.strip(), "speaker": speaker_name}, timeout=120, ) resp.raise_for_status() except requests.HTTPError as e: detail = "" try: detail = e.response.json().get("detail", "") except Exception: pass raise gr.Error(f"API error: {detail or str(e)}") except requests.ConnectionError: raise gr.Error("Cannot connect to TTS API.") except requests.Timeout: raise gr.Error("Request timed out. Try shorter text.") os.makedirs("/tmp/nirukti-tts", exist_ok=True) out_path = f"/tmp/nirukti-tts/{speaker_name}.wav" with open(out_path, "wb") as f: f.write(resp.content) return out_path, out_path # ── TTS: Update sample audios + text on language click ─────────────────────── def _make_sample_updater(lang: str): speaker = TTS_SAMPLE_LANGUAGES[lang] sample_text = TTS_SAMPLE_TEXTS.get(lang, "") def updater(): results = [] for expr in TTS_EXPRESSIONS: path = _sample_path(lang, speaker, expr) results.append(gr.Audio(value=path, label=f"{speaker.title()} — {expr.title()}")) results.append(f"**{lang} — {speaker.title()}**") results.append(sample_text) return results return updater # ── Translation: Fetch supported languages ─────────────────────────────────── def fetch_translate_languages(): try: resp = requests.get( f"{TRANSLATE_API_URL}/api/v1/supported-languages", timeout=10 ) resp.raise_for_status() data = resp.json() # Build name -> code mapping from all categories lang_map = {} for entry in data.get("supported", []) + data.get("fine_tuned", []) + data.get("proxied", []): lang_map[entry["name"]] = entry["code"] return lang_map except Exception as e: print(f"Failed to fetch translate languages: {e}") return {} # ── Translation: Translate text ────────────────────────────────────────────── def translate_fn(src_lang, tgt_lang, text, lang_map): if not text or not text.strip(): return "", "" src_code = lang_map.get(src_lang, src_lang.lower()[:2]) tgt_code = lang_map.get(tgt_lang, tgt_lang.lower()[:2]) try: resp = requests.post( f"{TRANSLATE_API_URL}/api/v1/translate-text", json={ "text": text.strip(), "source_lang": src_code, "target_lang": tgt_code, }, timeout=60, ) resp.raise_for_status() result = resp.json() except requests.ConnectionError: return f"Cannot connect to translation API", "" except requests.HTTPError as e: try: detail = e.response.json().get("detail", "") except Exception: detail = e.response.text return f"API error: {detail}", "" except Exception as e: return f"Error: {e}", "" translated = result.get("translated_text", "") parts = [f"{result.get('processing_time_ms', '?')}ms"] if result.get("source_proxy"): parts.append(f"src proxy: {result['source_proxy']}") if result.get("target_proxy"): parts.append(f"tgt proxy: {result['target_proxy']}") info = " | ".join(parts) return translated, info # ── Build the UI ───────────────────────────────────────────────────────────── def build_app( asr_api_url: str, families: dict[str, list[str]], tts_speakers: dict, translate_langs: dict, ) -> gr.Blocks: if not families: raise RuntimeError("No language families available.") total = sum(len(v) for v in families.values()) n_families = len(families) first_family = next(iter(families)) first_langs = families[first_family] speaker_choices = ( [f"{name} - {desc}" for name, desc in tts_speakers.items()] if tts_speakers else ["No speakers available"] ) header_html = f"""\

Nirukti

Speech AI for {total} Indian Languages

{total} Languages · ASR · TTS · Translation
""" with gr.Blocks() as demo: asr_api_state = gr.State(asr_api_url) gr.HTML(header_html) # ━━━━━━━━━━━━━━━━━━━ ASR TAB ━━━━━━━━━━━━━━━━━━━ all_asr_langs = [] for fam_langs in families.values(): all_asr_langs.extend(fam_langs) with gr.Tab("Speech to Text (ASR)"): lang_radio = gr.Radio( choices=all_asr_langs, label="Language", value=all_asr_langs[0] if all_asr_langs else None, interactive=True, ) with gr.Row(equal_height=True): with gr.Column(scale=2): audio_in = gr.Audio( label="Upload or Record", type="numpy", sources=["upload", "microphone"], ) with gr.Row(): transcribe_btn = gr.Button("Transcribe", variant="primary", scale=2) clear_btn = gr.Button("Clear", variant="secondary", scale=1) with gr.Column(scale=3): out_text = gr.Textbox( label="Transcription", lines=10, interactive=False, ) with gr.Row(): out_lang = gr.Textbox(label="Detected Language", lines=1, interactive=False, scale=1) out_info = gr.Textbox(label="Info", lines=1, interactive=False, scale=2) transcribe_btn.click( transcribe_fn, inputs=[audio_in, lang_radio, asr_api_state], outputs=[out_text, out_lang, out_info], ) clear_btn.click( lambda: (None, all_asr_langs[0] if all_asr_langs else None, "", "", ""), outputs=[audio_in, lang_radio, out_text, out_lang, out_info], ) # ━━━━━━━━━━━━━━━━━━━ TTS TAB ━━━━━━━━━━━━━━━━━━━ with gr.Tab("Text to Speech (TTS)"): # -- Generation controls -- tts_text = gr.Textbox( label="Text", lines=4, placeholder="Enter text to synthesize...", value=TTS_SAMPLE_TEXTS.get("Hindi", ""), ) tts_speaker = gr.Dropdown( choices=speaker_choices, label="Speaker", value=speaker_choices[0] if speaker_choices else None, ) tts_btn = gr.Button( "Generate", variant="primary", elem_classes=["generate-btn"], size="lg", ) with gr.Row(): tts_audio = gr.Audio(label="Generated Audio", type="filepath") tts_download = gr.File(label="Download WAV") tts_btn.click( generate_speech, inputs=[tts_speaker, tts_text], outputs=[tts_audio, tts_download], ) gr.Markdown("---") # -- Voice samples grid -- gr.Markdown("### Voice Samples") gr.Markdown("Click a language to preview voice samples across expressions.") with gr.Row(): lang_btns = {} for lang_name in TTS_SAMPLE_LANGUAGES: lang_btns[lang_name] = gr.Button( lang_name, elem_classes=["lang-btn"], size="sm", ) sample_label = gr.Markdown( value="**Hindi — Rajesh**", elem_classes=["sample-label"] ) sample_audios = [] default_lang = "Hindi" default_speaker = TTS_SAMPLE_LANGUAGES[default_lang] # Row 1: first 6 expressions with gr.Row(): for expr in TTS_EXPRESSIONS[:6]: path = _sample_path(default_lang, default_speaker, expr) sample_audios.append( gr.Audio( value=path, label=f"{default_speaker.title()} — {expr.title()}", interactive=False, ) ) # Row 2: remaining 5 expressions with gr.Row(): for expr in TTS_EXPRESSIONS[6:]: path = _sample_path(default_lang, default_speaker, expr) sample_audios.append( gr.Audio( value=path, label=f"{default_speaker.title()} — {expr.title()}", interactive=False, ) ) # Wire language buttons to update sample audios + text for lang_name, btn in lang_btns.items(): updater = _make_sample_updater(lang_name) btn.click(updater, outputs=sample_audios + [sample_label, tts_text]) # ━━━━━━━━━━━━━━━━━━━ TRANSLATION TAB ━━━━━━━━━━━━━━━━━━━ trans_lang_names = sorted(translate_langs.keys()) if translate_langs else ["English", "Hindi"] trans_lang_state = gr.State(translate_langs) with gr.Tab("Translation"): gr.Markdown("### Text Translation") with gr.Row(): trans_src_lang = gr.Dropdown( choices=trans_lang_names, label="Source Language", value="English" if "English" in trans_lang_names else trans_lang_names[0], scale=1, ) trans_tgt_lang = gr.Dropdown( choices=trans_lang_names, label="Target Language", value="Hindi" if "Hindi" in trans_lang_names else trans_lang_names[-1], scale=1, ) with gr.Row(equal_height=True): trans_input = gr.Textbox( label="Source Text", lines=6, placeholder="Enter text to translate...", ) trans_output = gr.Textbox( label="Translated Text", lines=6, interactive=False, ) trans_info = gr.Textbox(label="Info", lines=1, interactive=False) trans_btn = gr.Button("Translate", variant="primary") trans_btn.click( translate_fn, inputs=[trans_src_lang, trans_tgt_lang, trans_input, trans_lang_state], outputs=[trans_output, trans_info], ) # -- Footer -- gr.HTML( '
' "Nirukti
" ) return demo # ── Main ───────────────────────────────────────────────────────────────────── print(f"Fetching ASR languages from {ASR_API_URL}/languages ...") try: families = fetch_languages(ASR_API_URL) total = sum(len(v) for v in families.values()) if total == 0: families = FALLBACK_LANGUAGE_FAMILIES print(f" Got {sum(len(v) for v in families.values())} languages across {len(families)} families") except Exception as e: print(f" Failed: {e}. Using fallback.") families = FALLBACK_LANGUAGE_FAMILIES print(f"Fetching TTS speakers from {TTS_API_URL}/speakers ...") tts_speakers = fetch_speakers() print(f" Got {len(tts_speakers)} speakers") print(f"Fetching translation languages from {TRANSLATE_API_URL} ...") translate_langs = fetch_translate_languages() print(f" Got {len(translate_langs)} languages") demo = build_app(ASR_API_URL, families, tts_speakers, translate_langs) theme = gr.themes.Soft( font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"], primary_hue="blue", ) css = """\ .family-btn { min-width: 90px !important; font-size: 0.82em !important; padding: 4px 8px !important; } .lang-btn { min-width: 80px !important; font-size: 0.82em !important; padding: 4px 8px !important; } .sample-label { text-align: center; font-size: 0.78em; color: #666; margin-top: 2px; } footer { display: none !important; } .header-block { background: linear-gradient(135deg, #0F2850 0%, #1A4080 100%); border-radius: 12px; padding: 24px 20px 18px; margin-bottom: 16px; text-align: center; } .header-block h1 { margin: 0 !important; font-size: 1.8em !important; color: #fff !important; font-weight: 700 !important; } .header-block p { margin: 4px 0 0 !important; font-size: 0.9em !important; color: #ccc !important; } .header-block .badge { display: inline-block; margin-top: 10px; padding: 3px 12px; background: rgba(255,255,255,0.12); border: 1px solid rgba(255,255,255,0.3); border-radius: 16px; font-size: 0.75em !important; color: #fff !important; } .generate-btn { background: linear-gradient(135deg, #0F2850, #1A4080) !important; border: none !important; color: #fff !important; font-weight: 600 !important; } """ demo.queue(default_concurrency_limit=4).launch(theme=theme, css=css)