Update app.py
Browse files
app.py
CHANGED
|
@@ -8,64 +8,92 @@ import re
|
|
| 8 |
|
| 9 |
DESCRIPTION = "IndicConformer-600M Multilingual ASR (CTC + RNNT) with Auto Language ID"
|
| 10 |
|
| 11 |
-
# ---
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
"
|
| 16 |
-
"
|
| 17 |
-
"
|
| 18 |
-
"
|
| 19 |
-
"
|
| 20 |
-
"
|
| 21 |
-
"
|
| 22 |
-
"
|
| 23 |
-
"
|
| 24 |
-
"
|
| 25 |
-
"
|
| 26 |
-
"
|
| 27 |
-
"
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
|
|
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
LANGUAGE_CODE_TO_NAME = { "as": "Assamese", "bn": "Bengali", "br": "Bodo", "doi": "Dogri", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada", "ks": "Kashmiri", "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri", "mr": "Marathi", "ne": "Nepali", "or": "Odia", "pa": "Punjabi", "sa": "Sanskrit", "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"}
|
| 39 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 40 |
|
| 41 |
-
#
|
| 42 |
print("Loading IndicConformer model...")
|
| 43 |
model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True).to(device)
|
| 44 |
model.eval()
|
| 45 |
print("✅ Model loaded successfully.")
|
| 46 |
|
|
|
|
| 47 |
def identify_language(text: str) -> str | None:
|
| 48 |
"""Identifies the language of a given text based on character sets and common words."""
|
| 49 |
if not text.strip():
|
| 50 |
return None
|
| 51 |
|
| 52 |
-
scores = {lang: 0 for lang in
|
| 53 |
text_chars = set(text)
|
| 54 |
text_words = set(re.split(r'[\s,.:;!?]+', text))
|
| 55 |
|
| 56 |
-
for lang_code
|
| 57 |
-
char_score = len(text_chars.intersection(
|
| 58 |
-
word_score = len(text_words.intersection(
|
| 59 |
scores[lang_code] = (char_score * 2) + word_score
|
| 60 |
|
| 61 |
max_score = max(scores.values())
|
| 62 |
-
if max_score < 3:
|
| 63 |
return None
|
| 64 |
-
|
| 65 |
identified_code = max(scores, key=scores.get)
|
| 66 |
return identified_code
|
| 67 |
|
| 68 |
-
|
| 69 |
@spaces.GPU
|
| 70 |
def transcribe_and_identify(audio_path):
|
| 71 |
if not audio_path:
|
|
@@ -79,11 +107,9 @@ def transcribe_and_identify(audio_path):
|
|
| 79 |
return f"Error loading audio: {e}", "", ""
|
| 80 |
|
| 81 |
try:
|
| 82 |
-
# 1. Perform a fast, initial transcription using a pivot language (Hindi)
|
| 83 |
with torch.no_grad():
|
| 84 |
initial_transcription = model(waveform, "hi", "ctc")
|
| 85 |
-
|
| 86 |
-
# 2. Identify the language from the initial transcription
|
| 87 |
identified_lang_code = identify_language(initial_transcription)
|
| 88 |
|
| 89 |
if not identified_lang_code:
|
|
@@ -92,7 +118,6 @@ def transcribe_and_identify(audio_path):
|
|
| 92 |
|
| 93 |
detected_lang_str = f"Detected Language: {LANGUAGE_CODE_TO_NAME.get(identified_lang_code, 'Unknown')}"
|
| 94 |
|
| 95 |
-
# 3. Perform the final, high-quality transcription using the identified language
|
| 96 |
with torch.no_grad():
|
| 97 |
transcription_ctc = model(waveform, identified_lang_code, "ctc")
|
| 98 |
transcription_rnnt = model(waveform, identified_lang_code, "rnnt")
|
|
@@ -102,17 +127,16 @@ def transcribe_and_identify(audio_path):
|
|
| 102 |
|
| 103 |
return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
|
| 104 |
|
| 105 |
-
|
| 106 |
-
# Gradio UI
|
| 107 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 108 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 109 |
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription using both CTC and RNNT decoding.")
|
| 110 |
-
|
| 111 |
with gr.Row():
|
| 112 |
with gr.Column(scale=1):
|
| 113 |
audio = gr.Audio(label="Upload or Record Audio", type="filepath")
|
| 114 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 115 |
-
|
| 116 |
with gr.Column(scale=2):
|
| 117 |
detected_lang_output = gr.Label(label="Language Detection Result")
|
| 118 |
gr.Markdown("### RNNT Transcription (More Accurate)")
|
|
@@ -121,8 +145,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 121 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 122 |
|
| 123 |
transcribe_btn.click(
|
| 124 |
-
fn=transcribe_and_identify,
|
| 125 |
-
inputs=[audio],
|
| 126 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 127 |
api_name="transcribe"
|
| 128 |
)
|
|
|
|
| 8 |
|
# Title text rendered at the top of the Gradio UI.
DESCRIPTION = "IndicConformer-600M Multilingual ASR (CTC + RNNT) with Auto Language ID"
| 10 |
|
| 11 |
# --- Data Dictionaries ---

# Per-language character sets used for script-based language scoring.
# Each value is a set of single code points compared against set(text).
# Fixes applied in review:
#   - "doi": Arabic 'ج' replaced by Devanagari 'ज' (it sat between 'छ' and 'झ').
#   - "br":  Bengali vowel sign 'ূ' replaced by Devanagari 'ू'.
#   - "mni": removed empty-string entries (set(text) never contains '').
#   - "sat": removed wrong-script chars (Telugu 'ఇ', Gurmukhi 'ੂ', Odia 'େ'/'ୈ')
#            that caused false Santali matches on other scripts; deduped 'ᱚ'.
# NOTE(review): a few entries are multi-code-point conjuncts ('କ୍ଷ', 'క్ష', 'جھ',
# 'ਸ਼' variants); they can never match a single character from set(text) and are
# effectively dead — kept as-is pending confirmation of intent.
LANGUAGE_CHARSETS = {
    "as": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'ৰ', 'ল', 'ৱ', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্']),
    "bn": set(['অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'ৎ', 'ং', 'ঃ', 'ঽ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ড়', 'ঢ়', 'য়']),
    "br": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "doi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्']),
    "gu": set(['અ', 'આ', 'ઇ', 'ઈ', 'ઉ', 'ઊ', 'ઋ', 'એ', 'ઐ', 'ઓ', 'ઔ', 'ક', 'ખ', 'ગ', 'ઘ', 'ઙ', 'ચ', 'છ', 'જ', 'ઝ', 'ઞ', 'ટ', 'ઠ', 'ડ', 'ઢ', 'ણ', 'ત', 'થ', 'દ', 'ધ', 'ન', 'પ', 'ફ', 'બ', 'ભ', 'મ', 'ય', 'ર', 'લ', 'ળ', 'વ', 'શ', 'ષ', 'સ', 'હ', '઼', 'ા', 'િ', 'ી', 'ુ', 'ૂ', 'ૃ', 'ે', 'ૈ', 'ો', 'ૌ', '્']),
    "hi": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "kn": set(['ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ಎ', 'ಏ', 'ಐ', 'ಒ', 'ಓ', 'ಔ', 'ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ', 'ಯ', 'ರ', 'ಲ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 'ಳ', 'ಱ', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', '್']),
    "ks": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
    "kok": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "mai": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "ml": set(['അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ', 'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ', 'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന', 'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ', 'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ', 'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', '്']),
    "mni": set(['ꯑ', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', 'ꯀ', 'ꯂ', 'ꯃ', 'ꯄ', 'ꯅ', 'ꯆ', 'ꯇ', 'ꯈ', 'ꯉ', 'ꯊ', 'ꯋ', 'ꯌ', 'ꯍ', 'ꯎ', 'ꯏ', 'ꯐ']),
    "mr": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्', 'ळ']),
    "ne": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "or": set(['ଅ', 'ଆ', 'ଇ', 'ଈ', 'ଉ', 'ଊ', 'ଋ', 'ଏ', 'ଐ', 'ଓ', 'ଔ', 'କ', 'ଖ', 'ଗ', 'ଘ', 'ଙ', 'ଚ', 'ଛ', 'ଜ', 'ଝ', 'ଞ', 'ଟ', 'ଠ', 'ଡ', 'ଢ', 'ଣ', 'ତ', 'ଥ', 'ଦ', 'ଧ', 'ନ', 'ପ', 'ଫ', 'ବ', 'ଭ', 'ମ', 'ଯ', 'ର', 'ଲ', 'ଳ', 'ବ', 'ଶ', 'ଷ', 'ସ', 'ହ', 'କ୍ଷ', 'ା', 'ି', 'ୀ', 'ୁ', 'ୂ', 'ୃ', 'େ', 'ୈ', 'ୋ', 'ୌ', '୍']),
    "pa": set(['ਅ', 'ਆ', 'ਇ', 'ਈ', 'ਉ', 'ਊ', 'ਏ', 'ਐ', 'ਓ', 'ਔ', 'ਕ', 'ਖ', 'ਗ', 'ਘ', 'ਙ', 'ਚ', 'ਛ', 'ਜ', 'ਝ', 'ਞ', 'ਟ', 'ਠ', 'ਡ', 'ਢ', 'ਣ', 'ਤ', 'ਥ', 'ਦ', 'ਧ', 'ਨ', 'ਪ', 'ਫ', 'ਬ', 'ਭ', 'ਮ', 'ਯ', 'ਰ', 'ਲ', 'ਵ', 'ਸ਼', 'ਸ', 'ਹ', 'ਖ਼', 'ਗ਼', 'ਜ਼', 'ੜ', 'ਫ਼', 'ਲ਼', 'ਿ', 'ੀ', 'ੁ', 'ੂ', 'ੇ', 'ੈ', 'ੋ', 'ੌ', '੍']),
    "sa": set(['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ो', 'ौ', '्']),
    "sat": set(['ᱚ', 'ᱛ', 'ᱜ', 'ᱝ', 'ᱞ', 'ᱟ', 'ᱠ', 'ᱥ', 'ᱮ', 'ᱫ', 'ᱤ', 'ᱩ', 'ᱪ', 'ᱡ', 'ᱭ']),
    "sd": set(['ا', 'آ', 'ب', 'ڀ', 'ت', 'ٽ', 'ث', 'پ', 'ج', 'ڄ', 'جھ', 'چ', 'ح', 'خ', 'ڌ', 'د', 'ڏ', 'ڊ', 'ذ', 'ر', 'ز', 'ڙ', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ڦ', 'ق', 'ڪ', 'ک', 'گ', 'ڳ', 'ڱ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ي']),
    "ta": set(['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ', 'ஓ', 'ஔ', 'க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'த', 'ந', 'ன', 'ப', 'ம', 'ய', 'ர', 'ல', 'வ', 'ழ', 'ள', 'ஷ', 'ஸ', 'ஹ']),
    "te": set(['అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ', 'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న', 'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ', 'శ', 'ష', 'స', 'హ', 'ళ', 'క్ష', 'ఱ', 'ా', 'ి', 'ీ', 'ు', 'ూ', 'ృ', 'ె', 'ే', 'ై', 'ొ', 'ో', 'ౌ', '్']),
    "ur": set(['ا', 'آ', 'ب', 'پ', 'ت', 'ٹ', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ھ', 'ء', 'ی', 'ی', 'ے']),
}
|
| 38 |
|
| 39 |
# High-frequency function words per language, matched against whitespace/
# punctuation-split tokens of the transcript (see identify_language).
# Fixes applied in review:
#   - "ks": removed the leading space from " تہٕ" — tokens produced by
#     re.split(r'[\s,.:;!?]+', text) never contain spaces, so it could not match.
#   - "sat": removed wrong-script entries (Kannada "ಒಂದು"/"ಮತ್ತು", Bengali "ক")
#     that would score Santali on Kannada/Bengali text.
# NOTE(review): "ks" also lists Devanagari "छु"/"छ" alongside Perso-Arabic —
# presumably intentional (Kashmiri is written in both scripts); confirm.
LANGUAGE_COMMON_WORDS = {
    "as": set(["আৰু", "হয়", "এটা", "কৰা", "ওপৰত", "যে"]),
    "bn": set(["এবং", "একটি", "করুন", "জন্য", "সঙ্গে", "হচ্ছে"]),
    "br": set(["आरो", "एसे", "मोनसे", "माव", "दं", "जा"]),
    "doi": set(["ते", "दे", "ऐ", "इक", "ओह्", "कर"]),
    "gu": set(["અને", "એક", "માટે", "છે", "સાથે", "કરવું"]),
    "hi": set(["और", "है", "एक", "में", "के", "लिए"]),
    "kn": set(["ಮತ್ತು", "ಒಂದು", "ಹೇಗೆ", "ನಾನು", "ಇದೆ", "ಆ"]),
    "ks": set(["تہٕ", "چھُ", "اکھ", "منز", "کیتھ", "छु", "छ"]),
    "kok": set(["आनी", "एक", "कर", "खातीर", "कडेन", "आसा"]),
    "mai": set(["आ", "एक", "हम", "अछि", "क'", "छै"]),
    "ml": set(["ഒരു", "കൂടാതെ", "എങ്ങനെ", "ഞാൻ", "ഇത്", "ആണ്"]),
    "mni": set(["ꯗꯥ", "ꯑꯃꯥ", "ꯀꯔꯤ", "ꯑꯩꯅꯥ", "ꯑꯁꯤ", "ꯂꯩ"]),
    "mr": set(["आणि", "एक", "आहे", "मी", "तू", "जे"]),
    "ne": set(["र", "एक", "हो", "म", "तिमी", "छ"]),
    "or": set(["ଏବଂ", "ଗୋଟିଏ", "କରନ୍ତୁ", "ପାଇଁ", "ସହିତ", "ଅଛି"]),
    "pa": set(["ਅਤੇ", "ਇੱਕ", "ਹੈ", "ਵਿੱਚ", "ਨੂੰ", "ਦਾ"]),
    "sa": set(["च", "एकः", "अस्ति", "अहम्", "त्वम्", "सः"]),
    "sat": set(["ᱟᱨ", "ᱫᱚ", "ᱢᱤᱫ"]),
    "sd": set(["۽", "هڪ", "آهي", "۾", "کي", "جو"]),
    "ta": set(["மற்றும்", "ஒரு", "வேண்டும்", "நான்", "இது", "ஆகும்"]),
    "te": set(["మరియు", "ఒక", "కావాలి", "నేను", "ఇది", "ఉంది"]),
    "ur": set(["اور", "ہے", "ایک", "میں", "کے", "لیے"]),
}
|
| 64 |
+
|
| 65 |
# Human-readable display names for the 22 supported language codes.
LANGUAGE_CODE_TO_NAME = {
    "as": "Assamese",
    "bn": "Bengali",
    "br": "Bodo",
    "doi": "Dogri",
    "gu": "Gujarati",
    "hi": "Hindi",
    "kn": "Kannada",
    "ks": "Kashmiri",
    "kok": "Konkani",
    "mai": "Maithili",
    "ml": "Malayalam",
    "mni": "Manipuri",
    "mr": "Marathi",
    "ne": "Nepali",
    "or": "Odia",
    "pa": "Punjabi",
    "sa": "Sanskrit",
    "sat": "Santali",
    "sd": "Sindhi",
    "ta": "Tamil",
    "te": "Telugu",
    "ur": "Urdu",
}

# Run on GPU when one is visible to torch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
|
| 69 |
# --- Model Loading ---
# Download/load the pinned 600M multilingual IndicConformer checkpoint from the
# Hugging Face Hub and move it to the selected device.
# NOTE(review): trust_remote_code=True executes model code shipped with the Hub
# repo; acceptable only because the repo id is pinned to ai4bharat's model.
print("Loading IndicConformer model...")
model = AutoModel.from_pretrained("ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True).to(device)
# Inference only: switch off dropout / batch-norm updates.
model.eval()
print("✅ Model loaded successfully.")
|
| 74 |
|
| 75 |
# --- Core Logic ---
def identify_language(text: str) -> str | None:
    """Guess the language code of *text* from script characters and stop words.

    Each candidate language earns 2 points per distinct character of *text*
    found in its charset, plus 1 point per token found among its common words.
    Returns the highest-scoring code, or None for blank text or when the best
    score is below the confidence threshold of 3.
    """
    if not text.strip():
        return None

    chars_seen = set(text)
    tokens_seen = set(re.split(r'[\s,.:;!?]+', text))

    # Single pass over candidates; strict '>' keeps the first maximum,
    # matching max() tie-breaking over dict insertion order.
    best_code: str | None = None
    best_score = -1
    for code in LANGUAGE_CHARSETS:
        char_hits = len(chars_seen & LANGUAGE_CHARSETS.get(code, set()))
        word_hits = len(tokens_seen & LANGUAGE_COMMON_WORDS.get(code, set()))
        score = 2 * char_hits + word_hits
        if score > best_score:
            best_code, best_score = code, score

    # Too few matches to trust the guess.
    if best_score < 3:
        return None
    return best_code
|
| 96 |
|
|
|
|
| 97 |
@spaces.GPU
|
| 98 |
def transcribe_and_identify(audio_path):
|
| 99 |
if not audio_path:
|
|
|
|
| 107 |
return f"Error loading audio: {e}", "", ""
|
| 108 |
|
| 109 |
try:
|
|
|
|
| 110 |
with torch.no_grad():
|
| 111 |
initial_transcription = model(waveform, "hi", "ctc")
|
| 112 |
+
|
|
|
|
| 113 |
identified_lang_code = identify_language(initial_transcription)
|
| 114 |
|
| 115 |
if not identified_lang_code:
|
|
|
|
| 118 |
|
| 119 |
detected_lang_str = f"Detected Language: {LANGUAGE_CODE_TO_NAME.get(identified_lang_code, 'Unknown')}"
|
| 120 |
|
|
|
|
| 121 |
with torch.no_grad():
|
| 122 |
transcription_ctc = model(waveform, identified_lang_code, "ctc")
|
| 123 |
transcription_rnnt = model(waveform, identified_lang_code, "rnnt")
|
|
|
|
| 127 |
|
| 128 |
return detected_lang_str, transcription_ctc.strip(), transcription_rnnt.strip()
|
| 129 |
|
| 130 |
+
# --- Gradio UI ---
|
|
|
|
| 131 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 132 |
gr.Markdown(f"## {DESCRIPTION}")
|
| 133 |
gr.Markdown("Upload or record audio in any of the 22 supported Indian languages. The app will automatically detect the language and provide the transcription using both CTC and RNNT decoding.")
|
| 134 |
+
|
| 135 |
with gr.Row():
|
| 136 |
with gr.Column(scale=1):
|
| 137 |
audio = gr.Audio(label="Upload or Record Audio", type="filepath")
|
| 138 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
| 139 |
+
|
| 140 |
with gr.Column(scale=2):
|
| 141 |
detected_lang_output = gr.Label(label="Language Detection Result")
|
| 142 |
gr.Markdown("### RNNT Transcription (More Accurate)")
|
|
|
|
| 145 |
ctc_output = gr.Textbox(lines=3, label="CTC Output")
|
| 146 |
|
| 147 |
transcribe_btn.click(
|
| 148 |
+
fn=transcribe_and_identify,
|
| 149 |
+
inputs=[audio],
|
| 150 |
outputs=[detected_lang_output, ctc_output, rnnt_output],
|
| 151 |
api_name="transcribe"
|
| 152 |
)
|