# Persian-FB-ASR / app.py
# imansarraf's picture
# Update app.py
# 2b91596 verified
import gradio as gr
from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
import os
import tempfile
import spaces
# Model card name handed to the ASR inference pipeline.
_MODEL_CARD = "omniASR_LLM_300M"

# Build the pipeline once at import time; transcribe_audio reuses this
# module-level instance for every request.
pipeline = ASRInferencePipeline(model_card=_MODEL_CARD)
@spaces.GPU
def transcribe_audio(audio_file, language):
    """Run Omnilingual ASR on a single audio file and return its text.

    Args:
        audio_file: Path to the audio file, as delivered by ``gr.Audio``
            (``None`` when no audio was provided).
        language: Language code to transcribe in (e.g., "eng_Latn").

    Returns:
        str: The transcription, or a human-readable message when an input
        is missing, nothing was transcribed, or the pipeline raised.
    """
    # Reject incomplete requests before touching the pipeline.
    if audio_file is None:
        return "Please upload an audio file."
    if language in (None, ""):
        return "Please select a language."

    try:
        # The pipeline takes lists, so wrap the single request accordingly.
        results = pipeline.transcribe(
            [audio_file],
            lang=[language],
            batch_size=1,
        )
        if results:
            return results[0]
        return "No transcription generated."
    except Exception as exc:
        # Report the failure in the output box instead of raising to the UI.
        return f"Error during transcription: {exc!s}"
# Readable names for common language codes; codes not listed here are shown
# as the upper-cased code.
_LANG_NAMES = {
    'eng': 'English',
    'cmn': 'Mandarin Chinese',
    'spa': 'Spanish',
    'fra': 'French',
    'deu': 'German',
    'jpn': 'Japanese',
    'kor': 'Korean',
    'ara': 'Arabic',
    'rus': 'Russian',
    'por': 'Portuguese',
    'hin': 'Hindi',
    'ben': 'Bengali',
    'ita': 'Italian',
    'tur': 'Turkish',
    'vie': 'Vietnamese',
    'tha': 'Thai',
    'pol': 'Polish',
    'nld': 'Dutch',
    'swe': 'Swedish',
    'ind': 'Indonesian',
}

# Readable names for common script codes; codes not listed here are shown
# unchanged.
_SCRIPT_NAMES = {
    'Latn': 'Latin',
    'Hans': 'Simplified',
    'Hant': 'Traditional',
    'Arab': 'Arabic',
    'Deva': 'Devanagari',
    'Cyrl': 'Cyrillic',
    'Jpan': 'Japanese',
    'Kore': 'Korean',
    'Thai': 'Thai',
}


def _language_label(lang_code):
    """Return the dropdown label for one ``<lang>_<script>`` code."""
    parts = lang_code.split('_')
    if len(parts) != 2:
        # Not in "<lang>_<script>" form: show the raw code as its own label.
        return lang_code
    lang, script = parts
    lang_display = _LANG_NAMES.get(lang, lang.upper())
    script_display = _SCRIPT_NAMES.get(script, script)
    return f"{lang_code} - {lang_display} ({script_display})"


def get_language_choices(lang_codes=None):
    """Get formatted language choices for the language dropdown.

    Labels look like "eng_Latn - English (Latin)". The name tables are
    module-level constants so they are built once rather than on every
    loop iteration.

    Args:
        lang_codes: Optional iterable of language codes to format. Defaults
            to the imported ``supported_langs`` when omitted.

    Returns:
        list[tuple[str, str]]: ``(label, lang_code)`` pairs, sorted by
        language code.
    """
    if lang_codes is None:
        lang_codes = supported_langs
    return [(_language_label(code), code) for code in sorted(lang_codes)]
# Markdown shown at the top of the page. The heading emoji was mojibake in
# the original string and has been restored to a globe emoji.
_HEADER_MD = """
# 🌍 Omnilingual ASR - Universal Speech Recognition
Transcribe audio in 1600+ languages using state-of-the-art ASR technology.
**Note:** Currently supports audio files up to 40 seconds in length.
<p style="text-align: center;">
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
</p>
"""

# Markdown listing example scripts and language codes.
_LANGUAGES_MD = """
### Supported Languages
This model supports over 1600 languages across various scripts including:
- Latin, Arabic, Devanagari, Cyrillic
- Chinese (Simplified & Traditional)
- Japanese, Korean, Thai, and many more
### Examples
- **eng_Latn**: English (Latin script)
- **cmn_Hans**: Mandarin Chinese (Simplified)
- **spa_Latn**: Spanish (Latin script)
- **ara_Arab**: Arabic (Arabic script)
- **hin_Deva**: Hindi (Devanagari script)
"""

# Markdown with usage tips.
_TIPS_MD = """
### Tips
- Audio files should be shorter than 40 seconds
- Supported formats: WAV, FLAC, MP3
- Select the correct language for better accuracy
- You can also record audio directly using your microphone
"""

# Markdown footer describing the model.
_ABOUT_MD = """
---
### About Omnilingual ASR
Omnilingual ASR is a cutting-edge automatic speech recognition system capable of
transcribing audio in over 1600 languages. Built with fairseq2, it provides
state-of-the-art accuracy across diverse languages and scripts.
**Model:** omniASR_LLM_300M
**Framework:** fairseq2
"""

# Create the Gradio interface.
# NOTE(review): the nesting below was reconstructed from statement order
# (the source had lost its indentation) - confirm against the intended layout.
with gr.Blocks(theme=gr.themes.Soft(), title="Omnilingual ASR - 1600+ Languages") as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Row():
        # First column: audio input, language selection and the trigger button.
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Audio or Record",
                format="wav",
            )
            language_dropdown = gr.Dropdown(
                choices=get_language_choices(),
                label="Select Language",
                value="fas_Arab",
                filterable=True,
                info="Choose the language of your audio (1600+ languages supported)",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary", size="lg")
            gr.Markdown(_LANGUAGES_MD)

        # Second column: transcription result and usage tips.
        with gr.Column(scale=1):
            transcription_output = gr.Textbox(
                label="Transcription",
                placeholder="Transcription will appear here...",
                lines=10,
                show_copy_button=True,
            )
            gr.Markdown(_TIPS_MD)

    # Set up the transcription event (also registered under the API name
    # "transcribe").
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown],
        outputs=transcription_output,
        api_name="transcribe",
    )

    # Also transcribe as soon as an audio file is uploaded.
    audio_input.upload(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown],
        outputs=transcription_output,
    )

    gr.Markdown(_ABOUT_MD)
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()