Hugging Face Space — Omnilingual ASR demo (Gradio app).
| import gradio as gr | |
| from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline | |
| from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs | |
| import os | |
| import tempfile | |
| import spaces | |
# Initialize the ASR pipeline
# Loaded once at import time so every request shares a single in-memory
# model instance (300M-parameter LLM variant of Omnilingual ASR).
pipeline = ASRInferencePipeline(model_card="omniASR_LLM_300M")
def transcribe_audio(audio_file, language):
    """Transcribe a single audio file using Omnilingual ASR.

    Args:
        audio_file: Filesystem path to the audio clip, as produced by
            ``gr.Audio(type="filepath")``, or ``None`` if nothing was provided.
        language: Language code such as ``"eng_Latn"``, or ``None``/empty
            when no selection was made.

    Returns:
        str: The transcription text, or a human-readable message when input
        is missing or transcription fails.
    """
    # NOTE(review): the file imports `spaces`, which suggests a ZeroGPU
    # deployment; this handler may need an @spaces.GPU decorator — confirm
    # against the Space's hardware configuration.
    # Guard clauses return friendly prompts instead of raising, because the
    # return value is rendered directly in the UI textbox.
    if audio_file is None:
        return "Please upload an audio file."
    if not language:  # idiomatic truthiness check covers both None and ""
        return "Please select a language."
    try:
        # The pipeline API is batch-oriented; wrap the single input in lists.
        transcriptions = pipeline.transcribe(
            [audio_file], lang=[language], batch_size=1
        )
        return transcriptions[0] if transcriptions else "No transcription generated."
    except Exception as e:
        # Broad catch is intentional at this UI boundary: any failure is
        # surfaced to the user as text rather than crashing the app.
        return f"Error during transcription: {str(e)}"
# Display names for common ISO 639-3 language codes. Hoisted to module level
# so the tables are built once instead of on every loop iteration; codes not
# listed fall back to the upper-cased raw code.
_LANG_NAMES = {
    'eng': 'English',
    'cmn': 'Mandarin Chinese',
    'spa': 'Spanish',
    'fra': 'French',
    'deu': 'German',
    'jpn': 'Japanese',
    'kor': 'Korean',
    'ara': 'Arabic',
    'rus': 'Russian',
    'por': 'Portuguese',
    'hin': 'Hindi',
    'ben': 'Bengali',
    'ita': 'Italian',
    'tur': 'Turkish',
    'vie': 'Vietnamese',
    'tha': 'Thai',
    'pol': 'Polish',
    'nld': 'Dutch',
    'swe': 'Swedish',
    'ind': 'Indonesian',
}

# Display names for ISO 15924 script codes; unknown scripts fall back to the
# raw code.
_SCRIPT_NAMES = {
    'Latn': 'Latin',
    'Hans': 'Simplified',
    'Hant': 'Traditional',
    'Arab': 'Arabic',
    'Deva': 'Devanagari',
    'Cyrl': 'Cyrillic',
    'Jpan': 'Japanese',
    'Kore': 'Korean',
    'Thai': 'Thai',
}


def get_language_choices(langs=None):
    """Build (label, value) dropdown choices for language selection.

    Args:
        langs: Optional iterable of language codes to format. Defaults to
            the model's full ``supported_langs`` list, preserving the
            original no-argument behavior.

    Returns:
        list[tuple[str, str]]: Sorted ``(display_label, lang_code)`` pairs.
        Codes of the form ``"lng_Scpt"`` are expanded to
        ``"lng_Scpt - Language (Script)"``; any other shape is used verbatim.
    """
    if langs is None:
        langs = supported_langs
    choices = []
    for lang_code in sorted(langs):
        parts = lang_code.split('_')
        if len(parts) == 2:
            lang, script = parts
            lang_display = _LANG_NAMES.get(lang, lang.upper())
            script_display = _SCRIPT_NAMES.get(script, script)
            label = f"{lang_code} - {lang_display} ({script_display})"
        else:
            # Codes without exactly one underscore are shown as-is.
            label = lang_code
        choices.append((label, lang_code))
    return choices
# ---------------------------------------------------------------------------
# Gradio UI definition. The layout is declared inside a Blocks context:
# a two-column row (inputs on the left, transcription on the right),
# followed by the event wiring and an "about" footer.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(), title="Omnilingual ASR - 1600+ Languages") as demo:
    # Page header.
    gr.Markdown(
        """
        # ๐ Omnilingual ASR - Universal Speech Recognition
        Transcribe audio in 1600+ languages using state-of-the-art ASR technology.
        **Note:** Currently supports audio files up to 40 seconds in length.
        <p style="text-align: center;">
        <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
        </p>
        """
    )

    with gr.Row():
        # Left column: audio source, language picker, and the action button.
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Audio or Record",
                format="wav",
            )
            language_dropdown = gr.Dropdown(
                choices=get_language_choices(),
                label="Select Language",
                value="fas_Arab",
                filterable=True,
                info="Choose the language of your audio (1600+ languages supported)",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary", size="lg")
            gr.Markdown(
                """
                ### Supported Languages
                This model supports over 1600 languages across various scripts including:
                - Latin, Arabic, Devanagari, Cyrillic
                - Chinese (Simplified & Traditional)
                - Japanese, Korean, Thai, and many more
                ### Examples
                - **eng_Latn**: English (Latin script)
                - **cmn_Hans**: Mandarin Chinese (Simplified)
                - **spa_Latn**: Spanish (Latin script)
                - **ara_Arab**: Arabic (Arabic script)
                - **hin_Deva**: Hindi (Devanagari script)
                """
            )

        # Right column: result textbox and usage tips.
        with gr.Column(scale=1):
            transcription_output = gr.Textbox(
                label="Transcription",
                placeholder="Transcription will appear here...",
                lines=10,
                show_copy_button=True,
            )
            gr.Markdown(
                """
                ### Tips
                - Audio files should be shorter than 40 seconds
                - Supported formats: WAV, FLAC, MP3
                - Select the correct language for better accuracy
                - You can also record audio directly using your microphone
                """
            )

    # Event wiring: explicit button click (also exposed as an API endpoint) ...
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown],
        outputs=transcription_output,
        api_name="transcribe",
    )
    # ... and auto-transcription as soon as a file is uploaded.
    audio_input.upload(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown],
        outputs=transcription_output,
    )

    # Footer.
    gr.Markdown(
        """
        ---
        ### About Omnilingual ASR
        Omnilingual ASR is a cutting-edge automatic speech recognition system capable of
        transcribing audio in over 1600 languages. Built with fairseq2, it provides
        state-of-the-art accuracy across diverse languages and scripts.
        **Model:** omniASR_LLM_300M
        **Framework:** fairseq2
        """
    )
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    demo.launch()