import os
import warnings

import gradio as gr
import torch
from faster_whisper import WhisperModel

# Suppress the Hugging Face Hub unauthenticated-requests warning.
warnings.filterwarnings("ignore", message="You are sending unauthenticated requests")

# --- Configuration ---
# You can change the model size here (tiny, base, small, medium, large-v2, large-v3)
# The user specifically requested "tiny" (guillaumekln/faster-whisper-tiny equivalent)
MODEL_SIZE = "tiny"

# Prefer GPU with float16; on CPU use int8 quantization for speed/memory.
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

# Lazily-initialized singleton model instance (loaded on first transcription).
model = None


def get_model():
    """Return the shared WhisperModel, loading it on first call.

    Falls back to CPU with int8 if loading on the preferred device fails
    (e.g. CUDA out of memory or missing CUDA libraries).
    """
    global model
    if model is None:
        print(f"Initializing Faster Whisper Model: {MODEL_SIZE}")
        print(f"Device: {device}, Compute Type: {compute_type}")
        try:
            model = WhisperModel(MODEL_SIZE, device=device, compute_type=compute_type)
        except Exception as e:
            print(f"Error loading model: {e}")
            print("Attempting to load on CPU with int8...")
            model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
    return model


# --- Language Options ---
# A selection of common languages supported by Whisper.
# Format is "Name (code)"; the code in parentheses is passed to the model.
LANGUAGES = [
    "Auto-Detect",
    "Bengali (bn)", "English (en)", "Hindi (hi)", "Chinese (zh)",
    "Spanish (es)", "French (fr)", "German (de)", "Japanese (ja)",
    "Russian (ru)", "Portuguese (pt)", "Arabic (ar)", "Urdu (ur)",
    "Italian (it)", "Korean (ko)", "Turkish (tr)", "Polish (pl)",
    "Dutch (nl)", "Thai (th)", "Vietnamese (vi)", "Indonesian (id)",
]


def format_timestamp(seconds):
    """Formats seconds into MM:SS.ms"""
    minutes = int(seconds // 60)
    secs = seconds % 60
    return f"{minutes:02d}:{secs:05.2f}"


def transcribe_audio(audio_path, language, beam_size, vad_filter):
    """Transcribe ``audio_path`` with faster-whisper, streaming results.

    Args:
        audio_path: Filesystem path to the audio file (or None/empty).
        language: Entry from LANGUAGES, e.g. "Bengali (bn)" or "Auto-Detect".
        beam_size: Decoder beam size (higher = slower, possibly more accurate).
        vad_filter: Whether to filter silence via voice-activity detection.

    Yields:
        (transcript_so_far, status_message) tuples as segments are decoded,
        giving the UI a real-time streaming effect.
    """
    if not audio_path:
        yield "Please upload or record an audio file first.", "Waiting..."
        return

    # Extract the language code, e.g. 'bn' from 'Bengali (bn)'.
    # "Auto-Detect" (or an empty value) means let the model detect it.
    lang_code = None
    if language and language != "Auto-Detect":
        lang_code = language.split("(")[-1].strip(")")

    print(f"Transcribing {audio_path} with language={lang_code}, beam_size={beam_size}, vad={vad_filter}")

    try:
        # Get the model (loads only once).
        whisper_model = get_model()

        segments, info = whisper_model.transcribe(
            audio_path,
            language=lang_code,
            beam_size=int(beam_size),
            vad_filter=vad_filter,
        )

        detected_lang_info = f"Detected Language: {info.language} (Prob: {info.language_probability:.2f})"

        # Accumulate lines in a list and join per yield — avoids the
        # quadratic cost of repeated string concatenation.
        lines = []

        # `segments` is a generator; iterating drives the actual decoding.
        for segment in segments:
            start_fmt = format_timestamp(segment.start)
            end_fmt = format_timestamp(segment.end)

            # Format: [00:00.00 -> 00:05.00] Text
            lines.append(f"[{start_fmt} -> {end_fmt}] {segment.text}")

            # Yield the updated transcript and a progress status.
            yield "\n".join(lines) + "\n", f"{detected_lang_info} | Last segment ended at {end_fmt}"

        yield "\n".join(lines) + "\n", f"{detected_lang_info} | Completed"

    except Exception as e:
        yield f"Error during transcription: {str(e)}", "Error"


# --- Gradio UI ---
theme = gr.themes.Soft(primary_hue="blue", neutral_hue="slate")

# Bug fix: the theme was previously created but never passed to Blocks.
with gr.Blocks(title="Faster Whisper Tiny Demo", theme=theme) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                """
                # đŸŽ™ī¸ Faster Whisper Tiny STT Demo
                ### Bengali & Multilingual Support | āĻŦāĻžāĻ‚āϞāĻž āĻāĻŦāĻ‚ āĻŦāĻšā§āĻ­āĻžāώāĻŋāĻ• āϏāĻŽāĻ°ā§āĻĨāύ

                This Space uses the `faster-whisper` library with the **'tiny'** model
                for fast and efficient speech-to-text transcription.
                Run entirely on CPU/GPU seamlessly.
                """
            )

    with gr.Row():
        with gr.Column(scale=1):
            # Audio input: allow both file upload and microphone recording.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input (Audio File or Microphone) | āĻ…āĻĄāĻŋāĻ“ āχāύāĻĒ⧁āϟ"
            )

            with gr.Accordion("Advanced Settings | āωāĻ¨ā§āύāϤ āϏ⧇āϟāĻŋāĻ‚āϏ", open=True):
                language_dropdown = gr.Dropdown(
                    choices=LANGUAGES,
                    value="Auto-Detect",
                    label="Language | āĻ­āĻžāώāĻž",
                    info="Select 'Auto-Detect' or specify a language."
                )
                beam_size_slider = gr.Slider(
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=5,
                    label="Beam Size",
                    info="Higher values search more paths (slower but potentially more accurate)."
                )
                vad_filter_checkbox = gr.Checkbox(
                    value=True,
                    label="VAD Filter",
                    info="Filter out silence using Voice Activity Detection."
                )

            transcribe_btn = gr.Button("Transcribe Audio | āĻĒā§āϰāϤāĻŋāϞāĻŋāĻĒāĻŋ āĻ•āϰ⧁āύ", variant="primary", size="lg")

        with gr.Column(scale=1):
            status_output = gr.Textbox(label="Status | āĻ…āĻŦāĻ¸ā§āĻĨāĻž", interactive=False)
            transcript_output = gr.Textbox(
                label="Transcription Output | āĻĒā§āϰāϤāĻŋāϞāĻŋāĻĒāĻŋ āĻĢāϞāĻžāĻĢāϞ",
                lines=20,
                max_lines=30,
                placeholder="Transcription will appear here..."
            )

    # Event Handlers: generator function streams (transcript, status) updates.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=[audio_input, language_dropdown, beam_size_slider, vad_filter_checkbox],
        outputs=[transcript_output, status_output]
    )

    gr.Markdown(
        """
        ---
        **Note:** The model downloads automatically on the first run.
        Powered by [faster-whisper](https://github.com/guillaumekln/faster-whisper) and Hugging Face Spaces.
        """
    )

if __name__ == "__main__":
    demo.launch()