import gradio as gr
import whisper
import torch

# Pick GPU when available; Whisper inference is much faster on CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)


def transcribe(audio):
    """Transcribe an audio file with Whisper and return the decoded text.

    Parameters
    ----------
    audio : str or None
        Filesystem path to the recording. Gradio passes None when the
        button is clicked without any audio uploaded or recorded.

    Returns
    -------
    str
        The transcription, or an instructional message when no audio
        was provided.
    """
    # Guard: without this, whisper.load_audio(None) fails deep inside
    # its ffmpeg subprocess call with an opaque error.
    if audio is None:
        return "No audio provided. Please upload or record a clip first."

    # Load the waveform and fit it to Whisper's fixed input window.
    # NOTE: pad_or_trim clamps to 30 seconds — anything past that is
    # silently dropped; model.transcribe() with its internal chunking
    # would be needed for longer recordings.
    waveform = whisper.load_audio(audio)
    waveform = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Detect the spoken language from the mel spectrogram.
    _, probs = model.detect_language(mel)
    language = max(probs, key=probs.get)
    print(f"Detected language: {language}")

    # fp16 decoding is only supported on CUDA; fall back to fp32 on CPU
    # to avoid the "FP16 is not supported on CPU" warning/error.
    options = whisper.DecodingOptions(fp16=torch.cuda.is_available())
    result = whisper.decode(model, mel, options)
    return result.text


# Build the Gradio UI: two tabs (upload vs. microphone) sharing the
# same transcribe handler.
with gr.Blocks() as demo:
    gr.Markdown("## Multilingual Speech-to-Text Transcription")

    with gr.Tab("Upload Audio"):
        # NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4+
        # renamed it to `sources=[...]` — confirm the pinned version.
        audio_file = gr.Audio(source="upload", type="filepath", label="Upload your audio file")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")

    with gr.Tab("Record Audio"):
        audio_record = gr.Audio(source="microphone", type="filepath", label="Record your audio")
        record_button = gr.Button("Transcribe")
        record_output = gr.Textbox(label="Transcription")

    # Wire each tab's button to the shared transcription function.
    transcribe_button.click(transcribe, inputs=audio_file, outputs=transcription_output)
    record_button.click(transcribe, inputs=audio_record, outputs=record_output)

if __name__ == "__main__":
    demo.launch()