"""Gradio app that transcribes an uploaded audio file with OpenAI's Whisper model."""

import contextlib
import os
import tempfile

import gradio as gr
import torch
from pydub import AudioSegment
from transformers import pipeline

# Upload size limit, checked before any decoding work is done.
MAX_FILE_SIZE_MB = 100
# Recordings longer than this are split into pieces of this length (30 seconds).
CHUNK_LENGTH_MS = 30_000

# Initialize the Whisper model once at import time; use the GPU when available.
try:
    whisper = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
except Exception as e:
    # Chain the cause so the original traceback is not lost.
    raise RuntimeError(f"Failed to load Whisper model: {e}") from e


def _transcribe_in_chunks(audio_segment, chunk_length_ms=CHUNK_LENGTH_MS):
    """Transcribe *audio_segment* piece by piece and join the texts with spaces.

    Each piece is exported as WAV into a private temporary directory, so
    concurrent calls never share chunk files and nothing is left behind on
    disk if exporting or transcription raises.
    """
    transcriptions = []
    with tempfile.TemporaryDirectory(prefix="whisper_chunks_") as workdir:
        # One path is reused: each chunk is fully transcribed before the next
        # export overwrites it, which keeps peak disk usage to a single chunk.
        chunk_path = os.path.join(workdir, "chunk.wav")
        for start_ms in range(0, len(audio_segment), chunk_length_ms):
            # Slice lazily instead of materializing every chunk up front.
            chunk = audio_segment[start_ms:start_ms + chunk_length_ms]
            chunk.export(chunk_path, format="wav")
            # No language is passed, leaving detection to the model.
            result = whisper(chunk_path, generate_kwargs={"task": "transcribe"})
            transcriptions.append(result["text"])
    return " ".join(transcriptions)


def transcribe_audio(audio):
    """Transcribe the audio file at path *audio* and return the text.

    Args:
        audio: Filesystem path of the uploaded audio file, or ``None`` when
            nothing was uploaded.

    Returns:
        The transcription, or a string starting with ``"Error"`` describing
        why the file could not be transcribed. Failures during decoding or
        transcription are reported as text rather than raised.

    Side effects:
        Once the size check has passed, the uploaded file is deleted
        (best effort) whether or not transcription succeeded.
    """
    if audio is None:
        return "Error: Please upload an audio file."

    # Validate file size before spending time decoding the audio.
    try:
        file_size_mb = os.path.getsize(audio) / (1024 * 1024)
    except FileNotFoundError:
        return "Error: Audio file not found."
    if file_size_mb > MAX_FILE_SIZE_MB:
        return f"Error: Audio file exceeds {MAX_FILE_SIZE_MB} MB limit."

    try:
        audio_segment = AudioSegment.from_file(audio)
        # len() of an AudioSegment is its duration in milliseconds.
        if len(audio_segment) > CHUNK_LENGTH_MS:
            return _transcribe_in_chunks(audio_segment)
        # Short clips go to the model directly; no language is passed,
        # leaving detection to the model.
        result = whisper(audio, generate_kwargs={"task": "transcribe"})
        return result["text"]
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing.
        return f"Error during transcription: {str(e)}"
    finally:
        # Clean up the uploaded file; a missing or undeletable file is not
        # worth failing the request over.
        with contextlib.suppress(OSError):
            os.remove(audio)


# Create Gradio interface
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(type="filepath", label="Upload an Audio File (MP3, WAV, max 100 MB)")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Audio to Text Transcription with Whisper",
    description="Upload an audio file (MP3/WAV, up to 100 MB) to transcribe it using Open AI's Whisper model with automatic language detection.",
    allow_flagging="never",
)

# Launch the app
if __name__ == "__main__":
    demo.launch()