Spaces:
Sleeping
Sleeping
import os
import tempfile

import gradio as gr
import torch
from pydub import AudioSegment
from transformers import pipeline
# Initialize the Whisper ASR pipeline once at startup so every request
# reuses the loaded model instead of reloading it per call.
try:
    whisper = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        # Prefer the GPU when one is visible; otherwise fall back to CPU.
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
except Exception as e:
    # Chain the original exception so the startup traceback shows the root
    # cause (the bare `raise Exception(...)` it replaces discarded it).
    # RuntimeError is an Exception subclass, so existing catchers still match.
    raise RuntimeError(f"Failed to load Whisper model: {str(e)}") from e
# Define the transcription function with chunking and automatic language detection
def transcribe_audio(audio):
    """Transcribe an uploaded audio file with Whisper.

    Files longer than 30 s are split into 30 s chunks (Whisper's context
    window) and the chunk transcriptions are joined with spaces. Language
    is auto-detected by the model.

    Args:
        audio: Filesystem path to the uploaded file, or None.

    Returns:
        The transcription text, or an "Error: ..." message string.
    """
    if audio is None:
        return "Error: Please upload an audio file."

    # Validate file size (100 MB limit) before doing any decoding work.
    try:
        file_size_mb = os.path.getsize(audio) / (1024 * 1024)
        if file_size_mb > 100:
            return "Error: Audio file exceeds 100 MB limit."
    except FileNotFoundError:
        return "Error: Audio file not found."

    try:
        # Load and measure the audio.
        audio_segment = AudioSegment.from_file(audio)
        duration_ms = len(audio_segment)
        chunk_length_ms = 30000  # 30 seconds

        if duration_ms <= chunk_length_ms:
            # Short file: transcribe in one shot (automatic language detection).
            result = whisper(audio, generate_kwargs={"task": "transcribe"})
            return result["text"]

        # Long file: export each 30 s slice to a temp WAV and transcribe it.
        transcriptions = []
        for start_ms in range(0, duration_ms, chunk_length_ms):
            chunk = audio_segment[start_ms:start_ms + chunk_length_ms]
            # Unique temp path per chunk: the fixed "chunk_{i}.wav" names the
            # original used would collide between concurrent requests.
            fd, chunk_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            try:
                chunk.export(chunk_path, format="wav")
                result = whisper(chunk_path, generate_kwargs={"task": "transcribe"})
                transcriptions.append(result["text"])
            finally:
                # Remove the chunk file even if transcription raised; the
                # original only cleaned up on success, leaking files on error.
                if os.path.exists(chunk_path):
                    os.remove(chunk_path)
        return " ".join(transcriptions)
    except Exception as e:
        return f"Error during transcription: {str(e)}"
    finally:
        # Clean up the uploaded file regardless of outcome (best-effort).
        if os.path.exists(audio):
            try:
                os.remove(audio)
            except Exception:
                pass
# Create the Gradio interface wiring the transcription function to the UI.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(type="filepath", label="Upload an Audio File (MP3, WAV, max 100 MB)")
    ],
    outputs=gr.Textbox(label="Transcription"),
    title="Audio to Text Transcription with Whisper",
    # "OpenAI's" fixes the "Open AI's" typo in the user-facing description.
    description="Upload an audio file (MP3/WAV, up to 100 MB) to transcribe it using OpenAI's Whisper model with automatic language detection.",
    allow_flagging="never",  # hide the flagging button
)

# Launch the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()