File size: 2,750 Bytes
99e2ed9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import tempfile

import gradio as gr
import torch
from pydub import AudioSegment
from transformers import pipeline

# Load the Whisper speech-recognition pipeline once, at import time, so that
# every request reuses the same model instance. The model is placed on the GPU
# when CUDA is available and on the CPU otherwise.
try:
    whisper = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
except Exception as e:
    # Chain the original error explicitly so the root cause stays attached to
    # the traceback. RuntimeError is still caught by ``except Exception``.
    raise RuntimeError(f"Failed to load Whisper model: {e}") from e

# Limits used by transcribe_audio: maximum accepted upload size and the length
# of each slice that long recordings are split into before transcription.
_MAX_FILE_SIZE_MB = 100
_CHUNK_LENGTH_MS = 30_000  # 30 seconds


def transcribe_audio(audio):
    """Transcribe an audio file with the module-level ``whisper`` pipeline.

    Recordings longer than ``_CHUNK_LENGTH_MS`` are cut into consecutive
    slices, each slice is exported to a WAV file and transcribed on its own,
    and the pieces are joined with single spaces. Shorter recordings are
    passed to the pipeline directly.

    Args:
        audio: Path to the audio file, or ``None`` when nothing was provided.

    Returns:
        The transcription text on success, otherwise a human-readable message
        starting with ``"Error"``. Failures are reported through the return
        value rather than raised.

    Side effects:
        Once decoding/transcription has been attempted, the file at ``audio``
        is deleted (best effort). The early validation returns (missing input,
        unreadable file, oversize file) leave it in place.
    """
    if audio is None:
        return "Error: Please upload an audio file."

    # Validate file size before spending time decoding the audio.
    try:
        file_size_mb = os.path.getsize(audio) / (1024 * 1024)
    except FileNotFoundError:
        return "Error: Audio file not found."
    except OSError as e:
        # E.g. permission problems; report instead of crashing the request.
        return f"Error: Could not read audio file: {e}"
    if file_size_mb > _MAX_FILE_SIZE_MB:
        return f"Error: Audio file exceeds {_MAX_FILE_SIZE_MB} MB limit."

    try:
        audio_segment = AudioSegment.from_file(audio)
        duration_ms = len(audio_segment)

        if duration_ms <= _CHUNK_LENGTH_MS:
            # No task language is forced, so the model detects it itself.
            result = whisper(audio, generate_kwargs={"task": "transcribe"})
            return result["text"]

        transcriptions = []
        # A private temporary directory keeps concurrent requests from
        # overwriting each other's chunk files and guarantees the chunks are
        # removed even when export or inference raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for index, start_ms in enumerate(range(0, duration_ms, _CHUNK_LENGTH_MS)):
                chunk = audio_segment[start_ms:start_ms + _CHUNK_LENGTH_MS]
                chunk_path = os.path.join(tmp_dir, f"chunk_{index}.wav")
                # export() hands back the open output file; close it so the
                # handle is not leaked and the directory can be removed.
                chunk.export(chunk_path, format="wav").close()
                result = whisper(chunk_path, generate_kwargs={"task": "transcribe"})
                transcriptions.append(result["text"])
        return " ".join(transcriptions)

    except Exception as e:
        # UI boundary: surface any decoding/inference failure as a message.
        return f"Error during transcription: {str(e)}"
    finally:
        # Clean up the uploaded file; a file that is already gone or cannot
        # be removed is not worth failing the request over.
        try:
            os.remove(audio)
        except OSError:
            pass

# Build the web UI: one audio-upload input wired to transcribe_audio, with the
# resulting text shown in a textbox.
_audio_input = gr.Audio(
    type="filepath",
    label="Upload an Audio File (MP3, WAV, max 100 MB)",
)
_transcript_output = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[_audio_input],
    outputs=_transcript_output,
    title="Audio to Text Transcription with Whisper",
    description=(
        "Upload an audio file (MP3/WAV, up to 100 MB) to transcribe it using "
        "Open AI's Whisper model with automatic language detection."
    ),
    allow_flagging="never",
)

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()