File size: 1,971 Bytes
135f59e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5385eaf
 
135f59e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from faster_whisper import WhisperModel
from time import time
import logging
import json

# Set up root logging and turn the faster_whisper logger up to DEBUG
logging.basicConfig()
logging.getLogger("faster_whisper").setLevel(logging.DEBUG)

# Model names offered in the "Model" dropdown
CHOICES = [
    "tiny",
    "tiny.en",
    "base",
    "base.en",
    "small",
    "small.en",
    "medium",
    "medium.en",
    "large-v1",
    "large-v2",
    "large-v3",
    "large",
]
# Build a WhisperModel for the requested model name
def load_model(model):
    """Return a ``WhisperModel`` for ``model``.

    The model is created with ``device="auto"`` and ``compute_type="int8"``;
    ``"int8"`` is also passed as ``download_root``.
    """
    return WhisperModel(
        model,
        device="auto",
        compute_type="int8",
        download_root="int8",  # Adjust path as needed for Hugging Face Spaces
    )

# Current model (default to small). The name is tracked next to the instance
# because `WhisperModel.model` is the underlying CTranslate2 model object, not
# the size name, so it cannot be compared with the dropdown value.
current_model_name = "small"
current_model = load_model(current_model_name)


def transcribe(audio_file, model):
    """Transcribe an audio file and return the result as a JSON string.

    Args:
        audio_file: Path of the audio file to transcribe, or ``None`` when
            nothing was uploaded.
        model: Name of the Whisper model to use (e.g. ``"small"``). The model
            is reloaded only when this differs from the one currently loaded.

    Returns:
        A JSON string (``indent=4``) with the keys ``"language"``,
        ``"language_probability"`` and ``"transcript"``; ``"transcript"`` is a
        list of ``{"start", "end", "text"}`` objects, one per segment.

    Raises:
        gr.Error: If ``audio_file`` is ``None``.
    """
    global current_model, current_model_name

    if audio_file is None:
        # Fail with a readable UI message instead of an opaque decoding error.
        raise gr.Error("Please upload an audio file before transcribing.")

    # Load the model only if a different size is selected. The previous check
    # (`current_model.model != model`) compared a model object with a string,
    # which is never equal, so the model was reloaded on every request.
    if current_model_name != model:
        current_model = load_model(model)
        current_model_name = model

    # Keep a local reference so this request keeps using the model it resolved
    # even if another request swaps the global afterwards.
    # NOTE(review): the swap itself is not synchronized, while the interface
    # sets concurrency_limit=2 -- confirm whether a lock is wanted here.
    active_model = current_model

    start = time()
    segments, info = active_model.transcribe(
        audio_file,
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
    )

    # Prepare JSON output. Iterating `segments` here consumes the result, so
    # the elapsed time printed below includes that iteration.
    transcript = [
        {"start": segment.start, "end": segment.end, "text": segment.text}
        for segment in segments
    ]
    print(f"Time Taken to transcribe: {time() - start}")
    output = {
        "language": info.language,
        "language_probability": info.language_probability,
        "transcript": transcript,
    }

    return json.dumps(output, indent=4)

# Build the UI components first, in the order they appear in the interface
audio_input = gr.Audio(type="filepath", label="Upload MP3 Audio File")
model_picker = gr.Dropdown(choices=CHOICES, value="small", label="Model")
json_output = gr.JSON(label="Transcription with Timestamps")

# Wire the components to transcribe(); at most two calls run concurrently
iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, model_picker],
    outputs=json_output,
    title="Whisper Transcription Service",
    description="Upload an MP3 audio file to transcribe. Select the model. The output includes the transcription with timestamps.",
    concurrency_limit=2,
)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    iface.launch()