File size: 3,641 Bytes
33708d5
 
 
 
 
 
81bd23a
 
c00f45b
33708d5
81bd23a
 
 
c00f45b
 
81bd23a
 
 
 
c00f45b
33708d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81bd23a
33708d5
 
 
81bd23a
c00f45b
81bd23a
6dd81c2
 
 
81bd23a
 
6dd81c2
 
 
33708d5
 
 
6dd81c2
 
 
 
 
 
33708d5
 
 
 
 
 
 
 
 
 
 
 
 
81bd23a
 
33708d5
 
 
 
81bd23a
33708d5
 
 
 
 
 
 
 
 
 
81bd23a
33708d5
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import tempfile
import gradio as gr
from faster_whisper import WhisperModel
import torch

# Global cache for the model.
# get_model() keeps a single loaded model here and replaces it when a
# different model name is requested.
_cached_model_name = None  # name the cached model was loaded with, or None
_cached_model = None  # the loaded WhisperModel instance, or None before first load

def get_model(model_name):
    """Return the WhisperModel for *model_name*, loading it only when needed.

    A single model is kept in the module-level cache. Asking for the name
    that is already cached returns that instance; asking for any other name
    loads a new model and replaces the cached one.

    Args:
        model_name: Model identifier handed to ``WhisperModel``.

    Returns:
        The cached (possibly freshly loaded) ``WhisperModel`` instance.
    """
    global _cached_model_name, _cached_model

    # Cache hit: same name as last time and a model is actually loaded.
    if _cached_model is not None and _cached_model_name == model_name:
        return _cached_model

    # float16 on GPU, int8 on CPU.
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"

    print(f"Loading {model_name} on {device} ({compute_type})...")
    # Using 4 CPU threads to maximize performance on free tier
    _cached_model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        cpu_threads=4,
    )
    _cached_model_name = model_name
    return _cached_model

def format_timestamp(seconds):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to whole milliseconds and then split into
    fields. Extracting the millisecond field with ``int((seconds % 1) * 1000)``
    truncates floating-point error (e.g. ``1.001`` would render as ``,000``),
    so all fields are derived from a single integer instead.

    Args:
        seconds: Non-negative offset in seconds (int or float). Negative
            values are clamped to zero.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in integer milliseconds so rounding can carry into the seconds,
    # minutes and hours fields consistently.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def segments_to_srt(segments):
    """Render transcription segments as the text of an SRT subtitle file.

    Each segment becomes one cue: a 1-based index line, a
    ``start --> end`` timing line, the stripped segment text, and an empty
    line separating it from the next cue.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` and ``text``.

    Returns:
        The cue lines joined with newlines (an empty string for no segments).
    """
    srt_lines = []
    for index, segment in enumerate(segments, start=1):
        begin = format_timestamp(segment.start)
        finish = format_timestamp(segment.end)
        srt_lines.extend(
            (
                str(index),
                f"{begin} --> {finish}",
                segment.text.strip(),
                "",
            )
        )
    return "\n".join(srt_lines)

def transcribe(audio_path, model_name, task="transcribe", language=None):
    """Run speech recognition on an audio file and write the result as SRT.

    Args:
        audio_path: Path of the uploaded audio file. ``None`` or an empty
            path is treated as "nothing uploaded".
        model_name: Model identifier passed to ``get_model``.
        task: Forwarded to the model as the ``task`` option
            (the UI offers ``"transcribe"`` and ``"translate"``).
        language: Language code forwarded to the model. ``None``, an empty
            string or ``"auto"`` leaves the option unset.

    Returns:
        A ``(text, srt_path)`` tuple: the segment texts joined by spaces and
        the path of a temporary ``.srt`` file holding the subtitles. When no
        audio was supplied, returns ``("Please upload an audio file.", None)``.
    """
    # Guard against a missing upload (None) as well as an empty path.
    if not audio_path:
        return "Please upload an audio file.", None

    model = get_model(model_name)

    print(f"Transcribing {audio_path} using {model_name}...")

    options = {
        "task": task,
        "beam_size": 1,  # Set to 1 for absolute maximum speed on CPU
        "best_of": 1,
        "vad_filter": True,  # Filter out non-speech/silence to speed up
    }

    if language and language != "auto":
        options["language"] = language

    segments, _info = model.transcribe(audio_path, **options)

    # Keep every segment while logging progress: the collected list is needed
    # twice below (plain text and SRT).
    segments_list = []
    for segment in segments:
        segments_list.append(segment)
        print(f"[{format_timestamp(segment.start)}] {segment.text}")

    full_text = " ".join(s.text.strip() for s in segments_list)
    srt_content = segments_to_srt(segments_list)

    # Save SRT to a temporary file. delete=False keeps the file on disk after
    # closing so the returned path stays valid; the context manager guarantees
    # the handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as temp_srt:
        temp_srt.write(srt_content.encode("utf-8"))

    return full_text, temp_srt.name

# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("# 🎙️ VoiceScript - CPU Optimized")
    gr.Markdown(
        "Fast transcription using models optimized for CPU performance (`large-v3-turbo` and `distil-large-v3`)."
    )

    with gr.Row():
        # Left column: the inputs and the action button.
        with gr.Column():
            audio_input = gr.Audio(
                type="filepath",
                label="Upload Audio/Video",
            )
            model_input = gr.Radio(
                ["large-v3-turbo", "distil-large-v3"],
                label="Model",
                value="large-v3-turbo",
            )
            task_input = gr.Radio(
                ["transcribe", "translate"],
                label="Task",
                value="transcribe",
            )
            lang_input = gr.Dropdown(
                ["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"],
                label="Language (optional)",
                value="auto",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary")

        # Right column: transcript text and the downloadable SRT file.
        with gr.Column():
            text_output = gr.Textbox(label="Transcript", lines=10)
            file_output = gr.File(label="Download SRT")

    # The input order matches transcribe()'s positional parameters; its two
    # return values fill the two outputs in order.
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_input, task_input, lang_input],
        outputs=[text_output, file_output],
    )

if __name__ == "__main__":
    # Start the web UI only when run as a script; listen on every network
    # interface ("0.0.0.0") on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )