"""Gradio front end for speech transcription/translation with faster-whisper.

Loads a Whisper model on demand (cached between requests), transcribes an
uploaded audio file, and returns both the plain transcript and a downloadable
SRT subtitle file.
"""

import os
import tempfile

import gradio as gr
import torch
from faster_whisper import WhisperModel

# Global cache for the model: only one model is kept loaded at a time.
_cached_model_name = None
_cached_model = None


def get_model(model_name):
    """Return a ``WhisperModel`` for *model_name*, loading it if necessary.

    The most recently loaded model is cached in module globals and reused
    while the same name keeps being requested. CUDA with ``float16`` is used
    when ``torch`` reports a GPU; otherwise the model runs on CPU with
    ``int8``.

    Args:
        model_name: Model identifier passed straight to ``WhisperModel``.

    Returns:
        The loaded (possibly cached) ``WhisperModel`` instance.
    """
    global _cached_model_name, _cached_model
    if _cached_model_name != model_name or _cached_model is None:
        # Drop the reference to the previous model *before* building the new
        # one, so both do not have to be resident in memory at the same time.
        _cached_model = None
        _cached_model_name = None

        use_cuda = torch.cuda.is_available()
        device = "cuda" if use_cuda else "cpu"
        compute_type = "float16" if use_cuda else "int8"
        print(f"Loading {model_name} on {device} ({compute_type})...")
        # Using 4 CPU threads to maximize performance on free tier
        _cached_model = WhisperModel(
            model_name,
            device=device,
            compute_type=compute_type,
            cpu_threads=4,
        )
        _cached_model_name = model_name
    return _cached_model


def format_timestamp(seconds: float) -> str:
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded once to whole milliseconds and then split with
    integer arithmetic. Deriving the millisecond field from
    ``(seconds % 1) * 1000`` truncates floating-point error (e.g. ``1.001``
    would render as ``,000``), which this avoids. Negative inputs are clamped
    to zero because SRT timestamps cannot be negative.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def segments_to_srt(segments) -> str:
    """Render *segments* as the text of an SRT subtitle file.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` (seconds)
            and ``text`` attributes.

    Returns:
        SRT text with entries numbered from 1; each entry is followed by a
        blank line. An empty iterable yields an empty string.
    """
    lines = []
    for index, seg in enumerate(segments, 1):
        start = format_timestamp(seg.start)
        end = format_timestamp(seg.end)
        lines.extend((str(index), f"{start} --> {end}", seg.text.strip(), ""))
    return "\n".join(lines)


def transcribe(audio_path, model_name, task="transcribe", language=None):
    """Transcribe (or translate) an audio file and build an SRT file for it.

    Args:
        audio_path: Path of the uploaded audio file, or ``None`` if nothing
            was uploaded.
        model_name: Name of the Whisper model to use (see ``get_model``).
        task: Task name forwarded to the model (the UI offers
            ``"transcribe"`` and ``"translate"``).
        language: Language code to force; ``None``, empty, or ``"auto"``
            leaves the ``language`` option unset.

    Returns:
        A ``(text, srt_path)`` tuple. When no audio was supplied, ``text`` is
        a prompt for the user and ``srt_path`` is ``None``.
    """
    if audio_path is None:
        return "Please upload an audio file.", None

    model = get_model(model_name)

    print(f"Transcribing {audio_path} using {model_name}...")

    options = {
        "task": task,
        "beam_size": 1,  # Set to 1 for absolute maximum speed on CPU
        "best_of": 1,
        "vad_filter": True,  # Filter out non-speech/silence to speed up
    }

    if language and language != "auto":
        options["language"] = language

    segments, _info = model.transcribe(audio_path, **options)

    # Collect the segments so they can feed both the transcript and the SRT,
    # logging each one as it is consumed.
    segments_list = []
    for segment in segments:
        segments_list.append(segment)
        print(f"[{format_timestamp(segment.start)}] {segment.text}")

    full_text = " ".join(s.text.strip() for s in segments_list)
    srt_content = segments_to_srt(segments_list)

    # Save SRT to a temporary file. delete=False keeps the file on disk after
    # closing so its path can be returned; the context manager guarantees the
    # handle is closed even if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as temp_srt:
        temp_srt.write(srt_content.encode("utf-8"))

    return full_text, temp_srt.name


# Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ VoiceScript - CPU Optimized")
    gr.Markdown("Fast transcription using models optimized for CPU performance (`large-v3-turbo` and `distil-large-v3`).")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio/Video")
            model_input = gr.Radio(["large-v3-turbo", "distil-large-v3"], label="Model", value="large-v3-turbo")
            task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
            lang_input = gr.Dropdown(["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"], label="Language (optional)", value="auto")
            transcribe_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(label="Transcript", lines=10)
            file_output = gr.File(label="Download SRT")

    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_input, task_input, lang_input],
        outputs=[text_output, file_output]
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)