Spaces:
Sleeping
Sleeping
File size: 3,641 Bytes
33708d5 81bd23a c00f45b 33708d5 81bd23a c00f45b 81bd23a c00f45b 33708d5 81bd23a 33708d5 81bd23a c00f45b 81bd23a 6dd81c2 81bd23a 6dd81c2 33708d5 6dd81c2 33708d5 81bd23a 33708d5 81bd23a 33708d5 81bd23a 33708d5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | import os
import tempfile
import gradio as gr
from faster_whisper import WhisperModel
import torch
# Single-slot cache for the most recently loaded Whisper model.
# Both names are read and updated by get_model(); they start out empty.
_cached_model_name = None  # model name the cached instance was loaded with
_cached_model = None  # the loaded model object, or None before the first load
def get_model(model_name):
    """Return a WhisperModel for *model_name*, reusing the cached one if it matches.

    Only one model is cached at a time: requesting a different name loads that
    model and replaces the cached instance.
    """
    global _cached_model_name, _cached_model

    # Cache hit: same name as last time and a model is actually loaded.
    if _cached_model is not None and _cached_model_name == model_name:
        return _cached_model

    # Use the GPU with half precision when CUDA is available, otherwise fall
    # back to the CPU with int8 weights.
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"

    print(f"Loading {model_name} on {device} ({compute_type})...")
    # Using 4 CPU threads to maximize performance on free tier
    _cached_model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        cpu_threads=4,
    )
    _cached_model_name = model_name
    return _cached_model
def format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The offset is rounded to the nearest millisecond once and then split with
    integer arithmetic. Truncating the fractional part of a float instead would
    drop a millisecond for values such as 2.3 (stored as 2.2999...), and
    rounding each field separately could never carry into the seconds field.

    Args:
        seconds: Offset in seconds (int or float). Negative values are clamped
            to zero, since SRT has no representation for them.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
def segments_to_srt(segments):
    """Render transcription segments as SRT subtitle text.

    Each segment must expose ``start``, ``end`` and ``text`` attributes. Cues
    are numbered from 1 in iteration order and separated by a blank line; an
    empty input yields an empty string.
    """
    cues = [
        f"{index}\n"
        f"{format_timestamp(segment.start)} --> {format_timestamp(segment.end)}\n"
        f"{segment.text.strip()}\n"
        for index, segment in enumerate(segments, start=1)
    ]
    # Joining the newline-terminated cues with "\n" leaves one blank line
    # between consecutive cues.
    return "\n".join(cues)
def transcribe(audio_path, model_name, task="transcribe", language=None):
    """Run speech recognition on a media file and export the result as SRT.

    Args:
        audio_path: Path of the file to process, or None when nothing was
            uploaded.
        model_name: Model identifier passed to get_model().
        task: Task name forwarded to the model (the UI offers "transcribe"
            and "translate").
        language: Language code to force; None, an empty string or "auto"
            leaves the language unspecified.

    Returns:
        A ``(text, srt_path)`` tuple: the segment texts joined with spaces and
        the path of a temporary ``.srt`` file. When *audio_path* is None the
        tuple is ``("Please upload an audio file.", None)``.
    """
    if audio_path is None:
        return "Please upload an audio file.", None

    model = get_model(model_name)
    print(f"Transcribing {audio_path} using {model_name}...")

    options = {
        "task": task,
        "beam_size": 1,  # Set to 1 for absolute maximum speed on CPU
        "best_of": 1,
        "vad_filter": True,  # Filter out non-speech/silence to speed up
    }
    if language and language != "auto":
        options["language"] = language

    # The second return value (transcription info) is not used here.
    segments, _info = model.transcribe(audio_path, **options)

    # Collect the segments while logging each one, so progress is visible in
    # the console and the list can be reused for both outputs below.
    segments_list = []
    for segment in segments:
        segments_list.append(segment)
        print(f"[{format_timestamp(segment.start)}] {segment.text}")

    full_text = " ".join(s.text.strip() for s in segments_list)
    srt_content = segments_to_srt(segments_list)

    # delete=False keeps the file on disk after this function returns so the
    # caller can serve it; the with-block guarantees the handle is closed even
    # if the write fails.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as temp_srt:
        temp_srt.write(srt_content.encode("utf-8"))

    return full_text, temp_srt.name
# Gradio UI: input controls in the first column, results in the second.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ VoiceScript - CPU Optimized")
    gr.Markdown(
        "Fast transcription using models optimized for CPU performance (`large-v3-turbo` and `distil-large-v3`)."
    )

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio/Video", type="filepath")
            model_input = gr.Radio(
                choices=["large-v3-turbo", "distil-large-v3"],
                value="large-v3-turbo",
                label="Model",
            )
            task_input = gr.Radio(
                choices=["transcribe", "translate"],
                value="transcribe",
                label="Task",
            )
            lang_input = gr.Dropdown(
                choices=["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"],
                value="auto",
                label="Language (optional)",
            )
            transcribe_btn = gr.Button("Transcribe", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(lines=10, label="Transcript")
            file_output = gr.File(label="Download SRT")

    # The input order here matches the positional parameters of transcribe().
    transcribe_btn.click(
        fn=transcribe,
        inputs=[audio_input, model_input, task_input, lang_input],
        outputs=[text_output, file_output],
    )

if __name__ == "__main__":
    # Bind to every network interface on port 7860.
    demo.launch(server_port=7860, server_name="0.0.0.0")
|