Spaces:

redhairedshanks1
/

VoiceScript

Sleeping

Antigravity Agent

Optimize for CPU execution and add CPU models

81bd23a 25 days ago

3.64 kB

	import os
	import tempfile
	import gradio as gr
	from faster_whisper import WhisperModel
	import torch

	# Process-wide cache holding the single most recently loaded model and the
	# name it was loaded under. Only one model is kept at a time.
	_cached_model_name = None
	_cached_model = None


	def get_model(model_name):
	    """Return a ``WhisperModel`` for *model_name*, reusing the cached one if possible.

	    The model is loaded on CUDA with ``float16`` when ``torch`` reports a GPU,
	    otherwise on CPU with ``int8``. Only the most recently requested model is
	    cached; asking for a different name replaces it.

	    Args:
	        model_name: Model identifier passed straight to ``WhisperModel``.

	    Returns:
	        The cached or newly loaded ``WhisperModel`` instance.
	    """
	    global _cached_model_name, _cached_model

	    if _cached_model is not None and _cached_model_name == model_name:
	        return _cached_model

	    # Drop our reference to the previous model *before* loading the next one,
	    # so the cache does not hold two models at once while switching. Resetting
	    # the name as well keeps the cache consistent if loading raises.
	    _cached_model = None
	    _cached_model_name = None

	    use_cuda = torch.cuda.is_available()
	    device = "cuda" if use_cuda else "cpu"
	    compute_type = "float16" if use_cuda else "int8"
	    print(f"Loading {model_name} on {device} ({compute_type})...")

	    # Using 4 CPU threads to maximize performance on free tier
	    _cached_model = WhisperModel(
	        model_name,
	        device=device,
	        compute_type=compute_type,
	        cpu_threads=4,
	    )
	    _cached_model_name = model_name
	    return _cached_model

	def format_timestamp(seconds):
	    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

	    The value is rounded to the nearest millisecond first and then split into
	    fields, so float representation error cannot drop a millisecond
	    (``1.001`` yields ``00:00:01,001``) and rounding carries correctly into
	    seconds, minutes and hours. Negative offsets are clamped to zero.

	    Args:
	        seconds: Offset from the start of the audio, in seconds (int or float).

	    Returns:
	        The formatted timestamp string.
	    """
	    total_ms = round(max(0.0, seconds) * 1000)
	    total_seconds, ms = divmod(total_ms, 1000)
	    total_minutes, s = divmod(total_seconds, 60)
	    h, m = divmod(total_minutes, 60)
	    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

	def segments_to_srt(segments):
	    """Render *segments* as SRT subtitle text.

	    Each segment contributes four lines: its 1-based index, a
	    ``start --> end`` line built with ``format_timestamp``, the stripped
	    segment text, and an empty separator line. All lines are joined with
	    ``"\\n"``.

	    Args:
	        segments: Iterable of objects exposing ``start``, ``end`` and ``text``.

	    Returns:
	        The SRT document as a single string (empty when there are no segments).
	    """
	    srt_lines = []
	    for number, segment in enumerate(segments, start=1):
	        begin = format_timestamp(segment.start)
	        finish = format_timestamp(segment.end)
	        srt_lines.extend((str(number), f"{begin} --> {finish}", segment.text.strip(), ""))
	    return "\n".join(srt_lines)

	def transcribe(audio_path, model_name, task="transcribe", language=None):
	    """Run the selected model on an audio file and return text plus an SRT file.

	    Args:
	        audio_path: Path of the audio file to process. ``None`` or an empty
	            path is treated as "nothing uploaded".
	        model_name: Model identifier handed to ``get_model``.
	        task: Value forwarded as the ``task`` option of ``model.transcribe``.
	        language: Language code to force. When falsy or ``"auto"`` the
	            ``language`` option is omitted from the call.

	    Returns:
	        A ``(full_text, srt_path)`` tuple, where ``srt_path`` names a temporary
	        ``.srt`` file that is intentionally left on disk for download. When no
	        audio was supplied, returns ``("Please upload an audio file.", None)``.
	    """
	    if not audio_path:
	        return "Please upload an audio file.", None

	    model = get_model(model_name)

	    print(f"Transcribing {audio_path} using {model_name}...")

	    options = {
	        "task": task,
	        "beam_size": 1,  # Set to 1 for absolute maximum speed on CPU
	        "best_of": 1,
	        "vad_filter": True,  # Filter out non-speech/silence to speed up
	    }

	    if language and language != "auto":
	        options["language"] = language

	    segments, _info = model.transcribe(audio_path, **options)

	    # Collect the segments in a list because they are needed twice below
	    # (plain text and SRT); log each one as it arrives.
	    segments_list = []
	    for segment in segments:
	        segments_list.append(segment)
	        print(f"[{format_timestamp(segment.start)}] {segment.text}")

	    full_text = " ".join(s.text.strip() for s in segments_list)
	    srt_content = segments_to_srt(segments_list)

	    # Save SRT to a temporary file. The context manager closes the handle even
	    # if the write fails; delete=False keeps the file for the caller.
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as temp_srt:
	        temp_srt.write(srt_content.encode("utf-8"))

	    return full_text, temp_srt.name

	# Gradio UI: inputs in the left column, results in the right column.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    gr.Markdown(value="# 🎙️ VoiceScript - CPU Optimized")
	    gr.Markdown(
	        value="Fast transcription using models optimized for CPU performance (`large-v3-turbo` and `distil-large-v3`)."
	    )

	    with gr.Row():
	        # Left column: audio source and transcription settings.
	        with gr.Column():
	            audio_input = gr.Audio(
	                type="filepath",
	                label="Upload Audio/Video",
	            )
	            model_input = gr.Radio(
	                choices=["large-v3-turbo", "distil-large-v3"],
	                value="large-v3-turbo",
	                label="Model",
	            )
	            task_input = gr.Radio(
	                choices=["transcribe", "translate"],
	                value="transcribe",
	                label="Task",
	            )
	            lang_input = gr.Dropdown(
	                choices=["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"],
	                value="auto",
	                label="Language (optional)",
	            )
	            transcribe_btn = gr.Button(value="Transcribe", variant="primary")

	        # Right column: transcript text and the downloadable subtitle file.
	        with gr.Column():
	            text_output = gr.Textbox(label="Transcript", lines=10)
	            file_output = gr.File(label="Download SRT")

	    # Wire the button to transcribe(); argument order follows its signature.
	    transcribe_btn.click(
	        fn=transcribe,
	        inputs=[audio_input, model_input, task_input, lang_input],
	        outputs=[text_output, file_output],
	    )

	# Start the web server only when run as a script.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)