Spaces:
Sleeping
Sleeping
Antigravity Agent committed on
Commit ·
81bd23a
1
Parent(s): c00f45b
Optimize for CPU execution and add CPU models
Browse files
- app.py +18 -18
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -3,19 +3,20 @@ import tempfile
|
|
| 3 |
import gradio as gr
|
| 4 |
from faster_whisper import WhisperModel
|
| 5 |
import torch
|
| 6 |
-
import spaces
|
| 7 |
|
| 8 |
-
# Global cache for the model
|
| 9 |
-
|
| 10 |
_cached_model = None
|
| 11 |
|
| 12 |
-
def get_model():
|
| 13 |
-
global _cached_model
|
| 14 |
-
if _cached_model is None:
|
| 15 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 16 |
compute_type = "float16" if torch.cuda.is_available() else "int8"
|
| 17 |
-
print(f"Loading
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
return _cached_model
|
| 20 |
|
| 21 |
def format_timestamp(seconds):
|
|
@@ -36,20 +37,18 @@ def segments_to_srt(segments):
|
|
| 36 |
lines.append("")
|
| 37 |
return "\n".join(lines)
|
| 38 |
|
| 39 |
-
|
| 40 |
-
def transcribe(audio_path, task="transcribe", language=None):
|
| 41 |
if audio_path is None:
|
| 42 |
return "Please upload an audio file.", None
|
| 43 |
|
| 44 |
-
|
| 45 |
-
model = get_model()
|
| 46 |
|
| 47 |
-
print(f"Transcribing {audio_path}...")
|
| 48 |
|
| 49 |
options = {
|
| 50 |
"task": task,
|
| 51 |
-
"beam_size":
|
| 52 |
-
"best_of":
|
| 53 |
"vad_filter": True, # Filter out non-speech/silence to speed up
|
| 54 |
}
|
| 55 |
|
|
@@ -75,12 +74,13 @@ def transcribe(audio_path, task="transcribe", language=None):
|
|
| 75 |
|
| 76 |
# Gradio UI
|
| 77 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 78 |
-
gr.Markdown("# 🎙️ VoiceScript -
|
| 79 |
-
gr.Markdown("Fast
|
| 80 |
|
| 81 |
with gr.Row():
|
| 82 |
with gr.Column():
|
| 83 |
audio_input = gr.Audio(type="filepath", label="Upload Audio/Video")
|
|
|
|
| 84 |
task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
|
| 85 |
lang_input = gr.Dropdown(["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"], label="Language (optional)", value="auto")
|
| 86 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
|
@@ -91,7 +91,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 91 |
|
| 92 |
transcribe_btn.click(
|
| 93 |
fn=transcribe,
|
| 94 |
-
inputs=[audio_input, task_input, lang_input],
|
| 95 |
outputs=[text_output, file_output]
|
| 96 |
)
|
| 97 |
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
from faster_whisper import WhisperModel
|
| 5 |
import torch
|
|
|
|
| 6 |
|
| 7 |
+
# Global cache for the model
|
| 8 |
+
_cached_model_name = None
|
| 9 |
_cached_model = None
|
| 10 |
|
| 11 |
+
def get_model(model_name):
|
| 12 |
+
global _cached_model_name, _cached_model
|
| 13 |
+
if _cached_model_name != model_name or _cached_model is None:
|
| 14 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
compute_type = "float16" if torch.cuda.is_available() else "int8"
|
| 16 |
+
print(f"Loading {model_name} on {device} ({compute_type})...")
|
| 17 |
+
# Using 4 CPU threads to maximize performance on free tier
|
| 18 |
+
_cached_model = WhisperModel(model_name, device=device, compute_type=compute_type, cpu_threads=4)
|
| 19 |
+
_cached_model_name = model_name
|
| 20 |
return _cached_model
|
| 21 |
|
| 22 |
def format_timestamp(seconds):
|
|
|
|
| 37 |
lines.append("")
|
| 38 |
return "\n".join(lines)
|
| 39 |
|
| 40 |
+
def transcribe(audio_path, model_name, task="transcribe", language=None):
|
|
|
|
| 41 |
if audio_path is None:
|
| 42 |
return "Please upload an audio file.", None
|
| 43 |
|
| 44 |
+
model = get_model(model_name)
|
|
|
|
| 45 |
|
| 46 |
+
print(f"Transcribing {audio_path} using {model_name}...")
|
| 47 |
|
| 48 |
options = {
|
| 49 |
"task": task,
|
| 50 |
+
"beam_size": 1, # Set to 1 for absolute maximum speed on CPU
|
| 51 |
+
"best_of": 1,
|
| 52 |
"vad_filter": True, # Filter out non-speech/silence to speed up
|
| 53 |
}
|
| 54 |
|
|
|
|
| 74 |
|
| 75 |
# Gradio UI
|
| 76 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 77 |
+
gr.Markdown("# 🎙️ VoiceScript - CPU Optimized")
|
| 78 |
+
gr.Markdown("Fast transcription using models optimized for CPU performance (`large-v3-turbo` and `distil-large-v3`).")
|
| 79 |
|
| 80 |
with gr.Row():
|
| 81 |
with gr.Column():
|
| 82 |
audio_input = gr.Audio(type="filepath", label="Upload Audio/Video")
|
| 83 |
+
model_input = gr.Radio(["large-v3-turbo", "distil-large-v3"], label="Model", value="large-v3-turbo")
|
| 84 |
task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
|
| 85 |
lang_input = gr.Dropdown(["auto", "en", "es", "fr", "de", "zh", "ja", "ko", "hi"], label="Language (optional)", value="auto")
|
| 86 |
transcribe_btn = gr.Button("Transcribe", variant="primary")
|
|
|
|
| 91 |
|
| 92 |
transcribe_btn.click(
|
| 93 |
fn=transcribe,
|
| 94 |
+
inputs=[audio_input, model_input, task_input, lang_input],
|
| 95 |
outputs=[text_output, file_output]
|
| 96 |
)
|
| 97 |
|
requirements.txt
CHANGED
|
@@ -3,4 +3,4 @@ gradio
|
|
| 3 |
torch
|
| 4 |
torchaudio
|
| 5 |
ffmpeg-python
|
| 6 |
-
|
|
|
|
| 3 |
torch
|
| 4 |
torchaudio
|
| 5 |
ffmpeg-python
|
| 6 |
+
|