# audio-to-srt / app.py
# Nav3005's picture
# Update app.py
# 5ea61b6 verified
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import gradio as gr
import pysrt
import requests
import tempfile
from faster_whisper import WhisperModel
from datetime import timedelta
from urllib.parse import urlparse
# -----------------------------
# Core subtitle generator
# -----------------------------
class LinearSubtitleGenerator:
    """Generate SRT subtitles whose cue length grows linearly (1, 2, 3, ... words).

    Uses faster-whisper for word-level transcription timestamps and pysrt
    to assemble the subtitle file.
    """

    def __init__(self, model_size="base"):
        # CPU + int8 quantization keeps the memory footprint small enough
        # for shared hosting (OMP/MKL thread counts are pinned at module top).
        self.model = WhisperModel(
            model_size,
            device="cpu",
            compute_type="int8"
        )

    def transcribe(self, audio_path):
        """Run whisper on ``audio_path``.

        Returns an iterable of segments carrying per-word timestamps
        (``word_timestamps=True``), with non-speech filtered by VAD.
        """
        segments, _ = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True
        )
        return segments

    def extract_words(self, segments):
        """Flatten segments into a list of ``{"word", "start", "end"}`` dicts.

        Segments without a word list and words lacking start/end timestamps
        are skipped (whisper occasionally emits words without timing info).
        """
        words = []
        for segment in segments:
            if not segment.words:
                continue
            for w in segment.words:
                if w.start is None or w.end is None:
                    continue
                words.append({
                    "word": w.word.strip(),
                    "start": float(w.start),
                    "end": float(w.end)
                })
        return words

    def create_linear_subtitles(self, words):
        """Group words into cues of growing size 1, 2, 3, ...

        If the leftover tail would be smaller than the next planned cue it
        is absorbed into the current cue, so the file never ends with a
        tiny one-or-two-word subtitle.
        """
        subs = pysrt.SubRipFile()
        total_words = len(words)
        index = 0
        subtitle_index = 1
        current_size = 1  # grows by one after each regular (non-absorbing) cue
        while index < total_words:
            planned_size = current_size
            remaining = total_words - (index + planned_size)
            next_size = current_size + 1
            # absorb leftovers to avoid tiny last subtitle
            if remaining > 0 and remaining < next_size:
                planned_size += remaining
            subtitle_words = []
            start_time = None
            end_time = None
            for _ in range(planned_size):
                if index >= total_words:
                    break
                w = words[index]
                subtitle_words.append(w["word"])
                if start_time is None:
                    start_time = w["start"]
                end_time = w["end"]
                index += 1
            subs.append(
                pysrt.SubRipItem(
                    index=subtitle_index,
                    start=self._to_time(start_time),
                    end=self._to_time(end_time),
                    text=" ".join(subtitle_words)
                )
            )
            subtitle_index += 1
            if planned_size == current_size:
                current_size += 1
            else:
                # Leftovers were absorbed into this cue, so every word is
                # consumed and the loop is done.
                break
        return subs

    def _to_time(self, seconds):
        """Convert a float second offset into a ``pysrt.SubRipTime``.

        BUGFIX: the previous version read ``td.seconds`` directly, which
        excludes whole days (``timedelta`` normalizes them into ``.days``),
        so any timestamp at or beyond 24 hours wrapped around to zero.
        Fold ``td.days`` back into the total before splitting into fields.
        """
        td = timedelta(seconds=seconds)
        total_secs = td.days * 86400 + td.seconds
        return pysrt.SubRipTime(
            hours=total_secs // 3600,
            minutes=(total_secs % 3600) // 60,
            seconds=total_secs % 60,
            milliseconds=td.microseconds // 1000
        )
# -----------------------------
# Helper: download audio from URL
# -----------------------------
def download_audio(url: str) -> str:
    """Download an http(s) audio URL to a local temp file and return its path.

    Raises:
        ValueError: if the URL scheme is not http or https.
        requests.HTTPError: if the server responds with an error status.

    The caller owns the returned file and is responsible for deleting it.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL scheme")
    # Preserve the remote extension so downstream decoders can sniff format.
    suffix = os.path.splitext(parsed.path)[1] or ".wav"
    # Context managers fix two leaks in the previous version: the streamed
    # response was never closed (held a pooled connection), and a failure
    # mid-download left an orphaned temp file on disk.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        tmp.write(chunk)
        except Exception:
            os.unlink(tmp.name)
            raise
    return tmp.name
# -----------------------------
# Gradio callable function
# -----------------------------
def generate_srt(audio_file, audio_url, model_size):
    """Gradio callback: transcribe one audio source and return an .srt path.

    Exactly one of ``audio_file`` (local filepath from the upload widget)
    or ``audio_url`` (http/https URL) must be provided.

    Raises:
        gr.Error: if zero or both inputs were supplied.
    """
    # exactly one input must be provided
    if bool(audio_file) == bool(audio_url):
        raise gr.Error(
            "Please provide EITHER an audio file OR an audio URL (not both)."
        )
    downloaded = bool(audio_url)
    audio_path = download_audio(audio_url) if downloaded else audio_file
    try:
        generator = LinearSubtitleGenerator(model_size)
        segments = generator.transcribe(audio_path)
        words = generator.extract_words(segments)
        subs = generator.create_linear_subtitles(words)
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".srt")
        # Close our handle before pysrt reopens the path by name
        # (required on Windows, harmless elsewhere).
        out.close()
        subs.save(out.name, encoding="utf-8")
        return out.name
    finally:
        # BUGFIX: the previous version leaked the downloaded temp audio
        # file on every URL request; remove it once transcription is done
        # (or failed). Best-effort — never mask the real exception.
        if downloaded:
            try:
                os.unlink(audio_path)
            except OSError:
                pass
# -----------------------------
# Gradio UI (UNCHANGED)
# -----------------------------
# Build the Gradio UI: an audio upload OR a URL textbox (mutually
# exclusive, enforced inside generate_srt), a whisper model picker,
# and a file widget to download the generated .srt.
with gr.Blocks(title="Subtitle Generator") as demo:
    gr.Markdown(
        """
# srt generator
"""
    )
    with gr.Row():
        audio_file = gr.Audio(
            label="Upload Audio File",
            type="filepath"
        )
        audio_url = gr.Textbox(
            label="Audio URL (http/https)",
            placeholder="https://example.com/audio.wav"
        )
    model_choice = gr.Dropdown(
        choices=["tiny", "base", "small", "medium"],
        value="base",
        label="Whisper Model"
    )
    generate_btn = gr.Button("Generate SRT")
    output_file = gr.File(label="Download SRT")
    # Wire the button to the callback; output is the saved .srt filepath.
    generate_btn.click(
        fn=generate_srt,
        inputs=[audio_file, audio_url, model_choice],
        outputs=output_file
    )
if __name__ == "__main__":
    # mcp_server=True additionally exposes the app as an MCP endpoint.
    demo.launch(mcp_server=True)