# audio-to-srt / app.py
# (Hugging Face Space file header: uploaded by sampleacc-3003, commit 7ac3eba, verified)
import os
# Pin the OpenMP/MKL thread pools to a single thread. These environment
# variables are set before the inference/numeric libraries below are
# imported so the limit takes effect; keeps CPU-only transcription from
# oversubscribing a small shared host.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import gradio as gr
import pysrt
import requests
import tempfile
import time
from faster_whisper import WhisperModel
from datetime import timedelta
from urllib.parse import urlparse
# Maximum number of words allowed in a single subtitle
# (set to None to disable the cap).
DEFAULT_MAX_WORDS = 18
# -----------------------------
# Core subtitle generator
# -----------------------------
class LinearSubtitleGenerator:
    """Generate SRT subtitles from audio using faster-whisper word timestamps.

    Subtitle layout strategy:
      - the first complete sentence becomes the first subtitle,
      - middle words use a growing "linear" pattern (1, 2, 3, ... words),
      - everything from the last sentence terminator onward becomes the
        final subtitle.
    """

    def __init__(self, model_size="base"):
        # CPU + int8 quantization keeps the memory footprint small enough
        # for modest shared hosts.
        self.model = WhisperModel(
            model_size,
            device="cpu",
            compute_type="int8"
        )

    def transcribe(self, audio_path):
        """Transcribe `audio_path`, returning whisper segments carrying
        per-word timestamps (silence pre-filtered by VAD)."""
        segments, _ = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True
        )
        return segments

    def extract_words(self, segments):
        """Flatten segments into a list of {"word", "start", "end"} dicts.

        Words missing either timestamp cannot be placed on the timeline
        and are skipped.
        """
        words = []
        for segment in segments:
            if not segment.words:
                continue
            for w in segment.words:
                if w.start is None or w.end is None:
                    continue
                words.append({
                    "word": w.word.strip(),
                    "start": float(w.start),
                    "end": float(w.end)
                })
        return words

    def find_sentence_boundaries(self, words):
        """Locate sentence-terminating words ('.', '!' or '?').

        Returns:
            (first_idx, last_idx): indices of the first and last words
            that end a sentence, or (None, None) when no terminator exists.
        """
        first_period_idx = None
        last_period_idx = None
        for idx, word_data in enumerate(words):
            # str.endswith accepts a tuple of suffixes — one call instead
            # of three chained `or` checks.
            if word_data["word"].endswith(('.', '!', '?')):
                if first_period_idx is None:
                    first_period_idx = idx
                last_period_idx = idx
        return first_period_idx, last_period_idx

    def create_linear_subtitles(self, words, max_words=None):
        """Build a SubRipFile from timed words.

        Layout: first sentence as the first subtitle, middle content in
        the linear (1, 2, 3, ...) pattern, the last terminator word plus
        any trailing fragment as the final subtitle. `max_words` caps the
        word count of any single subtitle.
        """
        subs = pysrt.SubRipFile()
        if not words:
            return subs
        total_words = len(words)
        first_period_idx, last_period_idx = self.find_sentence_boundaries(words)
        # No sentence terminators at all: fall back to the plain pattern.
        if first_period_idx is None:
            return self._create_basic_linear_subtitles(words, max_words=max_words)
        # Exactly one sentence: emit it as a single subtitle.
        if first_period_idx == last_period_idx:
            self._add_subtitle(subs, 1, words, 0, total_words)
            return subs
        subtitle_index = 1
        # 1. First sentence as the first subtitle.
        first_sentence_words = words[:first_period_idx + 1]
        self._add_subtitle(subs, subtitle_index, first_sentence_words, 0, len(first_sentence_words))
        subtitle_index += 1
        # 2. Middle content (exclusive of the last terminator word).
        middle_start = first_period_idx + 1
        middle_end = last_period_idx
        if middle_start < middle_end:
            subtitle_index = self._add_linear_pattern(
                subs, words[middle_start:middle_end], subtitle_index, max_words=max_words
            )
        # 3. Last terminator word plus any trailing (unterminated) words.
        last_sentence_words = words[last_period_idx:]
        if last_sentence_words:
            self._add_subtitle(subs, subtitle_index, last_sentence_words, 0, len(last_sentence_words))
        return subs

    def _add_subtitle(self, subs, index, words, start_idx, end_idx):
        """Append one subtitle covering words[start_idx:end_idx].

        No-op when the range is empty or out of bounds.
        """
        if start_idx >= end_idx or start_idx >= len(words):
            return
        subtitle_words = []
        start_time = None
        end_time = None
        for i in range(start_idx, min(end_idx, len(words))):
            w = words[i]
            subtitle_words.append(w["word"])
            if start_time is None:
                start_time = w["start"]
            end_time = w["end"]  # keeps advancing to the last word's end
        if subtitle_words:
            subs.append(
                pysrt.SubRipItem(
                    index=index,
                    start=self._to_time(start_time),
                    end=self._to_time(end_time),
                    text=" ".join(subtitle_words)
                )
            )

    def _add_linear_pattern(self, subs, words, start_index, max_words=None):
        """Append subtitles of growing size (1, 2, 3, ... words) from `words`.

        If `max_words` is given, no subtitle exceeds it; once the linear
        size reaches `max_words` it stays there. A would-be tiny trailing
        subtitle is absorbed into the previous one.

        Returns the next free subtitle index.
        """
        total_words = len(words)
        index = 0
        subtitle_index = start_index
        current_size = 1
        while index < total_words:
            planned_size = current_size
            if max_words is not None:
                planned_size = min(planned_size, max_words)
            remaining = total_words - (index + planned_size)
            next_size = current_size + 1
            # Absorb leftovers to avoid a tiny last subtitle.
            if remaining > 0 and remaining < next_size:
                planned_size += remaining
            subtitle_words = []
            start_time = None
            end_time = None
            for _ in range(planned_size):
                if index >= total_words:
                    break
                w = words[index]
                subtitle_words.append(w["word"])
                if start_time is None:
                    start_time = w["start"]
                end_time = w["end"]
                index += 1
            if subtitle_words:
                subs.append(
                    pysrt.SubRipItem(
                        index=subtitle_index,
                        start=self._to_time(start_time),
                        end=self._to_time(end_time),
                        text=" ".join(subtitle_words)
                    )
                )
                subtitle_index += 1
            # Grow the size only if we did NOT absorb leftovers; after an
            # absorption every word has been consumed, so stop.
            if planned_size == current_size:
                if max_words is None or current_size < max_words:
                    current_size += 1
                else:
                    # Stay at max_words for subsequent subtitles.
                    current_size = max_words
            else:
                break
        return subtitle_index

    def _create_basic_linear_subtitles(self, words, max_words=None):
        """Fallback when no sentence terminators are found: apply the
        linear pattern to the entire word list.

        Delegates to `_add_linear_pattern` — the previous version carried
        a verbatim copy of that loop, which had to be kept in sync by hand.
        """
        subs = pysrt.SubRipFile()
        self._add_linear_pattern(subs, words, 1, max_words=max_words)
        return subs

    def _to_time(self, seconds):
        """Convert float seconds into a pysrt.SubRipTime.

        Computed from total milliseconds rather than `timedelta.seconds`:
        the latter wraps at 24 hours (the days component was silently
        dropped, corrupting timestamps for very long audio).
        """
        total_ms = int(round(seconds * 1000))
        hours, rem = divmod(total_ms, 3_600_000)
        minutes, rem = divmod(rem, 60_000)
        secs, millis = divmod(rem, 1000)
        return pysrt.SubRipTime(
            hours=hours,
            minutes=minutes,
            seconds=secs,
            milliseconds=millis
        )
# -----------------------------
# Helper: download audio from URL
# -----------------------------
def download_audio(url: str) -> str:
    """Download an http(s) audio URL into a temp file and return its path.

    Raises:
        ValueError: if the URL scheme is not http/https.
        requests.RequestException: on network or HTTP errors.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL scheme")
    # Preserve the original extension so downstream tools can sniff it.
    suffix = os.path.splitext(parsed.path)[1] or ".wav"
    # Context manager closes the streamed connection even on error
    # (the previous version leaked the response/connection).
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        tmp.write(chunk)
        except Exception:
            # Don't leave a partial download behind on failure.
            os.unlink(tmp.name)
            raise
    return tmp.name
# -----------------------------
# Helper: format elapsed time
# -----------------------------
def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    Under a minute: one decimal of seconds ("7.3s"); under an hour:
    "Xm Ys"; otherwise "Xh Ym".
    """
    if seconds >= 3600:
        whole = int(seconds)
        return f"{whole // 3600}h {(whole % 3600) // 60}m"
    if seconds >= 60:
        whole = int(seconds)
        return f"{whole // 60}m {whole % 60}s"
    return f"{seconds:.1f}s"
# -----------------------------
# Gradio callable function with status updates
# -----------------------------
def generate_srt(audio_file, audio_url, model_size):
    """Gradio generator callback.

    Streams (srt_file, status_text) updates: `srt_file` is None until the
    final yield, which carries the path of the generated .srt. Exactly one
    of `audio_file` / `audio_url` must be provided.
    """
    start_time = time.time()
    status_messages = []
    downloaded_path = None  # temp audio WE created (never a user upload)
    try:
        # Validation: require exactly one input source.
        if bool(audio_file) == bool(audio_url):
            # BUG FIX: the original did `return None, error_msg` here, but
            # this function is a generator — a returned value is swallowed
            # by StopIteration and Gradio never displays it. Yield instead.
            yield None, "❌ Error: Please provide EITHER an audio file OR an audio URL (not both)."
            return
        status_messages.append("🚀 Starting subtitle generation...")
        yield None, "\n".join(status_messages)
        # Step 1: obtain the audio file.
        if audio_url:
            status_messages.append("📥 Downloading audio from URL...")
            yield None, "\n".join(status_messages)
            download_start = time.time()
            audio_path = download_audio(audio_url)
            downloaded_path = audio_path  # remember for cleanup
            download_time = time.time() - download_start
            status_messages.append(f"✓ Download completed in {format_time(download_time)}")
            yield None, "\n".join(status_messages)
        else:
            audio_path = audio_file
            status_messages.append("✓ Audio file loaded")
            yield None, "\n".join(status_messages)
        # Step 2: load the Whisper model.
        status_messages.append(f"🧠 Loading Whisper model ({model_size})...")
        yield None, "\n".join(status_messages)
        model_start = time.time()
        generator = LinearSubtitleGenerator(model_size)
        model_time = time.time() - model_start
        status_messages.append(f"✓ Model loaded in {format_time(model_time)}")
        yield None, "\n".join(status_messages)
        # Step 3: transcribe.
        status_messages.append("🎤 Transcribing audio (this may take a while)...")
        yield None, "\n".join(status_messages)
        transcribe_start = time.time()
        segments = generator.transcribe(audio_path)
        words = generator.extract_words(segments)
        transcribe_time = time.time() - transcribe_start
        status_messages.append(f"✓ Transcription completed in {format_time(transcribe_time)}")
        status_messages.append(f"📊 Extracted {len(words)} words")
        yield None, "\n".join(status_messages)
        # Step 4: generate subtitles.
        status_messages.append("📝 Generating SRT subtitles...")
        yield None, "\n".join(status_messages)
        srt_start = time.time()
        subs = generator.create_linear_subtitles(words, max_words=DEFAULT_MAX_WORDS)
        srt_time = time.time() - srt_start
        status_messages.append(f"✓ Created {len(subs)} subtitle segments in {format_time(srt_time)}")
        yield None, "\n".join(status_messages)
        # Step 5: save the SRT file.
        status_messages.append("💾 Saving SRT file...")
        yield None, "\n".join(status_messages)
        out = tempfile.NamedTemporaryFile(delete=False, suffix=".srt")
        out.close()  # close our handle; pysrt writes to the path itself
        subs.save(out.name, encoding="utf-8")
        total_time = time.time() - start_time
        # Final success message, delivering the file path to Gradio.
        status_messages.append(f"✅ SUCCESS! Total time: {format_time(total_time)}")
        status_messages.append(f"📁 SRT file ready for download")
        yield out.name, "\n".join(status_messages)
    except requests.RequestException as e:
        yield None, f"❌ Network Error: Failed to download audio\nDetails: {str(e)}"
    except ValueError as e:
        yield None, f"❌ Validation Error: {str(e)}"
    except Exception as e:
        total_time = time.time() - start_time
        yield None, f"❌ Error occurred after {format_time(total_time)}\nDetails: {str(e)}"
    finally:
        # Best-effort cleanup of the audio we downloaded ourselves.
        if downloaded_path:
            try:
                os.unlink(downloaded_path)
            except OSError:
                pass
# -----------------------------
# Gradio UI with Status Bar
# -----------------------------
# Assemble the Gradio UI: input widgets, a streaming status box, and the
# downloadable SRT output.
with gr.Blocks(title="Subtitle Generator") as demo:
    gr.Markdown(
        """
        # SRT Generator with Smart Sentence Handling
        **Features:**
        - First sentence → First subtitle
        - Middle content → Linear pattern (1, 2, 3, 4... words)
        - Last sentence → Last subtitle
        """
    )
    with gr.Row():
        # Exactly one of the two audio inputs should be filled in;
        # generate_srt rejects zero or both.
        audio_file = gr.Audio(
            label="Upload Audio File",
            type="filepath"
        )
        audio_url = gr.Textbox(
            label="Audio URL (http/https)",
            placeholder="https://example.com/audio.wav"
        )
        model_choice = gr.Dropdown(
            choices=["tiny", "base", "small", "medium"],
            value="base",
            label="Whisper Model"
        )
    generate_btn = gr.Button("Generate SRT", variant="primary")
    # Live status log — generate_srt yields into this box as it progresses.
    status_box = gr.Textbox(
        label="Status",
        placeholder="Status updates will appear here...",
        lines=10,
        max_lines=15,
        interactive=False
    )
    output_file = gr.File(label="Download SRT")
    # Wire the button to the generator callback; since generate_srt yields,
    # Gradio streams each (file, status) pair to the two outputs.
    generate_btn.click(
        fn=generate_srt,
        inputs=[audio_file, audio_url, model_choice],
        outputs=[output_file, status_box]
    )
    gr.Markdown(
        """
        ---
        **Tips:**
        - Larger models (small/medium) are more accurate but slower
        - For best results, use clear audio with minimal background noise
        - Processing time depends on audio length and model size
        """
    )
if __name__ == "__main__":
    demo.launch()