# audio-to-srt
#
# Sleeping
#
# File size: 15,748 Bytes

import os
# Pin the OpenMP and MKL thread-count environment variables to "1".
# These assignments run before the third-party imports below -- presumably so
# the native libraries read the limit when they are loaded; confirm before
# moving these lines.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import gradio as gr
import pysrt
import requests
import tempfile
import time
from faster_whisper import WhisperModel
from datetime import timedelta
from urllib.parse import urlparse

# Maximum words per subtitle (set to None to disable).
# generate_srt passes this as `max_words` to create_linear_subtitles, which
# forwards it to the linear-pattern subtitles only; the first-sentence and
# last-sentence subtitles are added without this limit.
DEFAULT_MAX_WORDS = 18

# -----------------------------
# Core subtitle generator 
# -----------------------------
class LinearSubtitleGenerator:
    def __init__(self, model_size="base", device="cpu", compute_type="int8"):
        """Load the faster-whisper model used for transcription.

        Args:
            model_size: Model name handed to ``WhisperModel`` (the UI in this
                file offers "tiny", "base", "small" and "medium").
            device: Device string forwarded to ``WhisperModel``. Defaults to
                "cpu", which was previously hard-coded.
            compute_type: Quantisation/compute type forwarded to
                ``WhisperModel``. Defaults to "int8", which was previously
                hard-coded.
        """
        self.model = WhisperModel(
            model_size,
            device=device,
            compute_type=compute_type,
        )

    def transcribe(self, audio_path):
        segments, _ = self.model.transcribe(
            audio_path,
            word_timestamps=True,
            vad_filter=True
        )
        return segments

    def extract_words(self, segments):
        words = []
        for segment in segments:
            if not segment.words:
                continue
            for w in segment.words:
                if w.start is None or w.end is None:
                    continue
                words.append({
                    "word": w.word.strip(),
                    "start": float(w.start),
                    "end": float(w.end)
                })
        return words

    def find_sentence_boundaries(self, words):
        """
        Find first and last sentence boundaries based on periods.
        Returns: (first_period_idx, last_period_idx)
        """
        first_period_idx = None
        last_period_idx = None
        
        for idx, word_data in enumerate(words):
            word = word_data["word"]
            # Check if word ends with period (and not abbreviation)
            if word.endswith('.') or word.endswith('!') or word.endswith('?'):
                if first_period_idx is None:
                    first_period_idx = idx
                last_period_idx = idx
        
        return first_period_idx, last_period_idx

    def create_linear_subtitles(self, words, max_words=None):
        """
        Create subtitles with:
        - First sentence as first subtitle
        - Middle content with linear pattern (1, 2, 3, 4... words)
        - Last sentence as last subtitle

        Args:
            words: List of ``{"word", "start", "end"}`` dicts in transcript
                order (the shape produced by ``extract_words``).
            max_words: Forwarded to the linear pattern used for the middle
                content (and for the no-punctuation fallback). The first and
                last sentence subtitles are not limited by it.

        Returns:
            A ``pysrt.SubRipFile``; empty when ``words`` is empty.
        """
        subs = pysrt.SubRipFile()

        if not words:
            return subs

        total_words = len(words)
        first_period_idx, last_period_idx = self.find_sentence_boundaries(words)

        # Edge case: no sentence-ending punctuation at all - plain linear pattern.
        if first_period_idx is None:
            return self._create_basic_linear_subtitles(words, max_words=max_words)

        # Edge case: only one sentence end found - everything is one subtitle.
        if first_period_idx == last_period_idx:
            self._add_subtitle(subs, 1, words, 0, total_words)
            return subs

        # Work out where the LAST sentence starts. `last_period_idx` is the
        # word that *ends* the last terminated sentence, so it cannot be used
        # as the start of the last sentence directly.
        if last_period_idx < total_words - 1:
            # Unterminated words trail the final punctuation mark: they are
            # the last sentence.
            last_start = last_period_idx + 1
        else:
            # The transcript ends on punctuation: the last sentence begins
            # right after the previous sentence end. The first sentence end is
            # the fallback, since it is known to precede `last_period_idx`.
            # The terminator set mirrors find_sentence_boundaries.
            last_start = first_period_idx + 1
            for idx in range(last_period_idx - 1, first_period_idx, -1):
                if words[idx]["word"].endswith(('.', '!', '?')):
                    last_start = idx + 1
                    break

        subtitle_index = 1

        # 1. First sentence as first subtitle
        self._add_subtitle(subs, subtitle_index, words, 0, first_period_idx + 1)
        subtitle_index += 1

        # 2. Middle content (complete sentences between first and last) with
        #    the linear pattern
        middle_words = words[first_period_idx + 1:last_start]
        if middle_words:
            subtitle_index = self._add_linear_pattern(
                subs, middle_words, subtitle_index, max_words=max_words
            )

        # 3. Last sentence as last subtitle
        self._add_subtitle(subs, subtitle_index, words, last_start, total_words)

        return subs

    def _add_subtitle(self, subs, index, words, start_idx, end_idx):
        """Append one subtitle covering ``words[start_idx:end_idx]`` to ``subs``.

        Nothing is appended when the range is empty or starts past the end of
        ``words``; ``end_idx`` is clamped to ``len(words)``. The subtitle
        starts at the first selected word's start time, ends at the last
        selected word's end time, and its text is the words joined by spaces.
        """
        word_count = len(words)
        if start_idx >= end_idx or start_idx >= word_count:
            return

        picked = [words[pos] for pos in range(start_idx, min(end_idx, word_count))]
        if not picked:
            return

        # First start value that is not None, matching a scan that keeps
        # looking until one is found.
        begins_at = next(
            (entry["start"] for entry in picked if entry["start"] is not None),
            None,
        )
        finishes_at = picked[-1]["end"]

        item = pysrt.SubRipItem(
            index=index,
            start=self._to_time(begins_at),
            end=self._to_time(finishes_at),
            text=" ".join(entry["word"] for entry in picked)
        )
        subs.append(item)

    def _add_linear_pattern(self, subs, words, start_index, max_words=None):
        """Apply linear pattern (1, 2, 3, 4... words) to words list

        Subtitles are appended to ``subs`` with growing word counts. When the
        words left after a subtitle would be fewer than the next planned
        size, they are absorbed into the current subtitle to avoid a tiny
        trailing one - but only if that keeps the subtitle within
        ``max_words``. Otherwise the leftovers become their own, shorter,
        final subtitle.

        If `max_words` is provided, no subtitle will contain more than
        `max_words` words. Once the linear size reaches `max_words` it
        will remain at that size for subsequent subtitles.

        Args:
            subs: ``pysrt.SubRipFile`` to append to.
            words: List of ``{"word", "start", "end"}`` dicts.
            start_index: Subtitle index to assign to the first item added.
            max_words: Optional positive cap on words per subtitle.

        Returns:
            The next unused subtitle index.

        Raises:
            ValueError: If ``max_words`` is given but smaller than 1.
        """
        if max_words is not None and max_words < 1:
            # A cap below 1 can never place a word; fail loudly instead of
            # silently dropping the words.
            raise ValueError("max_words must be a positive integer or None")

        total_words = len(words)
        index = 0
        subtitle_index = start_index
        current_size = 1

        while index < total_words:
            planned_size = current_size
            if max_words is not None:
                planned_size = min(planned_size, max_words)
            remaining = total_words - (index + planned_size)
            next_size = current_size + 1

            # Absorb leftovers to avoid a tiny last subtitle, unless doing so
            # would break the max_words guarantee.
            if 0 < remaining < next_size:
                if max_words is None or planned_size + remaining <= max_words:
                    planned_size += remaining

            # Non-empty: index < total_words and planned_size >= 1.
            chunk = words[index:index + planned_size]
            index += len(chunk)

            subs.append(
                pysrt.SubRipItem(
                    index=subtitle_index,
                    start=self._to_time(chunk[0]["start"]),
                    end=self._to_time(chunk[-1]["end"]),
                    text=" ".join(w["word"] for w in chunk)
                )
            )
            subtitle_index += 1

            # Grow the next subtitle until the configured maximum is reached.
            # After an absorb, index == total_words and the loop ends anyway.
            if max_words is None or current_size < max_words:
                current_size += 1

        return subtitle_index

    def _create_basic_linear_subtitles(self, words, max_words=None):
        """Fallback: Original linear pattern when no periods found

        Builds a new ``pysrt.SubRipFile`` whose subtitles hold 1, 2, 3, ...
        words. Leftover words that would form a subtitle smaller than the next
        planned size are absorbed into the current subtitle, but only if that
        keeps it within ``max_words``; otherwise they become a shorter final
        subtitle.

        If ``max_words`` is provided, no subtitle contains more than
        ``max_words`` words, and the size stays at ``max_words`` once reached.

        Args:
            words: List of ``{"word", "start", "end"}`` dicts.
            max_words: Optional positive cap on words per subtitle.

        Returns:
            The populated ``pysrt.SubRipFile`` (empty when ``words`` is empty).

        Raises:
            ValueError: If ``max_words`` is given but smaller than 1.
        """
        if max_words is not None and max_words < 1:
            # A cap below 1 previously produced a subtitle with no timestamps
            # and crashed while converting None to a time.
            raise ValueError("max_words must be a positive integer or None")

        subs = pysrt.SubRipFile()
        total_words = len(words)
        index = 0
        subtitle_index = 1
        current_size = 1

        while index < total_words:
            planned_size = current_size
            if max_words is not None:
                planned_size = min(planned_size, max_words)
            remaining = total_words - (index + planned_size)
            next_size = current_size + 1

            # Absorb leftovers only when the result still respects max_words.
            if 0 < remaining < next_size:
                if max_words is None or planned_size + remaining <= max_words:
                    planned_size += remaining

            # Non-empty: index < total_words and planned_size >= 1.
            chunk = words[index:index + planned_size]
            index += len(chunk)

            subs.append(
                pysrt.SubRipItem(
                    index=subtitle_index,
                    start=self._to_time(chunk[0]["start"]),
                    end=self._to_time(chunk[-1]["end"]),
                    text=" ".join(w["word"] for w in chunk)
                )
            )
            subtitle_index += 1

            # Grow the next subtitle until the configured maximum is reached.
            if max_words is None or current_size < max_words:
                current_size += 1

        return subs

    def _to_time(self, seconds):
        """Convert a time offset in seconds to a ``pysrt.SubRipTime``.

        The offset is truncated to whole milliseconds. Offsets of 24 hours or
        more keep their full hour count (``timedelta.seconds`` alone drops the
        ``days`` component), and negative offsets are clamped to zero rather
        than wrapping round to a time just under 24 hours.
        """
        # Floor-dividing by one millisecond includes the `days` component.
        total_ms = timedelta(seconds=seconds) // timedelta(milliseconds=1)
        total_ms = max(0, total_ms)

        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)

        return pysrt.SubRipTime(
            hours=hours,
            minutes=minutes,
            seconds=secs,
            milliseconds=millis
        )

# -----------------------------
# Helper: download audio from URL
# -----------------------------
def download_audio(url: str) -> str:
    """Download ``url`` to a temporary file and return the file's path.

    Only ``http`` and ``https`` URLs are accepted. The temporary file keeps
    the extension found in the URL path (".wav" when there is none) and is
    NOT deleted automatically: the caller owns the returned path.

    Raises:
        ValueError: If the URL scheme is not http/https.
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("Invalid URL scheme")

    suffix = os.path.splitext(parsed.path)[1] or ".wav"

    # NOTE(review): `url` is user-supplied. There is no host allow-list and no
    # download size cap here, so this can reach internal addresses or fill the
    # disk. Consider adding both if the app is exposed publicly.
    # The `with` block releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            with tmp:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        tmp.write(chunk)
        except BaseException:
            # Do not leave a partial download behind. The file is already
            # closed here; removal is best-effort and must not hide the
            # original error.
            try:
                os.remove(tmp.name)
            except OSError:
                pass
            raise

    return tmp.name

# -----------------------------
# Helper: format elapsed time
# -----------------------------
def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    - under a minute: one decimal place, e.g. "12.3s"
    - under an hour: whole minutes and seconds, e.g. "2m 5s"
    - otherwise: whole hours and minutes, e.g. "1h 2m"
    """
    if seconds < 60:
        return f"{seconds:.1f}s"

    if seconds < 3600:
        whole_minutes, leftover = divmod(seconds, 60)
        return f"{int(whole_minutes)}m {int(leftover)}s"

    whole_hours, leftover = divmod(seconds, 3600)
    return f"{int(whole_hours)}h {int(leftover // 60)}m"

# -----------------------------
# Gradio callable function with status updates
# -----------------------------
def generate_srt(audio_file, audio_url, model_size):
    """Generate an SRT file from an upload or a URL, streaming status updates.

    This is a generator: every item it yields is a ``(srt_path, status_text)``
    pair. ``srt_path`` is None until the final, successful item. Errors are
    reported as a yielded status message rather than raised.

    Args:
        audio_file: Path of the uploaded audio file, or a falsy value.
        audio_url: http/https URL of the audio, or a falsy/blank value.
        model_size: Whisper model name passed to ``LinearSubtitleGenerator``.

    Exactly one of ``audio_file`` and ``audio_url`` must be provided.
    """
    start_time = time.time()
    status_messages = []
    # Temp file created by download_audio; removed in `finally` below.
    downloaded_path = None

    def progress(message):
        """Log ``message`` and build the (file, status) pair to yield."""
        status_messages.append(message)
        return None, "\n".join(status_messages)

    try:
        # A whitespace-only URL box counts as "no URL".
        audio_url = (audio_url or "").strip()

        # Validation
        if bool(audio_file) == bool(audio_url):
            error_msg = "❌ Error: Please provide EITHER an audio file OR an audio URL (not both)."
            # This function is a generator, so the message must be yielded:
            # a `return value` here would never reach the caller's outputs.
            yield None, error_msg
            return

        yield progress("🚀 Starting subtitle generation...")

        # Step 1: Get audio file
        if audio_url:
            yield progress("📥 Downloading audio from URL...")

            download_start = time.time()
            downloaded_path = download_audio(audio_url)
            audio_path = downloaded_path
            download_time = time.time() - download_start

            yield progress(f"✓ Download completed in {format_time(download_time)}")
        else:
            audio_path = audio_file
            yield progress("✓ Audio file loaded")

        # Step 2: Load model
        yield progress(f"🧠 Loading Whisper model ({model_size})...")

        model_start = time.time()
        generator = LinearSubtitleGenerator(model_size)
        model_time = time.time() - model_start

        yield progress(f"✓ Model loaded in {format_time(model_time)}")

        # Step 3: Transcribe
        yield progress("🎤 Transcribing audio (this may take a while)...")

        transcribe_start = time.time()
        segments = generator.transcribe(audio_path)
        words = generator.extract_words(segments)
        transcribe_time = time.time() - transcribe_start

        status_messages.append(f"✓ Transcription completed in {format_time(transcribe_time)}")
        yield progress(f"📊 Extracted {len(words)} words")

        # Step 4: Generate subtitles
        yield progress("📝 Generating SRT subtitles...")

        srt_start = time.time()
        subs = generator.create_linear_subtitles(words, max_words=DEFAULT_MAX_WORDS)
        srt_time = time.time() - srt_start

        yield progress(f"✓ Created {len(subs)} subtitle segments in {format_time(srt_time)}")

        # Step 5: Save file
        yield progress("💾 Saving SRT file...")

        # Reserve a path, then close the handle before pysrt writes to it so
        # no descriptor is leaked and the file is not held open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".srt") as out:
            srt_path = out.name
        subs.save(srt_path, encoding="utf-8")

        # Calculate total time
        total_time = time.time() - start_time

        # Final success message
        status_messages.append(f"✅ SUCCESS! Total time: {format_time(total_time)}")
        status_messages.append("📁 SRT file ready for download")

        yield srt_path, "\n".join(status_messages)

    except requests.RequestException as e:
        error_msg = f"❌ Network Error: Failed to download audio\nDetails: {str(e)}"
        yield None, error_msg

    except ValueError as e:
        error_msg = f"❌ Validation Error: {str(e)}"
        yield None, error_msg

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"❌ Error occurred after {format_time(total_time)}\nDetails: {str(e)}"
        yield None, error_msg

    finally:
        # The downloaded audio is only needed for transcription; remove it on
        # success, on failure, and when the generator is closed early.
        if downloaded_path is not None:
            try:
                os.remove(downloaded_path)
            except OSError:
                # Best-effort cleanup: a missing or locked temp file must not
                # turn a finished run into an error.
                pass

# -----------------------------
# Gradio UI with Status Bar
# -----------------------------
# Build the Gradio Blocks UI: two alternative audio inputs (upload or URL),
# a model selector, a status box and a download slot, wired to generate_srt.
with gr.Blocks(title="Subtitle Generator") as demo:
    gr.Markdown(
        """
        # SRT Generator with Smart Sentence Handling
        
        **Features:**
        - First sentence → First subtitle
        - Middle content → Linear pattern (1, 2, 3, 4... words)
        - Last sentence → Last subtitle
        """
    )

    # The two inputs sit side by side; generate_srt rejects the request unless
    # exactly one of them is filled in.
    with gr.Row():
        # type="filepath": the upload is handed to generate_srt as a path.
        audio_file = gr.Audio(
            label="Upload Audio File",
            type="filepath"
        )

        audio_url = gr.Textbox(
            label="Audio URL (http/https)",
            placeholder="https://example.com/audio.wav"
        )

    # The selected value is passed to generate_srt as `model_size`.
    model_choice = gr.Dropdown(
        choices=["tiny", "base", "small", "medium"],
        value="base",
        label="Whisper Model"
    )

    generate_btn = gr.Button("Generate SRT", variant="primary")

    # Status display
    status_box = gr.Textbox(
        label="Status",
        placeholder="Status updates will appear here...",
        lines=10,
        max_lines=15,
        interactive=False
    )

    output_file = gr.File(label="Download SRT")

    # Event handler
    # generate_srt yields (file, status) pairs; they correspond positionally
    # to [output_file, status_box].
    generate_btn.click(
        fn=generate_srt,
        inputs=[audio_file, audio_url, model_choice],
        outputs=[output_file, status_box]
    )

    gr.Markdown(
        """
        ---
        **Tips:**
        - Larger models (small/medium) are more accurate but slower
        - For best results, use clear audio with minimal background noise
        - Processing time depends on audio length and model size
        """
    )

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()