Spaces:

danielrosehill
/

Audio-To-Subtitles

Running

File size: 22,609 Bytes

import gradio as gr
import whisper
import torch
from pyannote.audio import Pipeline
from pydub import AudioSegment
import re
import os
from typing import List, Dict, Tuple
import tempfile

# Detect and use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load models (will be cached after first load)
print("Loading Whisper model...")
whisper_model = whisper.load_model("large-v2", device=device)  # Load on GPU if available
print(f"Whisper model loaded on {device}")

# Diarization pipeline will be loaded on-demand with user's token

# Filler words and minimal vocalizations to remove
FILLER_WORDS = [
    r'\buh\b', r'\bum\b', r'\bmmm+\b', r'\bmm+\b', r'\bhmm+\b',
    r'\bahh+\b', r'\buhh+\b', r'\berr+\b', r'\boh\b',
    r'\byou know\b', r'\blike\b', r'\bbasically\b', r'\bliterally\b',
    r'\bactually\b', r'\bokay\b', r'\bright\b', r'\byeah\b',
    r'\buh-huh\b', r'\bmhm\b', r'\bnah\b'
]

def convert_to_wav(audio_path: str) -> str:
    """Convert audio file to WAV format for processing."""
    audio = AudioSegment.from_file(audio_path)
    wav_path = tempfile.mktemp(suffix=".wav")
    audio.export(wav_path, format="wav")
    return wav_path

def clean_text(text: str) -> str:
    """Remove filler words, stutters, and clean up text."""
    # Remove filler words
    for filler in FILLER_WORDS:
        text = re.sub(filler, '', text, flags=re.IGNORECASE)

    # Remove stutters (e.g., "I-I-I" -> "I")
    text = re.sub(r'\b(\w+)(-\1)+\b', r'\1', text)

    # Clean up extra spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()

    return text

def identify_speaker(speaker_label: str, voice_mapping: Dict[str, str] = None) -> str:
    """
    Identify speaker based on diarization label and user-provided voice mapping.

    Args:
        speaker_label: The speaker label from diarization (e.g., "SPEAKER_00")
        voice_mapping: Dictionary mapping speaker labels to names

    Returns:
        The identified speaker name
    """
    if voice_mapping and speaker_label in voice_mapping:
        return voice_mapping[speaker_label]
    else:
        # Fallback for unmapped speakers
        speaker_num = speaker_label.split("_")[-1] if "_" in speaker_label else "00"
        return f"Speaker {speaker_num}"

def format_timestamp(seconds: float) -> str:
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences for better subtitle formatting."""
    # Split on sentence boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

def process_audio_to_srt(
    audio_path: str,
    hf_token: str,
    voice1_name: str = "",
    voice1_desc: str = "",
    voice2_name: str = "",
    voice2_desc: str = "",
    voice3_name: str = "",
    voice3_desc: str = "",
    progress=gr.Progress()
) -> Tuple[str, str]:
    """
    Main processing function: STT + Diarization + SRT generation.

    Args:
        audio_path: Path to the audio file
        hf_token: Hugging Face API token for accessing Pyannote models
        voice1_name: Name for the first voice
        voice1_desc: Description for the first voice
        voice2_name: Name for the second voice
        voice2_desc: Description for the second voice
        voice3_name: Name for the third voice
        voice3_desc: Description for the third voice
        progress: Gradio progress tracker

    Returns: (srt_content, debug_info)
    """
    # Validate HF token
    if not hf_token or not hf_token.strip():
        return "Error: Hugging Face token is required. Please provide your HF token.", "Token validation failed"

    # Build voice mapping from user inputs
    voice_mapping = {}
    if voice1_name.strip():
        voice_mapping["SPEAKER_00"] = voice1_name.strip()
    if voice2_name.strip():
        voice_mapping["SPEAKER_01"] = voice2_name.strip()
    if voice3_name.strip():
        voice_mapping["SPEAKER_02"] = voice3_name.strip()
    try:
        progress(0, desc="Loading Pyannote diarization pipeline...")

        # Load diarization pipeline with user's token
        try:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                token=hf_token.strip()
            )
            # Move to GPU if available
            if device == "cuda":
                diarization_pipeline.to(torch.device(device))
        except Exception as e:
            error_msg = str(e)
            if "gated repo" in error_msg.lower() or "agreement" in error_msg.lower():
                return ("Error: You need to accept the user agreement for pyannote/speaker-diarization-3.1\n"
                       "Please visit: https://huggingface.co/pyannote/speaker-diarization-3.1\n"
                       "Accept the agreement, then try again."), f"Pipeline loading failed: {error_msg}"
            elif "token" in error_msg.lower() or "unauthorized" in error_msg.lower():
                return ("Error: Invalid Hugging Face token. Please check your token and try again.\n"
                       "Get your token at: https://huggingface.co/settings/tokens"), f"Token validation failed: {error_msg}"
            else:
                return f"Error loading diarization pipeline: {error_msg}", f"Pipeline loading failed: {error_msg}"

        progress(0.05, desc="Converting audio to WAV format...")

        # Convert to WAV if needed
        if not audio_path.endswith('.wav'):
            wav_path = convert_to_wav(audio_path)
        else:
            wav_path = audio_path

        # Step 1: Transcribe with Whisper
        progress(0.1, desc="Starting Whisper transcription (this may take 2-5 minutes)...")
        result = whisper_model.transcribe(
            wav_path,
            language="en",
            word_timestamps=True,
            verbose=False,
            fp16=(device == "cuda")  # Use FP16 on GPU for faster processing
        )

        # Step 2: Perform speaker diarization
        progress(0.4, desc="Transcription complete! Now analyzing speakers with Pyannote...")
        progress(0.45, desc="Pyannote: Loading audio and extracting features...")
        progress(0.5, desc="Pyannote: Detecting speaker segments (this is the longest step - 3-10 minutes)...")
        diarization = diarization_pipeline(wav_path)

        # Step 3: Align transcription with speaker labels
        progress(0.75, desc="Diarization complete! Matching speakers to transcription...")

        # Create a list of speaker segments
        speaker_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker
            })

        # Match words to speakers
        segments_with_speakers = []
        for segment in result['segments']:
            segment_start = segment['start']
            segment_end = segment['end']
            segment_text = segment['text'].strip()

            # Find the speaker for this segment (based on overlap)
            speaker = None
            max_overlap = 0

            for spk_seg in speaker_segments:
                overlap_start = max(segment_start, spk_seg['start'])
                overlap_end = min(segment_end, spk_seg['end'])
                overlap_duration = max(0, overlap_end - overlap_start)

                if overlap_duration > max_overlap:
                    max_overlap = overlap_duration
                    speaker = spk_seg['speaker']

            if speaker:
                speaker_name = identify_speaker(speaker, voice_mapping)
                segments_with_speakers.append({
                    'start': segment_start,
                    'end': segment_end,
                    'text': segment_text,
                    'speaker': speaker_name
                })

        # Step 4: Generate SRT with formatting rules
        progress(0.85, desc="Cleaning text and formatting SRT subtitles...")

        srt_lines = []
        subtitle_number = 1

        for seg in segments_with_speakers:
            # Clean the text
            cleaned_text = clean_text(seg['text'])

            if not cleaned_text:
                continue

            # Split into sentences if needed
            sentences = split_into_sentences(cleaned_text)

            if not sentences:
                sentences = [cleaned_text]

            # Create subtitle blocks (one per sentence)
            for sentence in sentences:
                if not sentence:
                    continue

                start_time = format_timestamp(seg['start'])
                end_time = format_timestamp(seg['end'])

                # Format: subtitle number, timestamps, (Speaker) text
                srt_lines.append(f"{subtitle_number}")
                srt_lines.append(f"{start_time} --> {end_time}")
                srt_lines.append(f"({seg['speaker']}) {sentence}")
                srt_lines.append("")  # Blank line between subtitles

                subtitle_number += 1

        srt_content = "\n".join(srt_lines)

        # Clean up temporary file
        if wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)

        debug_info = f"Processed successfully!\nTotal segments: {len(segments_with_speakers)}\nTotal subtitles: {subtitle_number - 1}"

        progress(1.0, desc="Complete! SRT file ready for download.")
        return srt_content, debug_info

    except Exception as e:
        return f"Error: {str(e)}", f"Processing failed: {str(e)}"

def save_srt_file(srt_content: str) -> str:
    """Save SRT content to a temporary file for download."""
    if not srt_content or srt_content.startswith("Error"):
        return None

    temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False, encoding='utf-8')
    temp_file.write(srt_content)
    temp_file.close()
    return temp_file.name

# Create Gradio interface
with gr.Blocks(title="Audio to SRT Converter with Speaker Diarization", theme=gr.themes.Soft()) as demo:
    # Display GPU info
    gpu_info = f"Running on: {device.upper()}"
    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_info += f" ({gpu_name})"

    gr.Markdown(f"""
    # Audio to SRT Converter with Speaker Diarization

    Convert audio files to formatted SRT subtitles with automatic speaker detection and identification.

    <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
    <b>{gpu_info}</b> | Processing time: 5-15 minutes
    </div>
    """)

    with gr.Tabs():
        with gr.Tab("Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Step 1: Authentication")
                    gr.Markdown("""
                    <div style="background-color: #fff3cd; padding: 10px; border-radius: 5px; border-left: 4px solid #ffc107;">
                    <b>Required:</b> You need a Hugging Face token for speaker diarization.
                    </div>
                    """)

                    with gr.Accordion("How to get your token", open=False):
                        gr.Markdown("""
                        1. Create a free account at [Hugging Face](https://huggingface.co/join) (if you don't have one)
                        2. Get your token at [Settings → Access Tokens](https://huggingface.co/settings/tokens)
                        3. Accept the user agreement at [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
                        4. Paste your token below (starts with `hf_...`)
                        """)

                    hf_token_input = gr.Textbox(
                        label="Hugging Face Token",
                        placeholder="Enter your HF token here",
                        type="password",
                        max_lines=1,
                        info="Your token is not stored and only used for this session"
                    )

                    gr.Markdown("### Step 2: Upload Your Audio")
                    audio_input = gr.Audio(
                        label="Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    gr.Markdown("*Supports MP3, WAV, Opus, M4A, and most audio formats*")

                    gr.Markdown("### Step 3: Identify Speakers (Optional)")

                    with gr.Accordion("About speaker identification", open=False):
                        gr.Markdown("""
                        The system automatically detects up to 3 speakers in order of appearance.

                        - **Without names:** Speakers appear as "Speaker 00", "Speaker 01", etc.
                        - **With names:** Your custom names appear instead (e.g., "Daniel", "Sarah")
                        - **Descriptions:** Optional notes to help you identify speakers (not shown in output)

                        **Tip:** Listen to the first 30 seconds of your audio to identify who speaks first!
                        """)

                    with gr.Accordion("Voice 1 (First speaker)", open=False):
                        voice1_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Daniel, John, Host",
                            max_lines=1
                        )
                        voice1_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Male voice, asks questions, host",
                            max_lines=2
                        )

                    with gr.Accordion("Voice 2 (Second speaker)", open=False):
                        voice2_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Sarah, Guest, Interviewer",
                            max_lines=1
                        )
                        voice2_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Female voice, provides answers, expert",
                            max_lines=2
                        )

                    with gr.Accordion("Voice 3 (Third speaker)", open=False):
                        voice3_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Alex, Moderator",
                            max_lines=1
                        )
                        voice3_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Neutral voice, moderate pace",
                            max_lines=2
                        )

                    gr.Markdown("---")
                    process_btn = gr.Button(
                        "Generate SRT Subtitles",
                        variant="primary",
                        size="lg",
                        scale=1
                    )
                    gr.Markdown("""
                    <div style="background-color: #d1ecf1; padding: 10px; border-radius: 5px; margin-top: 10px;">
                    <b>Expected processing time:</b><br>
                    • Transcription: 2-5 minutes<br>
                    • Speaker detection: 3-10 minutes<br>
                    • Formatting: ~30 seconds<br>
                    <br>
                    Watch the progress bar for real-time updates!
                    </div>
                    """)

                with gr.Column(scale=1):
                    gr.Markdown("### Results")

                    srt_output = gr.Textbox(
                        label="Generated SRT Content",
                        lines=20,
                        max_lines=30,
                        show_copy_button=True,
                        placeholder="Your SRT subtitles will appear here after processing...",
                        info="Preview your subtitles or copy to clipboard"
                    )

                    download_btn = gr.File(
                        label="Download SRT File",
                        file_count="single"
                    )

                    debug_output = gr.Textbox(
                        label="Processing Info",
                        lines=3,
                        placeholder="Status updates will appear here..."
                    )

        with gr.Tab("Help & Info"):
            gr.Markdown("""
            ## How This Tool Works

            ### Process Overview

            1. **Audio Upload**
               - Upload any audio file (MP3, WAV, M4A, Opus, etc.)
               - File is automatically converted to WAV format for processing

            2. **Speech-to-Text Transcription**
               - Uses OpenAI's Whisper (large-v2 model)
               - Generates accurate word-level timestamps
               - Supports English language

            3. **Speaker Diarization**
               - Uses Pyannote Audio 3.1 for speaker detection
               - Automatically identifies up to 3 different speakers
               - Labels speakers in order of first appearance

            4. **Text Cleaning & Formatting**
               - Removes filler words (um, uh, like, you know, etc.)
               - Splits text into readable sentence blocks
               - Adds speaker labels to each subtitle
               - Generates standard SRT format

            ---

            ## Features

            - **Automatic speaker detection** - No manual marking needed
            - **Custom speaker names** - Replace "Speaker 00" with real names
            - **Clean text** - Filler words automatically removed
            - **Smart formatting** - One speaker per subtitle, one sentence per block
            - **Standard SRT format** - Works with all video players and editors
            - **GPU acceleration** - Fast processing on T4 GPU

            ---

            ## Tips for Best Results

            ### Before Processing
            - **Listen to the first minute** of your audio to identify speakers
            - **Note the order** speakers appear (first voice = Voice 1, etc.)
            - **Use clear names** for easy identification in subtitles

            ### Audio Quality
            - Better audio quality = more accurate transcription
            - Minimize background noise for best speaker detection
            - Clear speech separation helps diarization accuracy

            ### Speaker Identification
            - You don't need to fill in all 3 voices if you have fewer speakers
            - If you skip speaker names, output will show "Speaker 00", "Speaker 01", etc.
            - Descriptions are just for your reference and don't affect the output

            ---

            ## Output Format

            Your SRT file will look like this:

            ```
            1
            00:00:01,234 --> 00:00:05,678
            (Daniel) Welcome to the podcast.

            2
            00:00:06,123 --> 00:00:10,456
            (Sarah) Thanks for having me.

            3
            00:00:11,789 --> 00:00:15,234
            (Daniel) Let's dive into today's topic.
            ```

            Each subtitle block includes:
            - Subtitle number
            - Start and end timestamps (HH:MM:SS,mmm format)
            - Speaker name in parentheses
            - Cleaned, formatted text

            ---

            ## Troubleshooting

            ### "Error: You need to accept the user agreement"
            - Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
            - Click "Agree and access repository"
            - Try processing again

            ### "Error: Invalid Hugging Face token"
            - Check your token at [HF Settings](https://huggingface.co/settings/tokens)
            - Make sure you copied the full token (starts with `hf_`)
            - Generate a new token if needed

            ### Processing takes too long
            - Normal processing: 5-15 minutes for typical audio files
            - First run may download models (~1-2 GB)
            - Longer files (60+ minutes) may take 20-30 minutes

            ### Wrong speaker labels
            - Speakers are detected in order of first appearance
            - Voice 1 = first person to speak, Voice 2 = second, etc.
            - Re-listen to your audio to identify the correct order

            ---

            ## Privacy & Security

            - Your audio files are processed temporarily and not stored
            - Your HF token is only used for this session and never saved
            - All processing happens on Hugging Face's secure infrastructure
            - Generated SRT files are temporarily stored for download only

            ---

            ## Technical Details

            **Models Used:**
            - Whisper large-v2 (OpenAI) - Speech-to-text
            - Pyannote 3.1 - Speaker diarization

            **Hardware:**
            - NVIDIA T4 GPU with CUDA support
            - 16GB GPU memory
            - Automatic FP16 optimization

            **Supported Audio Formats:**
            MP3, WAV, M4A, AAC, Opus, FLAC, OGG, WMA, and more

            ---

            ## Support

            If you encounter issues or have suggestions, please visit the Space's community tab or create an issue.
            """)

    # Process button click handler
    def process_and_prepare_download(audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc):
        srt_content, debug = process_audio_to_srt(
            audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc
        )
        srt_file = save_srt_file(srt_content)
        return srt_content, srt_file, debug

    process_btn.click(
        fn=process_and_prepare_download,
        inputs=[
            audio_input,
            hf_token_input,
            voice1_name, voice1_desc,
            voice2_name, voice2_desc,
            voice3_name, voice3_desc
        ],
        outputs=[srt_output, download_btn, debug_output]
    )

if __name__ == "__main__":
    demo.launch()