"""Gradio app: transcribe audio with Whisper, diarize speakers with Pyannote,
and emit a cleaned, speaker-labelled SRT subtitle file."""
import os
import re
import tempfile
from typing import Dict, List, Optional, Tuple

import gradio as gr
import torch
import whisper
from pyannote.audio import Pipeline
from pydub import AudioSegment

# Detect and use GPU if available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load Whisper once at startup; it stays cached for the process lifetime.
print("Loading Whisper model...")
whisper_model = whisper.load_model("large-v2", device=device)
print(f"Whisper model loaded on {device}")

# The diarization pipeline is loaded on demand with the user's HF token.

# Filler words and minimal vocalizations to remove from the transcript.
# NOTE(review): this list is aggressive — removing "like", "right", "okay",
# "actually" etc. also deletes legitimate uses of those words; confirm the
# trade-off is intended before tightening or loosening it.
FILLER_WORDS = [
    r'\buh\b', r'\bum\b', r'\bmmm+\b', r'\bmm+\b', r'\bhmm+\b',
    r'\bahh+\b', r'\buhh+\b', r'\berr+\b', r'\boh\b',
    r'\byou know\b', r'\blike\b', r'\bbasically\b', r'\bliterally\b',
    r'\bactually\b', r'\bokay\b', r'\bright\b', r'\byeah\b',
    r'\buh-huh\b', r'\bmhm\b', r'\bnah\b'
]


def convert_to_wav(audio_path: str) -> str:
    """Convert an audio file to a temporary WAV file and return its path.

    The caller is responsible for deleting the returned file.
    """
    audio = AudioSegment.from_file(audio_path)
    # NamedTemporaryFile instead of the deprecated, race-prone tempfile.mktemp.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    audio.export(wav_path, format="wav")
    return wav_path


def clean_text(text: str) -> str:
    """Remove filler words and stutters, then normalize whitespace/punctuation."""
    # Remove filler words.
    for filler in FILLER_WORDS:
        text = re.sub(filler, '', text, flags=re.IGNORECASE)
    # Collapse stutters (e.g. "I-I-I" -> "I"); IGNORECASE so "I-i" matches too.
    text = re.sub(r'\b(\w+)(-\1)+\b', r'\1', text, flags=re.IGNORECASE)
    # Drop punctuation left dangling by removed fillers (e.g. "so , we" -> "so, we")
    # and collapse runs of identical punctuation marks.
    text = re.sub(r'\s+([,.!?])', r'\1', text)
    text = re.sub(r'([,.!?])\1+', r'\1', text)
    # Collapse extra spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def identify_speaker(speaker_label: str, voice_mapping: Optional[Dict[str, str]] = None) -> str:
    """Map a diarization label (e.g. "SPEAKER_00") to a display name.

    Args:
        speaker_label: The speaker label produced by diarization.
        voice_mapping: Optional mapping of labels to user-chosen names.

    Returns:
        The mapped name, or a generic "Speaker NN" fallback.
    """
    if voice_mapping and speaker_label in voice_mapping:
        return voice_mapping[speaker_label]
    # Fallback for unmapped speakers: reuse the numeric suffix of the label.
    speaker_num = speaker_label.split("_")[-1] if "_" in speaker_label else "00"
    return f"Speaker {speaker_num}"


def format_timestamp(seconds: float) -> str:
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def split_into_sentences(text: str) -> List[str]:
    """Split text on sentence-ending punctuation for better subtitle blocks."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]


def _build_voice_mapping(*names: str) -> Dict[str, str]:
    """Map SPEAKER_00.. labels to the user-supplied names; blank names are skipped."""
    mapping: Dict[str, str] = {}
    for idx, name in enumerate(names):
        if name and name.strip():
            mapping[f"SPEAKER_{idx:02d}"] = name.strip()
    return mapping


def _load_diarization_pipeline(hf_token: str):
    """Load the pyannote diarization pipeline using the user's token.

    Returns:
        (pipeline, None) on success, or (None, (user_message, debug_message))
        on failure so the caller can surface a friendly error.
    """
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            token=hf_token.strip()
        )
        # Move to GPU if available.
        if device == "cuda":
            pipeline.to(torch.device(device))
        return pipeline, None
    except Exception as e:
        error_msg = str(e)
        lowered = error_msg.lower()
        if "gated repo" in lowered or "agreement" in lowered:
            user_msg = ("Error: You need to accept the user agreement for pyannote/speaker-diarization-3.1\n"
                        "Please visit: https://huggingface.co/pyannote/speaker-diarization-3.1\n"
                        "Accept the agreement, then try again.")
            return None, (user_msg, f"Pipeline loading failed: {error_msg}")
        if "token" in lowered or "unauthorized" in lowered:
            user_msg = ("Error: Invalid Hugging Face token. Please check your token and try again.\n"
                        "Get your token at: https://huggingface.co/settings/tokens")
            return None, (user_msg, f"Token validation failed: {error_msg}")
        return None, (f"Error loading diarization pipeline: {error_msg}",
                      f"Pipeline loading failed: {error_msg}")


def _assign_speakers(whisper_segments, diarization, voice_mapping: Dict[str, str]) -> List[Dict]:
    """Attach a speaker name to each Whisper segment by maximum time overlap.

    Segments that overlap no diarized speaker turn at all are dropped,
    matching the original behavior.
    """
    speaker_segments = [
        {'start': turn.start, 'end': turn.end, 'speaker': speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    labelled: List[Dict] = []
    for segment in whisper_segments:
        seg_start = segment['start']
        seg_end = segment['end']
        best_speaker = None
        best_overlap = 0.0
        for spk in speaker_segments:
            overlap = max(0.0, min(seg_end, spk['end']) - max(seg_start, spk['start']))
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = spk['speaker']
        if best_speaker:
            labelled.append({
                'start': seg_start,
                'end': seg_end,
                'text': segment['text'].strip(),
                'speaker': identify_speaker(best_speaker, voice_mapping),
            })
    return labelled


def _segments_to_srt(segments: List[Dict]) -> Tuple[str, int]:
    """Render speaker-labelled segments as SRT text.

    When one segment splits into several sentences, the segment's duration is
    distributed across them proportionally to sentence length, so subtitle
    blocks no longer share identical, fully-overlapping timestamps.

    Returns:
        (srt_content, subtitle_count)
    """
    srt_lines: List[str] = []
    subtitle_number = 1
    for seg in segments:
        cleaned_text = clean_text(seg['text'])
        if not cleaned_text:
            continue
        sentences = split_into_sentences(cleaned_text) or [cleaned_text]
        total_chars = sum(len(s) for s in sentences)
        duration = seg['end'] - seg['start']
        cursor = seg['start']
        for sentence in sentences:
            if not sentence:
                continue
            # Proportional share of the segment for this sentence.
            share = duration * len(sentence) / total_chars if total_chars else duration
            sentence_end = min(seg['end'], cursor + share)
            srt_lines.append(f"{subtitle_number}")
            srt_lines.append(f"{format_timestamp(cursor)} --> {format_timestamp(sentence_end)}")
            srt_lines.append(f"({seg['speaker']}) {sentence}")
            srt_lines.append("")  # Blank line between subtitles.
            cursor = sentence_end
            subtitle_number += 1
    return "\n".join(srt_lines), subtitle_number - 1


def process_audio_to_srt(
    audio_path: str,
    hf_token: str,
    voice1_name: str = "",
    voice1_desc: str = "",
    voice2_name: str = "",
    voice2_desc: str = "",
    voice3_name: str = "",
    voice3_desc: str = "",
    progress=gr.Progress()
) -> Tuple[str, str]:
    """Main processing function: STT + diarization + SRT generation.

    Args:
        audio_path: Path to the uploaded audio file.
        hf_token: Hugging Face API token for accessing Pyannote models.
        voice1_name..voice3_name: Optional display names for speakers 1-3.
        voice1_desc..voice3_desc: Free-text notes; accepted for UI symmetry
            but not used in processing (matches original behavior).
        progress: Gradio progress tracker.

    Returns:
        (srt_content, debug_info); on failure srt_content is an error message.
    """
    # Validate inputs up front so the user gets a clear message, not a traceback.
    if not audio_path:
        return "Error: Please upload an audio file.", "No audio file provided"
    if not hf_token or not hf_token.strip():
        return ("Error: Hugging Face token is required. Please provide your HF token.",
                "Token validation failed")

    voice_mapping = _build_voice_mapping(voice1_name, voice2_name, voice3_name)

    wav_path = audio_path
    try:
        progress(0, desc="Loading Pyannote diarization pipeline...")
        diarization_pipeline, error = _load_diarization_pipeline(hf_token)
        if error is not None:
            return error

        progress(0.05, desc="Converting audio to WAV format...")
        # Case-insensitive suffix check so ".WAV" files are not re-converted.
        if not audio_path.lower().endswith('.wav'):
            wav_path = convert_to_wav(audio_path)

        # Step 1: Transcribe with Whisper.
        progress(0.1, desc="Starting Whisper transcription (this may take 2-5 minutes)...")
        result = whisper_model.transcribe(
            wav_path,
            language="en",
            word_timestamps=True,
            verbose=False,
            fp16=(device == "cuda")  # FP16 on GPU for faster processing.
        )

        # Step 2: Perform speaker diarization.
        progress(0.4, desc="Transcription complete! Now analyzing speakers with Pyannote...")
        progress(0.45, desc="Pyannote: Loading audio and extracting features...")
        progress(0.5, desc="Pyannote: Detecting speaker segments (this is the longest step - 3-10 minutes)...")
        diarization = diarization_pipeline(wav_path)

        # Step 3: Align transcription with speaker labels.
        progress(0.75, desc="Diarization complete! Matching speakers to transcription...")
        segments_with_speakers = _assign_speakers(result['segments'], diarization, voice_mapping)

        # Step 4: Generate SRT with formatting rules.
        progress(0.85, desc="Cleaning text and formatting SRT subtitles...")
        srt_content, subtitle_count = _segments_to_srt(segments_with_speakers)

        debug_info = (f"Processed successfully!\n"
                      f"Total segments: {len(segments_with_speakers)}\n"
                      f"Total subtitles: {subtitle_count}")
        progress(1.0, desc="Complete! SRT file ready for download.")
        return srt_content, debug_info

    except Exception as e:
        return f"Error: {str(e)}", f"Processing failed: {str(e)}"
    finally:
        # Always remove the temporary WAV, even when processing raised.
        if wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)


def save_srt_file(srt_content: str) -> Optional[str]:
    """Save SRT content to a temporary file for download; None on error content."""
    if not srt_content or srt_content.startswith("Error"):
        return None
    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt',
                                     delete=False, encoding='utf-8') as temp_file:
        temp_file.write(srt_content)
        return temp_file.name


# Create Gradio interface.
with gr.Blocks(title="Audio to SRT Converter with Speaker Diarization", theme=gr.themes.Soft()) as demo:
    # Display GPU info.
    gpu_info = f"Running on: {device.upper()}"
    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_info += f" ({gpu_name})"

    gr.Markdown(f"""
    # Audio to SRT Converter with Speaker Diarization

    Convert audio files to formatted SRT subtitles with automatic speaker detection and identification.

    {gpu_info} | Processing time: 5-15 minutes
    """)

    with gr.Tabs():
        with gr.Tab("Upload & Process"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Step 1: Authentication")
                    gr.Markdown("""
                    Required: You need a Hugging Face token for speaker diarization.
                    """)
                    with gr.Accordion("How to get your token", open=False):
                        gr.Markdown("""
                        1. Create a free account at [Hugging Face](https://huggingface.co/join) (if you don't have one)
                        2. Get your token at [Settings → Access Tokens](https://huggingface.co/settings/tokens)
                        3. Accept the user agreement at [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
                        4. Paste your token below (starts with `hf_...`)
                        """)
                    hf_token_input = gr.Textbox(
                        label="Hugging Face Token",
                        placeholder="Enter your HF token here",
                        type="password",
                        max_lines=1,
                        info="Your token is not stored and only used for this session"
                    )

                    gr.Markdown("### Step 2: Upload Your Audio")
                    audio_input = gr.Audio(
                        label="Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    gr.Markdown("*Supports MP3, WAV, Opus, M4A, and most audio formats*")

                    gr.Markdown("### Step 3: Identify Speakers (Optional)")
                    with gr.Accordion("About speaker identification", open=False):
                        gr.Markdown("""
                        The system automatically detects up to 3 speakers in order of appearance.

                        - **Without names:** Speakers appear as "Speaker 00", "Speaker 01", etc.
                        - **With names:** Your custom names appear instead (e.g., "Daniel", "Sarah")
                        - **Descriptions:** Optional notes to help you identify speakers (not shown in output)

                        **Tip:** Listen to the first 30 seconds of your audio to identify who speaks first!
                        """)

                    with gr.Accordion("Voice 1 (First speaker)", open=False):
                        voice1_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Daniel, John, Host",
                            max_lines=1
                        )
                        voice1_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Male voice, asks questions, host",
                            max_lines=2
                        )
                    with gr.Accordion("Voice 2 (Second speaker)", open=False):
                        voice2_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Sarah, Guest, Interviewer",
                            max_lines=1
                        )
                        voice2_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Female voice, provides answers, expert",
                            max_lines=2
                        )
                    with gr.Accordion("Voice 3 (Third speaker)", open=False):
                        voice3_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Alex, Moderator",
                            max_lines=1
                        )
                        voice3_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Neutral voice, moderate pace",
                            max_lines=2
                        )

                    gr.Markdown("---")
                    process_btn = gr.Button(
                        "Generate SRT Subtitles",
                        variant="primary",
                        size="lg",
                        scale=1
                    )
                    gr.Markdown("""
                    Expected processing time:
                    • Transcription: 2-5 minutes
                    • Speaker detection: 3-10 minutes
                    • Formatting: ~30 seconds

                    Watch the progress bar for real-time updates!
                    """)

                with gr.Column(scale=1):
                    gr.Markdown("### Results")
                    srt_output = gr.Textbox(
                        label="Generated SRT Content",
                        lines=20,
                        max_lines=30,
                        show_copy_button=True,
                        placeholder="Your SRT subtitles will appear here after processing...",
                        info="Preview your subtitles or copy to clipboard"
                    )
                    download_btn = gr.File(
                        label="Download SRT File",
                        file_count="single"
                    )
                    debug_output = gr.Textbox(
                        label="Processing Info",
                        lines=3,
                        placeholder="Status updates will appear here..."
                    )

        with gr.Tab("Help & Info"):
            gr.Markdown("""
            ## How This Tool Works

            ### Process Overview

            1. **Audio Upload**
               - Upload any audio file (MP3, WAV, M4A, Opus, etc.)
               - File is automatically converted to WAV format for processing

            2. **Speech-to-Text Transcription**
               - Uses OpenAI's Whisper (large-v2 model)
               - Generates accurate word-level timestamps
               - Supports English language

            3. **Speaker Diarization**
               - Uses Pyannote Audio 3.1 for speaker detection
               - Automatically identifies up to 3 different speakers
               - Labels speakers in order of first appearance

            4. **Text Cleaning & Formatting**
               - Removes filler words (um, uh, like, you know, etc.)
               - Splits text into readable sentence blocks
               - Adds speaker labels to each subtitle
               - Generates standard SRT format

            ---

            ## Features

            - **Automatic speaker detection** - No manual marking needed
            - **Custom speaker names** - Replace "Speaker 00" with real names
            - **Clean text** - Filler words automatically removed
            - **Smart formatting** - One speaker per subtitle, one sentence per block
            - **Standard SRT format** - Works with all video players and editors
            - **GPU acceleration** - Fast processing on T4 GPU

            ---

            ## Tips for Best Results

            ### Before Processing
            - **Listen to the first minute** of your audio to identify speakers
            - **Note the order** speakers appear (first voice = Voice 1, etc.)
            - **Use clear names** for easy identification in subtitles

            ### Audio Quality
            - Better audio quality = more accurate transcription
            - Minimize background noise for best speaker detection
            - Clear speech separation helps diarization accuracy

            ### Speaker Identification
            - You don't need to fill in all 3 voices if you have fewer speakers
            - If you skip speaker names, output will show "Speaker 00", "Speaker 01", etc.
            - Descriptions are just for your reference and don't affect the output

            ---

            ## Output Format

            Your SRT file will look like this:

            ```
            1
            00:00:01,234 --> 00:00:05,678
            (Daniel) Welcome to the podcast.

            2
            00:00:06,123 --> 00:00:10,456
            (Sarah) Thanks for having me.

            3
            00:00:11,789 --> 00:00:15,234
            (Daniel) Let's dive into today's topic.
            ```

            Each subtitle block includes:
            - Subtitle number
            - Start and end timestamps (HH:MM:SS,mmm format)
            - Speaker name in parentheses
            - Cleaned, formatted text

            ---

            ## Troubleshooting

            ### "Error: You need to accept the user agreement"
            - Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
            - Click "Agree and access repository"
            - Try processing again

            ### "Error: Invalid Hugging Face token"
            - Check your token at [HF Settings](https://huggingface.co/settings/tokens)
            - Make sure you copied the full token (starts with `hf_`)
            - Generate a new token if needed

            ### Processing takes too long
            - Normal processing: 5-15 minutes for typical audio files
            - First run may download models (~1-2 GB)
            - Longer files (60+ minutes) may take 20-30 minutes

            ### Wrong speaker labels
            - Speakers are detected in order of first appearance
            - Voice 1 = first person to speak, Voice 2 = second, etc.
            - Re-listen to your audio to identify the correct order

            ---

            ## Privacy & Security

            - Your audio files are processed temporarily and not stored
            - Your HF token is only used for this session and never saved
            - All processing happens on Hugging Face's secure infrastructure
            - Generated SRT files are temporarily stored for download only

            ---

            ## Technical Details

            **Models Used:**
            - Whisper large-v2 (OpenAI) - Speech-to-text
            - Pyannote 3.1 - Speaker diarization

            **Hardware:**
            - NVIDIA T4 GPU with CUDA support
            - 16GB GPU memory
            - Automatic FP16 optimization

            **Supported Audio Formats:**
            MP3, WAV, M4A, AAC, Opus, FLAC, OGG, WMA, and more

            ---

            ## Support

            If you encounter issues or have suggestions, please visit the Space's community tab or create an issue.
            """)

    # Process button click handler.
    def process_and_prepare_download(audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc):
        """Run the pipeline, then also materialize the SRT as a downloadable file."""
        srt_content, debug = process_audio_to_srt(
            audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc
        )
        srt_file = save_srt_file(srt_content)
        return srt_content, srt_file, debug

    process_btn.click(
        fn=process_and_prepare_download,
        inputs=[
            audio_input, hf_token_input,
            voice1_name, voice1_desc,
            voice2_name, voice2_desc,
            voice3_name, voice3_desc
        ],
        outputs=[srt_output, download_btn, debug_output]
    )

if __name__ == "__main__":
    demo.launch()