File size: 22,609 Bytes
8f9dc96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fe86fc
8f9dc96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fe86fc
8f9dc96
 
 
 
 
 
 
 
 
 
 
 
 
8fe86fc
8f9dc96
 
 
 
 
 
 
 
 
 
8fe86fc
 
 
 
8f9dc96
 
 
 
 
 
 
 
 
8fe86fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f9dc96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e2d643
8f9dc96
5e2d643
8f9dc96
 
 
 
 
3569ef7
8f9dc96
5e2d643
8f9dc96
5e2d643
3569ef7
5e2d643
8f9dc96
 
5e2d643
3569ef7
5e2d643
 
3569ef7
5e2d643
 
3569ef7
5e2d643
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
350a04f
5e2d643
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
3569ef7
5e2d643
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
3569ef7
5e2d643
 
 
 
3569ef7
8fe86fc
3569ef7
8fe86fc
5e2d643
8fe86fc
3569ef7
5e2d643
 
8fe86fc
3569ef7
5e2d643
 
 
8f9dc96
3569ef7
5e2d643
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
3569ef7
5e2d643
3569ef7
 
 
 
 
 
5e2d643
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f9dc96
5e2d643
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3569ef7
5e2d643
 
 
 
 
8fe86fc
8f9dc96
8fe86fc
8f9dc96
 
 
 
 
 
 
 
8fe86fc
8f9dc96
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
import gradio as gr
import whisper
import torch
from pyannote.audio import Pipeline
from pydub import AudioSegment
import re
import os
from typing import List, Dict, Tuple
import tempfile

# Detect and use GPU if available.
# NOTE: this module loads the Whisper model at import time, so importing it
# triggers a (first-run) download and several GB of (GPU) memory use.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load models (will be cached after first load)
print("Loading Whisper model...")
whisper_model = whisper.load_model("large-v2", device=device)  # Load on GPU if available
print(f"Whisper model loaded on {device}")

# Diarization pipeline will be loaded on-demand with user's token
# (inside process_audio_to_srt), because the pyannote model repo is gated
# and requires each user's own Hugging Face token.

# Filler words and minimal vocalizations to remove (regex patterns, applied
# case-insensitively in clean_text). NOTE(review): several entries are very
# aggressive — 'like', 'right', 'actually', 'okay', 'yeah' also delete
# legitimate uses of those words from the transcript.
FILLER_WORDS = [
    r'\buh\b', r'\bum\b', r'\bmmm+\b', r'\bmm+\b', r'\bhmm+\b',
    r'\bahh+\b', r'\buhh+\b', r'\berr+\b', r'\boh\b',
    r'\byou know\b', r'\blike\b', r'\bbasically\b', r'\bliterally\b',
    r'\bactually\b', r'\bokay\b', r'\bright\b', r'\byeah\b',
    r'\buh-huh\b', r'\bmhm\b', r'\bnah\b'
]

def convert_to_wav(audio_path: str) -> str:
    """Convert an audio file to WAV format for processing.

    Args:
        audio_path: Path to any audio file pydub/ffmpeg can decode.

    Returns:
        Path to a newly created temporary .wav file; the caller is
        responsible for deleting it.
    """
    audio = AudioSegment.from_file(audio_path)
    # tempfile.mktemp is deprecated and race-prone (the name may be claimed
    # by another process before we use it); mkstemp creates the file safely.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # pydub opens the path itself when exporting
    audio.export(wav_path, format="wav")
    return wav_path

def clean_text(text: str) -> str:
    """Remove filler words, stutters, and punctuation debris from text.

    Filler patterns come from the module-level FILLER_WORDS list and are
    removed case-insensitively. Removing a filler such as "uh" from
    "It was, uh, fine." leaves ", ," behind, so dangling punctuation
    created by those removals is repaired afterwards.

    Args:
        text: Raw transcription text for one segment.

    Returns:
        Cleaned text; may be empty if the segment was nothing but fillers.
    """
    # Remove filler words (whole-word, case-insensitive)
    for filler in FILLER_WORDS:
        text = re.sub(filler, '', text, flags=re.IGNORECASE)

    # Remove stutters (e.g., "I-I-I" -> "I")
    text = re.sub(r'\b(\w+)(-\1)+\b', r'\1', text)

    # Repair punctuation debris left by filler removal: no space before
    # punctuation, collapse runs of commas, drop orphaned leading commas
    # (e.g. "It was, , fine." -> "It was, fine.").
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    text = re.sub(r',{2,}', ',', text)
    text = re.sub(r'^[\s,]+', '', text)

    # Clean up extra spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def identify_speaker(speaker_label: str, voice_mapping: Dict[str, str] = None) -> str:
    """Resolve a diarization label to a human-readable speaker name.

    Args:
        speaker_label: Label emitted by diarization (e.g. "SPEAKER_00").
        voice_mapping: Optional mapping of diarization labels to user names.

    Returns:
        The mapped name when one exists, otherwise a generic
        "Speaker NN" built from the label's numeric suffix.
    """
    mapping = voice_mapping or {}
    if speaker_label in mapping:
        return mapping[speaker_label]
    # No user-supplied name: derive a generic label from the trailing
    # "_NN" suffix, defaulting to "00" for unexpected label formats.
    suffix = speaker_label.rsplit("_", 1)[-1] if "_" in speaker_label else "00"
    return f"Speaker {suffix}"

def format_timestamp(seconds: float) -> str:
    """Convert a non-negative offset in seconds to SRT form (HH:MM:SS,mmm)."""
    whole_seconds = int(seconds)
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    # Truncate (not round) the fractional part to milliseconds.
    millis = int((seconds % 1) * 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def split_into_sentences(text: str) -> List[str]:
    """Split text at sentence boundaries for per-sentence subtitle blocks.

    A boundary is any whitespace run that follows '.', '!' or '?'; the
    terminator stays attached to its sentence. Empty pieces are discarded.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'(?<=[.!?])\s+', text))
    return [chunk for chunk in stripped_chunks if chunk]

def process_audio_to_srt(
    audio_path: str,
    hf_token: str,
    voice1_name: str = "",
    voice1_desc: str = "",
    voice2_name: str = "",
    voice2_desc: str = "",
    voice3_name: str = "",
    voice3_desc: str = "",
    progress=gr.Progress()
) -> Tuple[str, str]:
    """
    Main processing function: STT + Diarization + SRT generation.

    Pipeline:
      1. Load the Pyannote diarization pipeline with the user's HF token.
      2. Convert the upload to WAV (via pydub) if necessary.
      3. Transcribe with Whisper (word timestamps enabled).
      4. Diarize, then assign each Whisper segment the speaker whose
         diarization turn overlaps it the most.
      5. Clean each segment's text and emit numbered SRT blocks.

    Args:
        audio_path: Path to the audio file.
        hf_token: Hugging Face API token for accessing Pyannote models.
        voice1_name: Name for the first voice (maps to SPEAKER_00).
        voice1_desc: Description for the first voice (informational only, unused here).
        voice2_name: Name for the second voice (maps to SPEAKER_01).
        voice2_desc: Description for the second voice (informational only, unused here).
        voice3_name: Name for the third voice (maps to SPEAKER_02).
        voice3_desc: Description for the third voice (informational only, unused here).
        progress: Gradio progress tracker (the shared default instance is the
            gradio idiom for progress injection).

    Returns: (srt_content, debug_info); srt_content starts with "Error" on failure.
    """
    # Validate inputs up front so users get a readable message instead of a
    # stringified traceback from the catch-all handler below.
    if not hf_token or not hf_token.strip():
        return "Error: Hugging Face token is required. Please provide your HF token.", "Token validation failed"
    if not audio_path:
        return "Error: Please upload an audio file.", "No audio file provided"

    # Build voice mapping from user inputs; "or ''" guards against None values
    # the UI may deliver for untouched textboxes.
    voice_mapping = {}
    if (voice1_name or "").strip():
        voice_mapping["SPEAKER_00"] = voice1_name.strip()
    if (voice2_name or "").strip():
        voice_mapping["SPEAKER_01"] = voice2_name.strip()
    if (voice3_name or "").strip():
        voice_mapping["SPEAKER_02"] = voice3_name.strip()

    wav_path = None  # set after conversion; removed in `finally` when temporary
    try:
        progress(0, desc="Loading Pyannote diarization pipeline...")

        # Load diarization pipeline with user's token
        try:
            diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                token=hf_token.strip()
            )
            # Move to GPU if available
            if device == "cuda":
                diarization_pipeline.to(torch.device(device))
        except Exception as e:
            # Translate the common failure modes (gated repo, bad token)
            # into actionable messages for the user.
            error_msg = str(e)
            if "gated repo" in error_msg.lower() or "agreement" in error_msg.lower():
                return ("Error: You need to accept the user agreement for pyannote/speaker-diarization-3.1\n"
                       "Please visit: https://huggingface.co/pyannote/speaker-diarization-3.1\n"
                       "Accept the agreement, then try again."), f"Pipeline loading failed: {error_msg}"
            elif "token" in error_msg.lower() or "unauthorized" in error_msg.lower():
                return ("Error: Invalid Hugging Face token. Please check your token and try again.\n"
                       "Get your token at: https://huggingface.co/settings/tokens"), f"Token validation failed: {error_msg}"
            else:
                return f"Error loading diarization pipeline: {error_msg}", f"Pipeline loading failed: {error_msg}"

        progress(0.05, desc="Converting audio to WAV format...")

        # Convert to WAV if needed
        if not audio_path.endswith('.wav'):
            wav_path = convert_to_wav(audio_path)
        else:
            wav_path = audio_path

        # Step 1: Transcribe with Whisper
        progress(0.1, desc="Starting Whisper transcription (this may take 2-5 minutes)...")
        result = whisper_model.transcribe(
            wav_path,
            language="en",
            word_timestamps=True,
            verbose=False,
            fp16=(device == "cuda")  # Use FP16 on GPU for faster processing
        )

        # Step 2: Perform speaker diarization
        progress(0.4, desc="Transcription complete! Now analyzing speakers with Pyannote...")
        progress(0.45, desc="Pyannote: Loading audio and extracting features...")
        progress(0.5, desc="Pyannote: Detecting speaker segments (this is the longest step - 3-10 minutes)...")
        diarization = diarization_pipeline(wav_path)

        # Step 3: Align transcription with speaker labels
        progress(0.75, desc="Diarization complete! Matching speakers to transcription...")

        # Flatten diarization output into {start, end, speaker} records
        speaker_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            speaker_segments.append({
                'start': turn.start,
                'end': turn.end,
                'speaker': speaker
            })

        # Assign each Whisper segment the speaker with the largest time overlap
        segments_with_speakers = []
        for segment in result['segments']:
            segment_start = segment['start']
            segment_end = segment['end']
            segment_text = segment['text'].strip()

            speaker = None
            max_overlap = 0
            for spk_seg in speaker_segments:
                overlap_start = max(segment_start, spk_seg['start'])
                overlap_end = min(segment_end, spk_seg['end'])
                overlap_duration = max(0, overlap_end - overlap_start)

                if overlap_duration > max_overlap:
                    max_overlap = overlap_duration
                    speaker = spk_seg['speaker']

            # Segments with no overlapping diarization turn are dropped
            if speaker:
                speaker_name = identify_speaker(speaker, voice_mapping)
                segments_with_speakers.append({
                    'start': segment_start,
                    'end': segment_end,
                    'text': segment_text,
                    'speaker': speaker_name
                })

        # Step 4: Generate SRT with formatting rules
        progress(0.85, desc="Cleaning text and formatting SRT subtitles...")

        srt_lines = []
        subtitle_number = 1

        for seg in segments_with_speakers:
            cleaned_text = clean_text(seg['text'])
            if not cleaned_text:
                continue  # segment was nothing but fillers

            # One subtitle block per sentence; every sentence in a segment
            # shares the segment's start/end timestamps.
            sentences = split_into_sentences(cleaned_text) or [cleaned_text]
            for sentence in sentences:
                if not sentence:
                    continue

                start_time = format_timestamp(seg['start'])
                end_time = format_timestamp(seg['end'])

                # Format: subtitle number, timestamps, (Speaker) text
                srt_lines.append(f"{subtitle_number}")
                srt_lines.append(f"{start_time} --> {end_time}")
                srt_lines.append(f"({seg['speaker']}) {sentence}")
                srt_lines.append("")  # Blank line between subtitles

                subtitle_number += 1

        srt_content = "\n".join(srt_lines)
        debug_info = f"Processed successfully!\nTotal segments: {len(segments_with_speakers)}\nTotal subtitles: {subtitle_number - 1}"

        progress(1.0, desc="Complete! SRT file ready for download.")
        return srt_content, debug_info

    except Exception as e:
        return f"Error: {str(e)}", f"Processing failed: {str(e)}"
    finally:
        # Clean up the temporary WAV even when transcription/diarization fails
        # (previously it was removed only on the success path, leaking one
        # file per failed run).
        if wav_path and wav_path != audio_path and os.path.exists(wav_path):
            os.remove(wav_path)

def save_srt_file(srt_content: str) -> str:
    """Persist SRT text to a temporary .srt file for download.

    Returns the file path, or None when the content is empty or is an
    error message (which must not be offered as a download).
    """
    if not srt_content or srt_content.startswith("Error"):
        return None

    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt', delete=False, encoding='utf-8') as handle:
        handle.write(srt_content)
    return handle.name

# Create Gradio interface. Everything inside this `with` suite runs at import
# time and only declares/wires UI components; processing happens in the
# click handler defined near the bottom of the file.
with gr.Blocks(title="Audio to SRT Converter with Speaker Diarization", theme=gr.themes.Soft()) as demo:
    # Display GPU info (device was detected once at module import time)
    gpu_info = f"Running on: {device.upper()}"
    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_info += f" ({gpu_name})"

    # Page header with the detected device banner interpolated in
    gr.Markdown(f"""
    # Audio to SRT Converter with Speaker Diarization

    Convert audio files to formatted SRT subtitles with automatic speaker detection and identification.

    <div style="padding: 10px; background-color: #f0f0f0; border-radius: 5px; margin: 10px 0;">
    <b>{gpu_info}</b> | Processing time: 5-15 minutes
    </div>
    """)

    with gr.Tabs():
        with gr.Tab("Upload & Process"):
            with gr.Row():
                # Left column: token entry, audio upload, optional speaker names
                with gr.Column(scale=1):
                    gr.Markdown("### Step 1: Authentication")
                    gr.Markdown("""
                    <div style="background-color: #fff3cd; padding: 10px; border-radius: 5px; border-left: 4px solid #ffc107;">
                    <b>Required:</b> You need a Hugging Face token for speaker diarization.
                    </div>
                    """)

                    with gr.Accordion("How to get your token", open=False):
                        gr.Markdown("""
                        1. Create a free account at [Hugging Face](https://huggingface.co/join) (if you don't have one)
                        2. Get your token at [Settings → Access Tokens](https://huggingface.co/settings/tokens)
                        3. Accept the user agreement at [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
                        4. Paste your token below (starts with `hf_...`)
                        """)

                    # type="password" keeps the token masked in the browser
                    hf_token_input = gr.Textbox(
                        label="Hugging Face Token",
                        placeholder="Enter your HF token here",
                        type="password",
                        max_lines=1,
                        info="Your token is not stored and only used for this session"
                    )

                    gr.Markdown("### Step 2: Upload Your Audio")
                    # type="filepath" hands the handler a path on disk rather
                    # than raw audio data
                    audio_input = gr.Audio(
                        label="Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    gr.Markdown("*Supports MP3, WAV, Opus, M4A, and most audio formats*")

                    gr.Markdown("### Step 3: Identify Speakers (Optional)")

                    with gr.Accordion("About speaker identification", open=False):
                        gr.Markdown("""
                        The system automatically detects up to 3 speakers in order of appearance.

                        - **Without names:** Speakers appear as "Speaker 00", "Speaker 01", etc.
                        - **With names:** Your custom names appear instead (e.g., "Daniel", "Sarah")
                        - **Descriptions:** Optional notes to help you identify speakers (not shown in output)

                        **Tip:** Listen to the first 30 seconds of your audio to identify who speaks first!
                        """)

                    # Voice N name maps to diarization label SPEAKER_0(N-1);
                    # the description boxes are informational only and are
                    # never used by the processing code.
                    with gr.Accordion("Voice 1 (First speaker)", open=False):
                        voice1_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Daniel, John, Host",
                            max_lines=1
                        )
                        voice1_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Male voice, asks questions, host",
                            max_lines=2
                        )

                    with gr.Accordion("Voice 2 (Second speaker)", open=False):
                        voice2_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Sarah, Guest, Interviewer",
                            max_lines=1
                        )
                        voice2_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Female voice, provides answers, expert",
                            max_lines=2
                        )

                    with gr.Accordion("Voice 3 (Third speaker)", open=False):
                        voice3_name = gr.Textbox(
                            label="Speaker Name",
                            placeholder="e.g., Alex, Moderator",
                            max_lines=1
                        )
                        voice3_desc = gr.Textbox(
                            label="Description (optional)",
                            placeholder="e.g., Neutral voice, moderate pace",
                            max_lines=2
                        )

                    gr.Markdown("---")
                    # Triggers the pipeline; wired to the handler at the
                    # bottom of the file via process_btn.click(...)
                    process_btn = gr.Button(
                        "Generate SRT Subtitles",
                        variant="primary",
                        size="lg",
                        scale=1
                    )
                    gr.Markdown("""
                    <div style="background-color: #d1ecf1; padding: 10px; border-radius: 5px; margin-top: 10px;">
                    <b>Expected processing time:</b><br>
                    • Transcription: 2-5 minutes<br>
                    • Speaker detection: 3-10 minutes<br>
                    • Formatting: ~30 seconds<br>
                    <br>
                    Watch the progress bar for real-time updates!
                    </div>
                    """)

                # Right column: outputs (preview text, download link, status)
                with gr.Column(scale=1):
                    gr.Markdown("### Results")

                    srt_output = gr.Textbox(
                        label="Generated SRT Content",
                        lines=20,
                        max_lines=30,
                        show_copy_button=True,
                        placeholder="Your SRT subtitles will appear here after processing...",
                        info="Preview your subtitles or copy to clipboard"
                    )

                    download_btn = gr.File(
                        label="Download SRT File",
                        file_count="single"
                    )

                    debug_output = gr.Textbox(
                        label="Processing Info",
                        lines=3,
                        placeholder="Status updates will appear here..."
                    )

        with gr.Tab("Help & Info"):
            # Static documentation tab: a single markdown string, no wiring.
            gr.Markdown("""
            ## How This Tool Works

            ### Process Overview

            1. **Audio Upload**
               - Upload any audio file (MP3, WAV, M4A, Opus, etc.)
               - File is automatically converted to WAV format for processing

            2. **Speech-to-Text Transcription**
               - Uses OpenAI's Whisper (large-v2 model)
               - Generates accurate word-level timestamps
               - Supports English language

            3. **Speaker Diarization**
               - Uses Pyannote Audio 3.1 for speaker detection
               - Automatically identifies up to 3 different speakers
               - Labels speakers in order of first appearance

            4. **Text Cleaning & Formatting**
               - Removes filler words (um, uh, like, you know, etc.)
               - Splits text into readable sentence blocks
               - Adds speaker labels to each subtitle
               - Generates standard SRT format

            ---

            ## Features

            - **Automatic speaker detection** - No manual marking needed
            - **Custom speaker names** - Replace "Speaker 00" with real names
            - **Clean text** - Filler words automatically removed
            - **Smart formatting** - One speaker per subtitle, one sentence per block
            - **Standard SRT format** - Works with all video players and editors
            - **GPU acceleration** - Fast processing on T4 GPU

            ---

            ## Tips for Best Results

            ### Before Processing
            - **Listen to the first minute** of your audio to identify speakers
            - **Note the order** speakers appear (first voice = Voice 1, etc.)
            - **Use clear names** for easy identification in subtitles

            ### Audio Quality
            - Better audio quality = more accurate transcription
            - Minimize background noise for best speaker detection
            - Clear speech separation helps diarization accuracy

            ### Speaker Identification
            - You don't need to fill in all 3 voices if you have fewer speakers
            - If you skip speaker names, output will show "Speaker 00", "Speaker 01", etc.
            - Descriptions are just for your reference and don't affect the output

            ---

            ## Output Format

            Your SRT file will look like this:

            ```
            1
            00:00:01,234 --> 00:00:05,678
            (Daniel) Welcome to the podcast.

            2
            00:00:06,123 --> 00:00:10,456
            (Sarah) Thanks for having me.

            3
            00:00:11,789 --> 00:00:15,234
            (Daniel) Let's dive into today's topic.
            ```

            Each subtitle block includes:
            - Subtitle number
            - Start and end timestamps (HH:MM:SS,mmm format)
            - Speaker name in parentheses
            - Cleaned, formatted text

            ---

            ## Troubleshooting

            ### "Error: You need to accept the user agreement"
            - Visit [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
            - Click "Agree and access repository"
            - Try processing again

            ### "Error: Invalid Hugging Face token"
            - Check your token at [HF Settings](https://huggingface.co/settings/tokens)
            - Make sure you copied the full token (starts with `hf_`)
            - Generate a new token if needed

            ### Processing takes too long
            - Normal processing: 5-15 minutes for typical audio files
            - First run may download models (~1-2 GB)
            - Longer files (60+ minutes) may take 20-30 minutes

            ### Wrong speaker labels
            - Speakers are detected in order of first appearance
            - Voice 1 = first person to speak, Voice 2 = second, etc.
            - Re-listen to your audio to identify the correct order

            ---

            ## Privacy & Security

            - Your audio files are processed temporarily and not stored
            - Your HF token is only used for this session and never saved
            - All processing happens on Hugging Face's secure infrastructure
            - Generated SRT files are temporarily stored for download only

            ---

            ## Technical Details

            **Models Used:**
            - Whisper large-v2 (OpenAI) - Speech-to-text
            - Pyannote 3.1 - Speaker diarization

            **Hardware:**
            - NVIDIA T4 GPU with CUDA support
            - 16GB GPU memory
            - Automatic FP16 optimization

            **Supported Audio Formats:**
            MP3, WAV, M4A, AAC, Opus, FLAC, OGG, WMA, and more

            ---

            ## Support

            If you encounter issues or have suggestions, please visit the Space's community tab or create an issue.
            """)

    # Process button click handler
    def process_and_prepare_download(audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc):
        """Run the full pipeline, then stage the SRT text as a downloadable file."""
        srt_text, info = process_audio_to_srt(
            audio, hf_token, v1_name, v1_desc, v2_name, v2_desc, v3_name, v3_desc
        )
        # save_srt_file returns None for empty/error content, which clears
        # the download component instead of offering a bogus file.
        return srt_text, save_srt_file(srt_text), info

    # Wire the button to the handler: inputs are passed positionally in the
    # same order as the handler's parameters; outputs fill the preview box,
    # the download component, and the status box.
    process_btn.click(
        fn=process_and_prepare_download,
        inputs=[
            audio_input,
            hf_token_input,
            voice1_name, voice1_desc,
            voice2_name, voice2_desc,
            voice3_name, voice3_desc
        ],
        outputs=[srt_output, download_btn, debug_output]
    )

# Start the Gradio server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()