File size: 8,309 Bytes
dd5bcef
ce8528d
dd5bcef
 
 
ce8528d
dd5bcef
 
 
 
 
3d5ee3a
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
ce8528d
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce8528d
dd5bcef
 
 
ce8528d
dd5bcef
 
 
ce8528d
 
 
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce8528d
dd5bcef
 
ce8528d
 
 
 
 
 
dd5bcef
ce8528d
dd5bcef
 
26055ba
dd5bcef
 
 
 
ce8528d
 
dd5bcef
 
ce8528d
 
dd5bcef
 
 
 
 
 
 
2a77a56
 
dd5bcef
 
 
 
 
ce8528d
dd5bcef
 
 
ce8528d
 
 
 
 
 
 
dd5bcef
 
ce8528d
 
 
dd5bcef
 
 
 
 
 
 
 
 
 
ce8528d
dd5bcef
 
 
 
 
 
 
1333284
dd5bcef
1333284
 
dd5bcef
 
 
 
ce8528d
 
 
 
379a259
dd5bcef
ce8528d
 
 
 
 
 
 
 
727ff34
dd5bcef
26055ba
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26055ba
dd5bcef
 
 
 
 
26055ba
dd5bcef
 
3d5ee3a
dd5bcef
 
 
 
 
 
26055ba
 
ce8528d
26055ba
 
dd5bcef
 
ce8528d
 
 
dd5bcef
 
 
 
 
 
ce8528d
dd5bcef
 
 
 
ce8528d
dd5bcef
 
 
 
ce8528d
dd5bcef
 
 
 
 
 
 
 
 
 
 
 
ce8528d
dd5bcef
 
 
 
 
26055ba
1333284
dd5bcef
26055ba
ce8528d
26055ba
 
 
 
 
ce8528d
dd5bcef
 
 
 
 
 
ce8528d
dd5bcef
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
"""Whisper + Pyannote Transcription & Diarization Web Interface."""

import logging
import tempfile
from pathlib import Path
from datetime import datetime

import gradio as gr

from src.audio_processor import AudioProcessor
from src.speaker_manager import SpeakerManager
from src.vtt_utils import validate_vtt

# Configure the root logger once at import time so progress/info messages
# from the processing modules are visible on the console.
logging.basicConfig(level=logging.INFO)


def process_audio(
    audio_path: str,
    openai_api_key: str,
    hf_api_key: str,
    transcription_model: str,
    pyannote_model: str,
    openai_whisper_prompt: str,
    openai_whisper_language: str | None,
    progress=gr.Progress(),
):
    """
    Run speaker diarization and transcription on an uploaded audio file.

    Returns:
        Tuple of (vtt_content, transcripts, audio_filename); empty values
        when no audio path was supplied.
    """
    # Guard clause: nothing to process without an input file.
    if not audio_path:
        return "", [], ""

    def _report(fraction, description):
        # Bridge the processor's (fraction, description) callback to the
        # Gradio progress widget's keyword signature.
        progress(fraction, desc=description)

    audio_processor = AudioProcessor(
        openai_api_key=openai_api_key,
        hf_api_key=hf_api_key,
        transcription_model=transcription_model,
        pyannote_model=pyannote_model,
        whisper_prompt=openai_whisper_prompt,
        whisper_language=openai_whisper_language,
    )

    return audio_processor.process(audio_path=audio_path, progress_callback=_report)


def rename_speaker_in_vtt(
    vtt_content: str, transcripts_state, old_speaker: str, new_speaker: str
):
    """
    Rename a speaker across all transcript entries and regenerate the VTT.

    Args:
        vtt_content: Current VTT text; returned unchanged when a rename
            cannot be performed.
        transcripts_state: Transcript entries held in Gradio state.
        old_speaker: Existing speaker label to replace (e.g. SPEAKER_00).
        new_speaker: Replacement label.

    Returns:
        Regenerated VTT content, or the original content when inputs are
        missing or either speaker name is blank.
    """
    if not vtt_content or not transcripts_state:
        return vtt_content

    # Guard against blank/whitespace names: clicking "Rename" with an empty
    # textbox must not wipe speaker labels to empty strings.
    if not old_speaker or not old_speaker.strip():
        return vtt_content
    if not new_speaker or not new_speaker.strip():
        return vtt_content

    return SpeakerManager.rename_speaker(transcripts_state, old_speaker, new_speaker)


def prepare_download(vtt_content: str, audio_filename: str) -> str | None:
    """
    Prepare VTT file for download.

    Args:
        vtt_content: VTT content as string
        audio_filename: Base filename for the audio

    Returns:
        Path to temporary VTT file, or None if inputs are invalid
    """
    if not vtt_content:
        return None

    if not audio_filename:
        audio_filename = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Create a unique temp directory to avoid caching issues
    temp_dir = Path(tempfile.mkdtemp())
    download_path = temp_dir / f"{audio_filename}.vtt"

    with open(download_path, "w", encoding="utf-8") as f:
        f.write(vtt_content)

    return str(download_path)


with gr.Blocks(title="Transcription & Diarization") as app:

    gr.Markdown(
        """
                # ๐ŸŽ™๏ธ Transcription & Diarization
                Fill the required settings, upload an audio file, and start the transcription using Whisper and Pyannote!
                """
    )

    transcripts_state = gr.State([])
    audio_filename_state = gr.State("")

    with gr.Row():
        with gr.Column():
            with gr.Accordion("โš™๏ธ Settings", open=True):
                openapi_api_key = gr.Textbox(label="OpenAI API key")
                hf_api_key = gr.Textbox(label="Hugging Face API key")

            with gr.Accordion("โš™๏ธ Additional settings", open=False):
                transcription_model = gr.Dropdown(
                    label="Transcription model",
                    choices=[("Whisper", "whisper-1")],
                    value="whisper-1",
                )
                pyannote_model = gr.Dropdown(
                    label="Pyannote model",
                    choices=[
                        (
                            "Speaker diarization community 1",
                            "pyannote/speaker-diarization-community-1",
                        )
                    ],
                    value="pyannote/speaker-diarization-community-1",
                )

                openai_whisper_prompt = gr.Textbox(
                    label="Additional whisper prompt", value=""
                )
                openai_whisper_language = gr.Dropdown(
                    label="Whisper language",
                    choices=[
                        ("Default (Auto-detect)", None),
                        ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
                        ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
                        ("๐Ÿ‡ฌ๐Ÿ‡ง English", "en"),
                        ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
                        ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
                    ],
                    value=None,
                )

            audio_input = gr.Audio(type="filepath", label="Upload audio")
            submit_btn = gr.Button("Transcript", variant="primary", interactive=False)

        with gr.Column():
            with gr.Group():
                output_vtt = gr.Code(
                    label="Transcription",
                    max_lines=40,
                    wrap_lines=True,
                )

                validation_status = gr.Markdown("โšช No content", container=True)

            download_btn = gr.DownloadButton(
                "Download VTT", variant="primary", visible=False
            )

            with gr.Accordion("๐ŸŽญ Rename speakers", open=True):
                with gr.Row():
                    old_speaker_name = gr.Textbox(
                        label="Current speaker name (e.g., SPEAKER_00)",
                        placeholder="SPEAKER_00",
                        value="SPEAKER_00",
                    )
                    new_speaker_name = gr.Textbox(
                        label="New speaker name", placeholder="Davide"
                    )

                rename_btn = gr.Button("Rename")

    def check_inputs(openai_key: str, hf_key: str, audio) -> gr.Button:
        """
        Enable the submit button only when both API keys and an audio file
        have been provided.

        Args:
            openai_key: OpenAI API key
            hf_key: Hugging Face API key
            audio: Audio file path

        Returns:
            Button component with updated interactive state
        """
        all_present = all((openai_key, hf_key, audio))
        return gr.Button(interactive=all_present)

    def update_validation(vtt_content: str, audio_filename: str):
        """
        Refresh the validation message and download button whenever the VTT
        content changes.

        Args:
            vtt_content: VTT content to validate
            audio_filename: Audio filename for download

        Returns:
            Tuple of (status_message, download_file)
        """
        status, status_type = validate_vtt(vtt_content)

        # A download file is prepared only for valid, non-empty VTT content.
        download_file = (
            prepare_download(vtt_content, audio_filename)
            if status_type == "success" and vtt_content
            else None
        )

        download_button = gr.DownloadButton(
            value=download_file, visible=bool(download_file), interactive=True
        )
        return status, download_button

    # Enable/disable submit button based on API keys and audio input.
    # All three inputs share the same handler so the button state is
    # re-evaluated no matter which field changed.
    openapi_api_key.change(
        fn=check_inputs,
        inputs=[openapi_api_key, hf_api_key, audio_input],
        outputs=submit_btn,
    )
    hf_api_key.change(
        fn=check_inputs,
        inputs=[openapi_api_key, hf_api_key, audio_input],
        outputs=submit_btn,
    )
    audio_input.change(
        fn=check_inputs,
        inputs=[openapi_api_key, hf_api_key, audio_input],
        outputs=submit_btn,
    )

    # Main transcription process: fills the VTT editor plus both state slots.
    # Input order must match process_audio's parameter order.
    submit_btn.click(
        fn=process_audio,
        inputs=[
            audio_input,
            openapi_api_key,
            hf_api_key,
            transcription_model,
            pyannote_model,
            openai_whisper_prompt,
            openai_whisper_language,
        ],
        outputs=[output_vtt, transcripts_state, audio_filename_state],
    )

    # Real-time VTT validation and button state management.
    # We need to update validation whenever VTT content OR filename changes:
    # .input fires on user edits to the code editor, .change on the state slot.
    output_vtt.input(
        fn=update_validation,
        inputs=[output_vtt, audio_filename_state],
        outputs=[validation_status, download_btn],
    )

    audio_filename_state.change(
        fn=update_validation,
        inputs=[output_vtt, audio_filename_state],
        outputs=[validation_status, download_btn],
    )

    # Speaker renaming: rewrites the VTT editor content in place.
    rename_btn.click(
        fn=rename_speaker_in_vtt,
        inputs=[output_vtt, transcripts_state, old_speaker_name, new_speaker_name],
        outputs=output_vtt,
    )

if __name__ == "__main__":
    # Start the Gradio server when run as a script (not on import).
    app.launch()