lucamartinelli committed
Commit dd5bcef · 1 Parent(s): a7c3098
.gitattributes CHANGED
@@ -32,4 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ *.vtt
+ *.mp3
+ *.wav
+ .venv
+ .env
+ tmp
+ __pycache__
README.md CHANGED
@@ -7,6 +7,7 @@ sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
  pinned: false
+ python_version: 3.13
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
main.py ADDED
@@ -0,0 +1,245 @@
+ """Whisper + Pyannote Transcription & Diarization Web Interface."""
+ import logging
+ import tempfile
+ from pathlib import Path
+
+ import gradio as gr
+
+ from src.audio_processor import AudioProcessor
+ from src.speaker_manager import SpeakerManager
+ from src.vtt_utils import clean_vtt, validate_vtt
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def process_audio(
+     audio_path: str,
+     openai_api_key: str,
+     hf_api_key: str,
+     transcription_model: str,
+     pyannote_model: str,
+     openai_whisper_prompt: str,
+     openai_whisper_language: str | None,
+     progress=gr.Progress()
+ ):
+     """
+     Process audio file with diarization and transcription.
+
+     Returns:
+         Tuple of (vtt_content, transcripts, audio_filename)
+     """
+     if not audio_path:
+         return "", [], ""
+
+     processor = AudioProcessor(
+         openai_api_key=openai_api_key,
+         hf_api_key=hf_api_key,
+         transcription_model=transcription_model,
+         pyannote_model=pyannote_model,
+         whisper_prompt=openai_whisper_prompt,
+         whisper_language=openai_whisper_language
+     )
+
+     return processor.process(
+         audio_path=audio_path,
+         progress_callback=lambda p, desc: progress(p, desc=desc)
+     )
+
+
+ def rename_speaker_in_vtt(vtt_content: str, transcripts_state, old_speaker: str, new_speaker: str):
+     """Rename speaker and regenerate VTT."""
+     if not vtt_content or not transcripts_state:
+         return vtt_content
+
+     return SpeakerManager.rename_speaker(transcripts_state, old_speaker, new_speaker)
+
+
+ def prepare_download(vtt_content: str, audio_filename: str) -> gr.File | None:
+     """
+     Prepare VTT file for download.
+
+     Args:
+         vtt_content: VTT content as string
+         audio_filename: Base filename for the audio
+
+     Returns:
+         gr.File update pointing at a temporary VTT file, or None if inputs are invalid
+     """
+     if not vtt_content or not audio_filename:
+         return None
+
+     download_path = Path(tempfile.gettempdir()) / f"{audio_filename}.vtt"
+
+     with open(download_path, 'w', encoding='utf-8') as f:
+         f.write(vtt_content)
+
+     # Reveal the (initially hidden) file component along with its new value
+     return gr.File(value=str(download_path), visible=True)
+
+
+ with gr.Blocks(title="Transcription & Diarization") as app:
+
+     gr.Markdown("""
+     # 🎙️ Transcription & Diarization
+     Fill in the required settings, upload an audio file, and start the transcription using Whisper and Pyannote!
+     """)
+
+     transcripts_state = gr.State([])
+     audio_filename_state = gr.State("")
+
+     with gr.Row():
+         with gr.Column():
+             with gr.Accordion("⚙️ Settings", open=True):
+                 openai_api_key = gr.Textbox(label="OpenAI API key", type="password")
+                 hf_api_key = gr.Textbox(label="Hugging Face API key", type="password")
+
+             with gr.Accordion("⚙️ Additional settings", open=False):
+                 transcription_model = gr.Dropdown(
+                     label="Transcription model",
+                     choices=[("Whisper", "whisper-1")],
+                     value="whisper-1"
+                 )
+                 pyannote_model = gr.Dropdown(
+                     label="Pyannote model",
+                     choices=[("Speaker diarization community 1", "pyannote/speaker-diarization-community-1")],
+                     value="pyannote/speaker-diarization-community-1"
+                 )
+
+                 openai_whisper_prompt = gr.Textbox(label="Additional Whisper prompt", value="")
+                 openai_whisper_language = gr.Dropdown(
+                     label="Whisper language",
+                     choices=[
+                         ("Default (Auto-detect)", None),
+                         ("🇮🇹 Italian", "it"),
+                         ("🇩🇪 German", "de"),
+                         ("🇬🇧 English", "en"),
+                         ("🇪🇸 Spanish", "es"),
+                         ("🇫🇷 French", "fr"),
+                     ],
+                     value=None
+                 )
+
+             audio_input = gr.Audio(type="filepath", label="Upload audio")
+             submit_btn = gr.Button("Transcribe", variant="primary", interactive=False)
+
+         with gr.Column():
+             with gr.Group():
+                 output_vtt = gr.Textbox(
+                     label="Transcription",
+                     lines=20,
+                     placeholder="Your transcription will appear here...",
+                     buttons=["copy"],
+                     container=False,
+                 )
+
+                 validation_status = gr.Markdown("⚪ No content", container=True)
+
+             with gr.Row():
+                 clean_btn = gr.Button("Clean & improve VTT", variant="secondary", interactive=False)
+                 download_file = gr.File(label="Download VTT", visible=False)
+                 download_btn = gr.Button("Download VTT", variant="secondary", interactive=False)
+
+             with gr.Accordion("🎭 Rename speakers", open=False):
+                 with gr.Row():
+                     old_speaker_name = gr.Textbox(label="Current speaker name (e.g., SPEAKER_00)", placeholder="SPEAKER_00", value="SPEAKER_00")
+                     new_speaker_name = gr.Textbox(label="New speaker name", placeholder="Davide")
+
+                 rename_btn = gr.Button("Rename")
+
+     def check_inputs(openai_key: str, hf_key: str, audio) -> gr.Button:
+         """
+         Enable submit button only if both API keys and audio are provided.
+
+         Args:
+             openai_key: OpenAI API key
+             hf_key: Hugging Face API key
+             audio: Audio file path
+
+         Returns:
+             Button component with updated interactive state
+         """
+         is_ready = bool(openai_key and hf_key and audio)
+         return gr.Button(interactive=is_ready)
+
+     def update_validation(vtt_content: str):
+         """
+         Update validation status and button states when VTT content changes.
+
+         Args:
+             vtt_content: VTT content to validate
+
+         Returns:
+             Tuple of (status_message, clean_button, download_button)
+         """
+         status, status_type = validate_vtt(vtt_content)
+
+         # Enable buttons only if VTT is valid
+         is_valid = status_type == "success"
+
+         return (
+             status,
+             gr.Button(interactive=is_valid),  # clean_btn
+             gr.Button(interactive=is_valid)   # download_btn
+         )
+
+     # Enable/disable submit button based on API keys and audio input
+     openai_api_key.change(
+         fn=check_inputs,
+         inputs=[openai_api_key, hf_api_key, audio_input],
+         outputs=submit_btn
+     )
+     hf_api_key.change(
+         fn=check_inputs,
+         inputs=[openai_api_key, hf_api_key, audio_input],
+         outputs=submit_btn
+     )
+     audio_input.change(
+         fn=check_inputs,
+         inputs=[openai_api_key, hf_api_key, audio_input],
+         outputs=submit_btn
+     )
+
+     # Main transcription process
+     submit_btn.click(
+         fn=process_audio,
+         inputs=[
+             audio_input,
+             openai_api_key,
+             hf_api_key,
+             transcription_model,
+             pyannote_model,
+             openai_whisper_prompt,
+             openai_whisper_language
+         ],
+         outputs=[output_vtt, transcripts_state, audio_filename_state],
+     )
+
+     # Real-time VTT validation and button state management
+     output_vtt.change(
+         fn=update_validation,
+         inputs=[output_vtt],
+         outputs=[validation_status, clean_btn, download_btn]
+     )
+
+     # VTT cleaning and improvement
+     clean_btn.click(
+         fn=clean_vtt,
+         inputs=[output_vtt],
+         outputs=[output_vtt]
+     )
+
+     # VTT file download
+     download_btn.click(
+         fn=prepare_download,
+         inputs=[output_vtt, audio_filename_state],
+         outputs=[download_file]
+     )
+
+     # Speaker renaming
+     rename_btn.click(
+         fn=rename_speaker_in_vtt,
+         inputs=[output_vtt, transcripts_state, old_speaker_name, new_speaker_name],
+         outputs=output_vtt
+     )
+
+ if __name__ == "__main__":
+     app.launch()
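
A quick local smoke test, assuming the dependencies from `pyproject.toml` are installed (e.g. `poetry install` then `poetry run python main.py`): the app starts a local Gradio server, and the Transcribe button stays disabled until both API keys and an audio file are provided, as wired up by `check_inputs` above.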
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,33 @@
+ [project]
+ name = "whisper-diarization"
+ version = "0.1.0"
+ description = ""
+ authors = [
+     {name = "Luca Martinelli", email = "martinelliluca98@gmail.com"}
+ ]
+ readme = "README.md"
+ requires-python = ">=3.11,<3.14"
+ dependencies = [
+     "openai (>=2.8.1,<3.0.0)",
+     "pydantic (>=2.12.4,<3.0.0)",
+     "pydub (>=0.25.1,<0.26.0)",
+     "pyannote-audio (>=4.0.2,<5.0.0)",
+     "audioop-lts (>=0.2.2,<0.3.0)",
+     "pydantic-settings (>=2.12.0,<3.0.0)",
+     "webvtt-py (>=0.5.1,<0.6.0)",
+     "numpy (>=2.2.2)",
+     "huggingface-hub (<1.0.0)",
+     "scipy (>=1.14.0)",
+     "gradio (>=6.0.0,<7.0.0)"
+ ]
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.poetry]
+ package-mode = false
+
+ [tool.poetry.dependencies]
+ audioop-lts = { version = ">=0.2.2,<0.3.0", python = ">=3.13" }
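
A note on the doubled `audioop-lts` pin: `pydub` still imports the stdlib `audioop` module, which was removed in Python 3.13 (PEP 594), so the backport is needed on the `python_version: 3.13` runtime the README now pins. The `[tool.poetry.dependencies]` marker restricts it to `python = ">=3.13"`, where it applies.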
src/__init__.py ADDED
File without changes
src/audio_processor.py ADDED
@@ -0,0 +1,128 @@
+ """Audio processing and transcription logic."""
+ import logging
+ import shutil
+ import tempfile
+ from pathlib import Path
+ from typing import Callable, List, Tuple
+
+ from src.diarization import get_pipeline
+ from src.vtt import create_vtt
+ from src.whisper import TranscriptSegment, get_transcripts
+
+ logger = logging.getLogger(__name__)
+
+
+ class AudioProcessor:
+     """Handles audio processing, diarization, and transcription."""
+
+     def __init__(
+         self,
+         openai_api_key: str,
+         hf_api_key: str,
+         transcription_model: str,
+         pyannote_model: str,
+         whisper_prompt: str = "",
+         whisper_language: str | None = None
+     ):
+         """
+         Initialize AudioProcessor.
+
+         Args:
+             openai_api_key: OpenAI API key for Whisper
+             hf_api_key: Hugging Face API key for Pyannote
+             transcription_model: Model name for transcription
+             pyannote_model: Model name for diarization
+             whisper_prompt: Optional prompt for Whisper
+             whisper_language: Optional language code for Whisper
+         """
+         self.openai_api_key = openai_api_key
+         self.hf_api_key = hf_api_key
+         self.transcription_model = transcription_model
+         self.pyannote_model = pyannote_model
+         self.whisper_prompt = whisper_prompt
+         self.whisper_language = whisper_language
+
+     def process(
+         self,
+         audio_path: str | Path,
+         progress_callback: Callable[[float, str], None] | None = None
+     ) -> Tuple[str, List[TranscriptSegment], str]:
+         """
+         Process audio file: diarization + transcription.
+
+         Args:
+             audio_path: Path to audio file
+             progress_callback: Optional callback for progress updates (progress, description)
+
+         Returns:
+             Tuple of (vtt_content, transcripts, audio_filename)
+         """
+         if not audio_path:
+             return "", [], ""
+
+         audio_path = Path(audio_path).absolute()
+         tmp_dir = Path(tempfile.mkdtemp(prefix="whisper_diarization_"))
+         logger.info(f"📁 Created temporary directory: {tmp_dir}")
+
+         try:
+             # Step 1: Diarization
+             if progress_callback:
+                 progress_callback(0, "Loading diarization model...")
+             logger.info("🔄 Starting diarization process")
+
+             audio_segment, diarization = get_pipeline(
+                 audio_path,
+                 self.hf_api_key,
+                 self.pyannote_model,
+                 tmp_dir
+             )
+
+             if progress_callback:
+                 progress_callback(0.3, "Diarization complete. Starting transcription...")
+             logger.info("✅ Diarization complete")
+
+             # Step 2: Transcription
+             total_segments = sum(1 for _ in diarization.speaker_diarization.itertracks())
+             logger.info(f"📊 Found {total_segments} segments to transcribe")
+
+             def transcription_progress(i: int, total: int):
+                 if progress_callback:
+                     progress_callback(
+                         0.3 + (0.6 * i / total),
+                         f"Transcribing segment {i}/{total}..."
+                     )
+
+             transcripts = get_transcripts(
+                 diarization,
+                 audio_segment,
+                 self.openai_api_key,
+                 self.transcription_model,
+                 self.whisper_prompt,
+                 self.whisper_language,
+                 tmp_dir,
+                 progress_callback=transcription_progress
+             )
+
+             # Step 3: Create VTT
+             if progress_callback:
+                 progress_callback(0.9, "Creating VTT file...")
+             logger.info("📝 Creating VTT file")
+
+             vtt = create_vtt(transcripts)
+
+             if progress_callback:
+                 progress_callback(1.0, "Complete!")
+             logger.info("✅ Process complete")
+
+             audio_filename = audio_path.stem
+             return vtt.content, transcripts, audio_filename
+
+         finally:
+             # Cleanup
+             if progress_callback:
+                 progress_callback(0.95, "Cleaning up temporary files...")
+             logger.info("🧹 Cleaning up")
+
+             if tmp_dir.exists():
+                 shutil.rmtree(tmp_dir)
+                 logger.info(f"🗑️ Removed temporary directory: {tmp_dir}")
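
For orientation, the progress callback carves the run into fixed phases: diarization owns 0–0.3, transcription owns 0.3–0.9 (segment i of n reports 0.3 + 0.6 · i/n, so segment 5 of 10 lands at 0.60), and VTT generation fills the final 0.9–1.0.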
src/diarization.py ADDED
@@ -0,0 +1,22 @@
+ from pathlib import Path
+ from typing import Tuple
+
+ import torch
+ from pyannote.audio import Pipeline
+ from pydub import AudioSegment
+
+
+ def get_pipeline(filename: str | Path, hf_api_key: str, pyannote_model: str, tmp_dir: Path) -> Tuple[AudioSegment, Pipeline]:
+     pipeline = Pipeline.from_pretrained(
+         pyannote_model,
+         token=hf_api_key,
+     )
+     # Use the GPU when one is available; fall back to CPU otherwise
+     if torch.cuda.is_available():
+         pipeline.to(torch.device("cuda"))
+
+     # Decode whatever format was uploaded and convert it to WAV for pyannote
+     audio_segment = AudioSegment.from_file(filename)
+     wav_audio = tmp_dir.joinpath(Path(filename).name).with_suffix(".wav")
+     audio_segment.export(wav_audio, format="wav")
+
+     return (audio_segment, pipeline(wav_audio))
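
For reference, a minimal usage sketch; the file name and token below are hypothetical placeholders, and the gated pyannote model must have been accepted on the Hub:

```python
from pathlib import Path

from src.diarization import get_pipeline

# Hypothetical inputs: any audio file pydub can decode, plus an HF token
# with access to the gated pyannote model.
audio, result = get_pipeline(
    "interview.mp3",                              # placeholder file
    "hf_xxx",                                     # placeholder token
    "pyannote/speaker-diarization-community-1",
    Path("/tmp"),
)
for turn, _, speaker in result.speaker_diarization.itertracks(yield_label=True):
    print(f"{speaker}: {turn.start:.1f}s - {turn.end:.1f}s")
```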
src/speaker_manager.py ADDED
@@ -0,0 +1,38 @@
+ """Speaker management utilities."""
+ from typing import List
+
+ from src.vtt import create_vtt
+ from src.whisper import TranscriptSegment
+
+
+ class SpeakerManager:
+     """Manages speaker renaming operations."""
+
+     @staticmethod
+     def rename_speaker(
+         transcripts: List[TranscriptSegment],
+         old_speaker: str,
+         new_speaker: str
+     ) -> str:
+         """
+         Rename a speaker in transcripts and return updated VTT.
+
+         Args:
+             transcripts: List of transcript segments
+             old_speaker: Current speaker name
+             new_speaker: New speaker name
+
+         Returns:
+             Updated VTT content as string
+         """
+         if not transcripts:
+             return ""
+
+         # Update speaker names in place
+         for transcript in transcripts:
+             if transcript.speaker == old_speaker:
+                 transcript.speaker = new_speaker
+
+         # Regenerate VTT with updated speakers
+         vtt = create_vtt(transcripts)
+         return vtt.content
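
As used by the Rename button in `main.py`; a one-line sketch, assuming `transcripts` came back from `AudioProcessor.process()`:

```python
updated_vtt = SpeakerManager.rename_speaker(transcripts, "SPEAKER_00", "Davide")
```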
src/vtt.py ADDED
@@ -0,0 +1,32 @@
+ from typing import List
+
+ from webvtt import Caption, WebVTT
+
+ from src.whisper import TranscriptSegment
+
+
+ def format_milliseconds(milliseconds):
+     seconds, milliseconds = divmod(milliseconds, 1000)
+     minutes, seconds = divmod(seconds, 60)
+     hours, minutes = divmod(minutes, 60)
+
+     return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
+
+
+ def create_vtt(transcripts: List[TranscriptSegment]) -> WebVTT:
+     vtt = WebVTT()
+
+     for transcript in transcripts:
+         for x in transcript.transcript.segments:
+             # Whisper segment offsets are seconds relative to the chunk;
+             # transcript.start is the chunk's absolute start in milliseconds
+             start = transcript.start + x.start * 1000
+             end = transcript.start + x.end * 1000
+
+             caption = Caption(
+                 format_milliseconds(start),
+                 format_milliseconds(end),
+                 f"<v {transcript.speaker}>" + x.text,
+             )
+
+             vtt.captions.append(caption)
+
+     return vtt
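
The unit mixing here is deliberate: `transcript.start` is in milliseconds (set as `turn.start * 1000` in `src/whisper.py`), while Whisper segment offsets are in seconds relative to the chunk. A worked example of the conversion:

```python
from src.vtt import format_milliseconds

chunk_start_ms = 83_500        # diarization turn began 83.5 s into the audio
whisper_offset_s = 2.24        # Whisper segment starts 2.24 s into that chunk

absolute_ms = chunk_start_ms + whisper_offset_s * 1000
print(format_milliseconds(absolute_ms))  # 00:01:25.740
```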
src/vtt_utils.py ADDED
@@ -0,0 +1,160 @@
+ """Utilities for VTT validation and cleaning."""
+ import re
+ from typing import Tuple
+
+
+ def parse_timestamp(timestamp_str: str) -> int | None:
+     """
+     Parse timestamp string to milliseconds.
+
+     Args:
+         timestamp_str: Timestamp in format HH:MM:SS.mmm
+
+     Returns:
+         Milliseconds as integer, or None if parsing fails
+     """
+     try:
+         parts = timestamp_str.strip().split(':')
+         hours = int(parts[0])
+         minutes = int(parts[1])
+         seconds_parts = parts[2].split('.')
+         seconds = int(seconds_parts[0])
+         milliseconds = int(seconds_parts[1])
+
+         total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
+         return total_ms
+     except (ValueError, IndexError, AttributeError):
+         return None
+
+
+ def validate_vtt(vtt_content: str) -> Tuple[str, str]:
+     """
+     Validate VTT format and return status message.
+
+     Args:
+         vtt_content: VTT file content as string
+
+     Returns:
+         Tuple of (status_message, status_type) where status_type is "error", "warning", "success", or ""
+     """
+     if not vtt_content or vtt_content.strip() == "":
+         return "⚪ No content", ""
+
+     try:
+         # Check if the content starts with WEBVTT
+         if not vtt_content.strip().startswith("WEBVTT"):
+             return "❌ Invalid: Missing WEBVTT header", "error"
+
+         lines = vtt_content.split('\n')
+         has_timestamps = False
+         timestamps = []
+
+         for line in lines:
+             if '-->' not in line:
+                 continue
+
+             has_timestamps = True
+
+             # Validate timestamp format
+             match = re.match(r'(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})', line)
+             if not match:
+                 return "⚠️ Warning: Malformed timestamp found", "warning"
+
+             # Parse and validate timestamps
+             start_str, end_str = match.groups()
+             start_ms = parse_timestamp(start_str)
+             end_ms = parse_timestamp(end_str)
+
+             if start_ms is None or end_ms is None:
+                 return "⚠️ Warning: Invalid timestamp values", "warning"
+
+             if start_ms >= end_ms:
+                 return "⚠️ Warning: Start timestamp >= end timestamp", "warning"
+
+             timestamps.append((start_ms, end_ms))
+
+         if not has_timestamps:
+             return "❌ Invalid: No timestamps found", "error"
+
+         # Check for overlapping timestamps (assumes cues appear in order)
+         for i in range(len(timestamps) - 1):
+             current_end = timestamps[i][1]
+             next_start = timestamps[i + 1][0]
+             if current_end > next_start:
+                 return "⚠️ Warning: Overlapping timestamps detected", "warning"
+
+         return "✅ Valid VTT format", "success"
+     except Exception as e:
+         return f"❌ Validation error: {str(e)}", "error"
+
+
+ def clean_vtt(vtt_content: str) -> str:
+     """
+     Clean and improve VTT content.
+
+     Improvements:
+     - Capitalizes after sentence-ending punctuation (. ! ?)
+     - Handles cross-segment capitalization intelligently
+     - Removes multiple spaces
+     - Preserves speaker tags
+
+     Args:
+         vtt_content: VTT file content as string
+
+     Returns:
+         Cleaned VTT content
+     """
+     if not vtt_content:
+         return vtt_content
+
+     lines = vtt_content.split('\n')
+     cleaned_lines = []
+     last_text_ended_with_sentence_end = False
+
+     for line in lines:
+         # Keep empty lines and the WEBVTT header untouched
+         if not line.strip() or line.startswith('WEBVTT'):
+             cleaned_lines.append(line)
+             continue
+
+         # Keep timestamp lines untouched
+         if '-->' in line:
+             cleaned_lines.append(line)
+             continue
+
+         # Extract speaker tag if present
+         speaker_tag = ""
+         text_content = line
+         speaker_match = re.match(r'^(<v [^>]+>)\s*(.*)', line)
+         if speaker_match:
+             speaker_tag = speaker_match.group(1)
+             text_content = speaker_match.group(2)
+
+         # Capitalize first letter if previous segment ended with sentence-ending punctuation
+         if last_text_ended_with_sentence_end and text_content and text_content[0].islower():
+             text_content = text_content[0].upper() + text_content[1:]
+
+         # Fix capitalization after punctuation within the same line
+         # (re-emit a single space: the matched \s+ would otherwise be dropped)
+         text_content = re.sub(
+             r'([.!?])\s+([a-z])',
+             lambda m: m.group(1) + ' ' + m.group(2).upper(),
+             text_content
+         )
+
+         # Remove multiple spaces
+         text_content = re.sub(r'\s{2,}', ' ', text_content)
+
+         # Trim leading/trailing spaces
+         text_content = text_content.strip()
+
+         # Rebuild line with speaker tag if it existed
+         cleaned_line = f"{speaker_tag} {text_content}" if speaker_tag else text_content
+
+         # Check if this line ends with sentence-ending punctuation
+         last_text_ended_with_sentence_end = bool(
+             text_content and re.search(r'[.!?]\s*$', text_content)
+         )
+
+         cleaned_lines.append(cleaned_line)
+
+     return '\n'.join(cleaned_lines)
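
A quick sanity check of both entry points; the expected outputs are shown as comments and follow directly from the code above:

```python
from src.vtt_utils import clean_vtt, validate_vtt

sample = """WEBVTT

00:00:00.000 --> 00:00:02.500
<v SPEAKER_00> hello there.  how are you doing today.

00:00:02.500 --> 00:00:04.000
<v SPEAKER_01> i am fine.
"""

print(validate_vtt(sample))  # ('✅ Valid VTT format', 'success')
print(clean_vtt(sample))
# Cue text becomes:
#   <v SPEAKER_00> hello there. How are you doing today.
#   <v SPEAKER_01> I am fine.
```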
src/whisper.py ADDED
@@ -0,0 +1,79 @@
+ from pathlib import Path
+ from typing import Callable, List
+
+ from openai import OpenAI
+ from openai.types.audio import TranscriptionVerbose
+ from pyannote.pipeline import Pipeline
+ from pydantic import BaseModel
+ from pydub import AudioSegment
+
+
+ class TranscriptSegment(BaseModel):
+     audio_file: str | Path
+     speaker: str
+     i: str
+     start: float
+     end: float
+     transcript: TranscriptionVerbose
+
+
+ def get_transcripts(
+     diarization: Pipeline,
+     audio_segment: AudioSegment,
+     openai_api_key: str,
+     whisper_model: str,
+     whisper_prompt: str,
+     whisper_language: str | None,
+     tmp_dir: Path,
+     progress_callback: Callable[[int, int], None] | None = None
+ ) -> List[TranscriptSegment]:
+     client = OpenAI(api_key=openai_api_key)
+
+     transcripts = []
+
+     # Count total segments
+     total_segments = sum(1 for _ in diarization.speaker_diarization.itertracks())
+     segment_index = 0
+
+     for turn, i, speaker in diarization.speaker_diarization.itertracks(yield_label=True):
+         segment_index += 1
+
+         if progress_callback:
+             progress_callback(segment_index, total_segments)
+
+         # Diarization turns are in seconds; pydub slices in milliseconds
+         start = turn.start * 1000
+         end = turn.end * 1000
+
+         chunk = audio_segment[start:end]
+
+         chunk_filename = tmp_dir.joinpath(f"segment-{start}.mp3")
+         chunk.export(chunk_filename, format="mp3")
+
+         # Close the file handle once the API call is done
+         with open(chunk_filename, "rb") as audio_chunk_segment:
+             params = {
+                 "file": audio_chunk_segment,
+                 "model": whisper_model,
+                 "response_format": "verbose_json",
+                 "timestamp_granularities": ["segment"],
+                 "prompt": whisper_prompt,
+             }
+
+             if whisper_language:
+                 params["language"] = whisper_language
+
+             transcript = client.audio.transcriptions.create(**params)
+
+         transcripts.append(
+             TranscriptSegment(
+                 audio_file=chunk_filename,
+                 speaker=speaker,
+                 i=i,
+                 start=start,
+                 end=end,
+                 transcript=transcript,
+             )
+         )
+
+     return transcripts
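
With `response_format="verbose_json"` and `timestamp_granularities=["segment"]`, the OpenAI API returns a `TranscriptionVerbose` whose `.segments` carry `start`/`end` offsets in seconds relative to the uploaded chunk; `src/vtt.py` shifts these by the chunk's absolute start to place the final captions.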