# Source: Hugging Face Space "eubottura/anycoder-ef2321b6" (status: Sleeping)
# File size: 12,247 bytes - commit c2c1bfd

import gradio as gr
import json
import re
from collections import Counter
from datetime import timedelta
from typing import List, Dict, Any, Optional, Tuple

# Language-specific rules, keyed by ISO 639-1 language code.
#
# Each entry provides:
#   trigger_words       - connectives checked by should_break_line()
#   forbidden_endings   - words a line should not end on (articles,
#                         conjunctions, prepositions)
#   sentence_boundaries - characters treated as sentence terminators
#
# Every word appears once per list; the lists are only used for membership
# tests, so repeated entries carried no meaning.
LANGUAGE_RULES = {
    "en": {
        "trigger_words": ["however", "but", "therefore", "meanwhile", "nevertheless"],
        "forbidden_endings": ["a", "an", "the", "and", "but", "or", "for", "nor", "on", "at", "to", "from", "by", "of", "in", "with"],
        "sentence_boundaries": [".", "?", "!"]
    },
    "es": {
        "trigger_words": ["sin embargo", "pero", "por lo tanto", "mientras tanto", "no obstante"],
        "forbidden_endings": ["el", "la", "los", "las", "y", "o", "para", "por", "de", "en", "con", "a"],
        "sentence_boundaries": [".", "?", "!"]
    },
    "fr": {
        "trigger_words": ["cependant", "mais", "donc", "pendant ce temps", "néanmoins"],
        "forbidden_endings": ["le", "la", "les", "et", "ou", "pour", "par", "de", "en", "avec", "à"],
        "sentence_boundaries": [".", "?", "!"]
    }
}

def validate_input(json_input: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """
    Validate the input JSON structure.

    The input is accepted when it parses to a JSON object that has both a
    "text" key and a "chunks" key, and "chunks" is a non-empty list. The
    contents of the individual chunks are not inspected here.

    Args:
        json_input: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_data) where parsed_data is None if invalid
    """
    try:
        data = json.loads(json_input)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError (malformed JSON); TypeError
        # is what json.loads raises for non-string input such as None.
        return False, None

    if not isinstance(data, dict):
        return False, None
    if "text" not in data or "chunks" not in data:
        return False, None

    chunks = data["chunks"]
    if not isinstance(chunks, list) or not chunks:
        return False, None
    return True, data

def format_time(seconds: float) -> str:
    """
    Convert seconds to SRT time format (HH:MM:SS,mmm).

    The hour field is not capped at 23: durations of a day or more keep
    counting hours (e.g. 90000 seconds -> "25:00:00,000"). Fractions of a
    millisecond are dropped, and negative input is clamped to zero because
    SRT timestamps cannot be negative.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted time string
    """
    # Work on the total duration in whole milliseconds. timedelta.seconds
    # alone would discard whole days and wrap the hour field.
    total_ms = timedelta(seconds=seconds) // timedelta(milliseconds=1)
    total_ms = max(total_ms, 0)

    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    whole_seconds, milliseconds = divmod(remainder_ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{milliseconds:03d}"

def count_words(text: str) -> int:
    """
    Count the whitespace-separated tokens in a piece of text.

    Punctuation attached to a word is part of that word's token, and any
    run of whitespace acts as a single separator.

    Args:
        text: Text to count words in

    Returns:
        Number of tokens (0 for empty or whitespace-only text)
    """
    tokens = text.split()
    return len(tokens)

def get_majority_speaker(chunks: List[Dict[str, Any]]) -> Optional[str]:
    """
    Find the speaker who accounts for the most words.

    Only chunks that carry a "speaker" key contribute; each one adds the
    word count of its "text" to that speaker's tally.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        Speaker ID with the highest word tally, or None if no chunk has
        speaker info
    """
    words_by_speaker = Counter()
    for entry in chunks:
        if "speaker" not in entry:
            continue
        words_by_speaker[entry["speaker"]] += count_words(entry["text"])

    if not words_by_speaker:
        return None

    top_speaker, _ = words_by_speaker.most_common(1)[0]
    return top_speaker

def should_break_line(
    line: str,
    language: str,
    word_break_threshold: int,
    max_chars: int = 11
) -> bool:
    """
    Determine if a line should break based on language rules.

    A break is requested when any of the following holds:
      * the line has more than ``word_break_threshold`` words;
      * the line has more than ``max_chars`` non-whitespace characters;
      * the line contains one of the language's trigger words as a whole
        word or phrase (case-insensitive);
      * the line's last word is one of the language's forbidden endings.

    Args:
        line: Text line to check
        language: ISO language code; unknown codes use the English rules
        word_break_threshold: Maximum words per line
        max_chars: Maximum characters per line, not counting whitespace

    Returns:
        True if line should break
    """
    # Check word count threshold
    if count_words(line) > word_break_threshold:
        return True

    # Check character limit (whitespace excluded)
    chars_excluding_spaces = len(re.sub(r'\s+', '', line))
    if chars_excluding_spaces > max_chars:
        return True

    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])
    lowered = line.lower()

    # Check for trigger words. Match on word boundaries so that e.g. "but"
    # does not fire on "button"; re.escape keeps multi-word triggers literal.
    for trigger in rules["trigger_words"]:
        if re.search(rf"\b{re.escape(trigger.lower())}\b", lowered):
            return True

    # Check for forbidden endings (an empty line has no last word)
    words = lowered.split()
    return bool(words) and words[-1] in rules["forbidden_endings"]

def format_speaker_change(speaker_id: str) -> str:
    """
    Build the speaker prefix placed in front of a subtitle's text.

    Args:
        speaker_id: Speaker identifier

    Returns:
        The identifier wrapped in square brackets, followed by one space
    """
    marker = "[{}] ".format(speaker_id)
    return marker

def process_chunks_to_srt(
    chunks: List[Dict[str, Any]],
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> str:
    """
    Convert transcription chunks to SRT format.

    Each chunk must provide "text" and "timestamp" (indexable as
    [start, end], in seconds); "speaker" is optional. Consecutive chunks
    are merged into one subtitle block until one of these happens:
      * the text accumulated so far ends with a sentence boundary
        character for the language;
      * adding the next chunk would exceed ``word_break_threshold`` words;
      * ``include_speaker`` is set and the next chunk names a different
        speaker.

    A chunk whose end timestamp is None reuses its start timestamp as the
    end, so the block can still be formatted.

    Args:
        chunks: List of chunk dictionaries
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code; unknown codes use the English rules
        include_speaker: Whether to include speaker information

    Returns:
        SRT formatted string
    """
    # Resolve the language rules once instead of on every chunk.
    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])
    sentence_endings = tuple(rules["sentence_boundaries"])

    segments = []
    current = None  # Segment being accumulated; None until the first chunk

    for chunk in chunks:
        text = chunk["text"]
        start_time = chunk["timestamp"][0]
        end_time = chunk["timestamp"][1]
        if end_time is None:
            # Open-ended chunk: fall back to a zero-length cue rather than
            # failing later when the time is formatted.
            end_time = start_time
        word_count = count_words(text)

        if current is not None:
            # A sentence boundary is a property of the text already
            # collected: break AFTER the terminator, never before the
            # chunk that follows it.
            ends_sentence = current["parts"][-1].rstrip().endswith(sentence_endings)
            over_threshold = current["words"] + word_count > word_break_threshold
            speaker_changed = (
                include_speaker
                and "speaker" in chunk
                and chunk["speaker"] != current["speaker"]
            )

            if not (ends_sentence or over_threshold or speaker_changed):
                # Continue current segment
                current["parts"].append(text)
                current["words"] += word_count
                current["end"] = end_time
                continue

        # Start a new segment (first chunk, or a break was requested)
        current = {
            "start": start_time,
            "end": end_time,
            "parts": [text],
            "words": word_count,
            "speaker": chunk.get("speaker"),
        }
        segments.append(current)

    # Format segments as SRT
    srt_lines = []
    for index, segment in enumerate(segments, 1):
        start_stamp = format_time(segment["start"])
        end_stamp = format_time(segment["end"])
        text = " ".join(segment["parts"]).strip()

        # Apply speaker marker if needed
        if include_speaker and segment["speaker"]:
            text = format_speaker_change(segment["speaker"]) + text

        srt_lines.append(str(index))
        srt_lines.append(f"{start_stamp} --> {end_stamp}")
        srt_lines.append(text)
        srt_lines.append("")  # Blank line between segments

    return "\n".join(srt_lines).strip()

def convert_transcription(
    json_input: str,
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> Tuple[str, str]:
    """
    Entry point that turns a Transcribe JSON string into SRT text.

    The input is validated first; if that passes, its chunks are handed to
    the SRT builder. Any exception raised while building is reported in
    the status message instead of propagating.

    Args:
        json_input: JSON input string
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code
        include_speaker: Whether to include speaker information

    Returns:
        Tuple of (srt_output, status_message); srt_output is empty on failure
    """
    ok, payload = validate_input(json_input)
    if not ok:
        return "", "Invalid JSON input: Missing required 'text' or 'chunks' fields"

    try:
        subtitles = process_chunks_to_srt(
            payload["chunks"],
            word_break_threshold,
            language,
            include_speaker
        )
    except Exception as exc:
        # UI boundary: surface the failure as a status string.
        return "", f"Error during conversion: {exc!s}"

    return subtitles, "Conversion successful"

# Create Gradio interface: a two-column layout with the JSON input and
# conversion options on the left and the SRT result plus status on the right.
with gr.Blocks() as demo:
    # Page header
    gr.Markdown("# Transcription Format Converter")
    gr.Markdown("Convert Transcribe JSON format to SRT subtitle format with configurable options")
    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")

    with gr.Row():
        with gr.Column():
            # Input section: raw JSON pasted by the user
            json_input = gr.Textbox(
                label="Transcribe JSON Input",
                placeholder='{"text": "Full text", "chunks": [{"text": "Segment 1", "timestamp": [0, 2.5]}, ...]}',
                lines=10
            )

            # Parameters passed straight through to convert_transcription()
            word_break_threshold = gr.Slider(
                minimum=5,
                maximum=20,
                value=10,
                step=1,
                label="Word Break Threshold"
            )

            # Choices mirror the keys defined in LANGUAGE_RULES
            language = gr.Dropdown(
                choices=["en", "es", "fr"],
                value="en",
                label="Language"
            )

            include_speaker = gr.Checkbox(
                label="Include Speaker Information",
                value=False
            )

            convert_btn = gr.Button("Convert to SRT", variant="primary")

        with gr.Column():
            # Output section: first element of convert_transcription()'s result
            srt_output = gr.Textbox(
                label="SRT Output",
                lines=15,
                placeholder="SRT formatted subtitles will appear here..."
            )

            # Second element of the result: success or error message
            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Examples: each row supplies values for the four inputs, in the same
    # order as the `inputs` list (JSON, threshold, language, include_speaker).
    # NOTE(review): cache_examples=True presumably runs convert_transcription
    # on these rows ahead of time — confirm against the installed Gradio version.
    examples = gr.Examples(
        examples=[
            [
                '{"text": "Hello world. This is a test. How are you today?", "chunks": [{"text": "Hello world.", "timestamp": [0, 1.5]}, {"text": "This is a test.", "timestamp": [1.5, 3.2]}, {"text": "How are you today?", "timestamp": [3.2, 5.0]}]}',
                10,
                "en",
                False
            ],
            [
                '{"text": "Hola mundo. Esto es una prueba. ¿Cómo estás hoy?", "chunks": [{"text": "Hola mundo.", "timestamp": [0, 1.5]}, {"text": "Esto es una prueba.", "timestamp": [1.5, 3.2]}, {"text": "¿Cómo estás hoy?", "timestamp": [3.2, 5.0]}]}',
                10,
                "es",
                False
            ]
        ],
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        fn=convert_transcription,
        cache_examples=True,
        label="Examples"
    )

    # Event listener: the button runs the conversion with the current inputs
    # and writes the (srt, status) pair to the two output boxes.
    # NOTE(review): `api_visibility` is not accepted by every Gradio release —
    # confirm the pinned version supports it.
    convert_btn.click(
        fn=convert_transcription,
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        api_visibility="public"
    )

# Launch the app with a customised Soft theme, extra footer links and a small
# CSS override. This runs at import time (there is no __main__ guard).
# NOTE(review): passing theme/footer_links/css to launch() presumably requires
# a Gradio version that accepts them there — confirm the pinned version.
demo.launch(
    # Soft theme with custom hues, font and sizing
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    ).set(
        # Theme variable overrides; "*primary_600" style values refer to
        # other theme variables by name
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
    ),
    # Links passed for the page footer
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Gradio Docs", "url": "https://www.gradio.app/docs"},
        {"label": "GitHub", "url": "https://github.com/gradio-app/gradio"}
    ],
    # Custom CSS: caps and centres the container width, rounds .gr-box corners
    css="""
    .gradio-container {
        max-width: 1200px !important;
        margin: 0 auto !important;
    }
    .gr-box {
        border-radius: 8px !important;
    }
    """
)