"""Gradio app that converts transcription JSON (text + timestamped chunks) to SRT."""

import json
import re
from collections import Counter
from datetime import timedelta
from typing import Any, Dict, List, Optional, Tuple

import gradio as gr

# Language-specific rules and dictionaries, keyed by ISO language code.
# Unknown codes fall back to the "en" entry wherever the rules are looked up.
LANGUAGE_RULES = {
    "en": {
        "trigger_words": ["however", "but", "therefore", "meanwhile", "nevertheless"],
        "forbidden_endings": [
            "a", "an", "the", "and", "but", "or", "for", "nor",
            "on", "at", "to", "from", "by", "of", "in", "with",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
    "es": {
        "trigger_words": ["sin embargo", "pero", "por lo tanto", "mientras tanto", "no obstante"],
        "forbidden_endings": [
            "el", "la", "los", "las", "y", "o", "para", "por", "de", "en", "con", "a",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
    "fr": {
        "trigger_words": ["cependant", "mais", "donc", "pendant ce temps", "néanmoins"],
        "forbidden_endings": [
            "le", "la", "les", "et", "ou", "pour", "par", "de", "en", "avec", "à",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
}

# Maximum number of non-whitespace characters allowed by should_break_line.
# NOTE(review): 11 is very low for a subtitle line -- confirm the intended limit.
_MAX_CHARS_PER_LINE = 11


def validate_input(json_input: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """
    Validate the input JSON structure.

    The payload must be a JSON object with a "text" key and a non-empty
    "chunks" list. The contents of the individual chunks are not inspected.

    Args:
        json_input: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_data) where parsed_data is None if invalid
    """
    try:
        data = json.loads(json_input)
    except json.JSONDecodeError:
        return False, None

    if not isinstance(data, dict):
        return False, None
    if "text" not in data or "chunks" not in data:
        return False, None
    if not isinstance(data["chunks"], list) or not data["chunks"]:
        return False, None
    return True, data


def format_time(seconds: float) -> str:
    """
    Convert seconds to SRT time format (HH:MM:SS,mmm).

    Fractions below one millisecond are truncated. Negative values are
    clamped to zero, and the hour field keeps growing past 24 hours
    instead of wrapping around.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted time string
    """
    # Dividing by a one-millisecond timedelta yields the whole duration in
    # milliseconds, including any full days that `timedelta.seconds` omits.
    total_ms = timedelta(seconds=max(seconds, 0)) // timedelta(milliseconds=1)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def count_words(text: str) -> int:
    """
    Count whitespace-separated tokens in text.

    Punctuation attached to a word counts as part of that word.

    Args:
        text: Text to count words in

    Returns:
        Word count
    """
    return len(text.split())


def get_majority_speaker(chunks: List[Dict[str, Any]]) -> Optional[str]:
    """
    Determine the majority speaker, weighted by the number of words spoken.

    Chunks without a "speaker" key are ignored.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        Majority speaker ID or None if no speaker info
    """
    speaker_counts: Counter = Counter()
    for chunk in chunks:
        if "speaker" in chunk:
            speaker_counts[chunk["speaker"]] += count_words(chunk.get("text", ""))

    if speaker_counts:
        return speaker_counts.most_common(1)[0][0]
    return None


def should_break_line(line: str, language: str, word_break_threshold: int) -> bool:
    """
    Determine if a line should break based on language rules.

    A break is requested when the line exceeds the word threshold, exceeds
    the character limit (whitespace excluded), contains one of the
    language's trigger words, or ends with a forbidden ending.

    Args:
        line: Text line to check
        language: ISO language code (unknown codes use the English rules)
        word_break_threshold: Maximum words per line

    Returns:
        True if line should break
    """
    # Check word count threshold
    if count_words(line) > word_break_threshold:
        return True

    # Check character limit (excluding whitespace)
    if len(re.sub(r"\s+", "", line)) > _MAX_CHARS_PER_LINE:
        return True

    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])

    # Check for trigger words. Match whole words only, so that e.g. "but"
    # does not fire inside "button" or "mais" inside "jamais".
    for trigger in rules["trigger_words"]:
        if re.search(rf"\b{re.escape(trigger)}\b", line, flags=re.IGNORECASE):
            return True

    # Check for forbidden endings
    words = line.split()
    last_word = words[-1].lower() if words else ""
    return last_word in rules["forbidden_endings"]


def format_speaker_change(speaker_id: str) -> str:
    """
    Format speaker identifier for SRT.

    Args:
        speaker_id: Speaker identifier

    Returns:
        Formatted speaker marker, e.g. "[SPEAKER_00] "
    """
    return f"[{speaker_id}] "


def _make_segment(
    texts: List[str],
    start: float,
    end: float,
    speaker: Optional[str],
) -> Dict[str, Any]:
    """Bundle accumulated chunk texts and timing into one segment dictionary."""
    # Strip each piece before joining so chunks that carry their own leading
    # space do not produce doubled spaces in the subtitle text.
    pieces = [piece.strip() for piece in texts]
    return {
        "start": start,
        "end": end,
        "text": " ".join(piece for piece in pieces if piece),
        "speaker": speaker,
    }


def process_chunks_to_srt(
    chunks: List[Dict[str, Any]],
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> str:
    """
    Convert transcription chunks to SRT format.

    Consecutive chunks are merged into one subtitle block until the
    accumulated text ends a sentence, adding the next chunk would exceed
    the word threshold, or (when speaker output is enabled) the speaker
    changes.

    Args:
        chunks: List of chunk dictionaries with "text" and "timestamp"
            ([start, end] in seconds) and optionally "speaker"
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code (unknown codes use the English rules)
        include_speaker: Whether to include speaker information

    Returns:
        SRT formatted string
    """
    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])
    boundaries = tuple(rules["sentence_boundaries"])

    srt_segments: List[Dict[str, Any]] = []
    current_segment: List[str] = []
    current_words = 0
    current_speaker = None
    current_start_time = None
    current_end_time = None

    # Process chunks to create segments
    for chunk in chunks:
        text = chunk["text"]
        start_time = chunk["timestamp"][0]
        end_time = chunk["timestamp"][1]

        # A timestamp may be null in the input (presumably when the
        # transcriber could not determine it -- typically the final end
        # time). Fall back to the nearest known time so formatting cannot fail.
        if start_time is None:
            start_time = current_end_time if current_end_time is not None else 0.0
        if end_time is None:
            end_time = start_time

        words_in_chunk = count_words(text)

        # Initialize current segment with first chunk
        if not current_segment:
            current_segment = [text]
            current_words = words_in_chunk
            current_start_time = start_time
            current_end_time = end_time
            current_speaker = chunk.get("speaker")
            continue

        # Break when the text accumulated so far already ends a sentence.
        ends_sentence = current_segment[-1].rstrip().endswith(boundaries)
        # Break when adding this chunk would exceed the word threshold.
        too_long = current_words + words_in_chunk > word_break_threshold
        # Break on speaker change (if speaker info available).
        speaker_changed = (
            include_speaker
            and "speaker" in chunk
            and chunk["speaker"] != current_speaker
        )

        if ends_sentence or too_long or speaker_changed:
            # Finalize current segment and start a new one with this chunk
            srt_segments.append(
                _make_segment(
                    current_segment, current_start_time, current_end_time, current_speaker
                )
            )
            current_segment = [text]
            current_words = words_in_chunk
            current_start_time = start_time
            current_end_time = end_time
            current_speaker = chunk.get("speaker")
        else:
            # Continue current segment
            current_segment.append(text)
            current_words += words_in_chunk
            current_end_time = end_time

    # Add final segment
    if current_segment:
        srt_segments.append(
            _make_segment(
                current_segment, current_start_time, current_end_time, current_speaker
            )
        )

    # Format segments as SRT
    srt_lines = []
    for index, segment in enumerate(srt_segments, 1):
        text = segment["text"]

        # Apply speaker marker if needed
        if include_speaker and segment["speaker"]:
            text = format_speaker_change(segment["speaker"]) + text

        # Format SRT block
        srt_lines.append(str(index))
        srt_lines.append(
            f"{format_time(segment['start'])} --> {format_time(segment['end'])}"
        )
        srt_lines.append(text)
        srt_lines.append("")  # Blank line between segments

    return "\n".join(srt_lines).strip()


def convert_transcription(
    json_input: str,
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> Tuple[str, str]:
    """
    Main conversion function from Transcribe JSON to SRT.

    Args:
        json_input: JSON input string
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code
        include_speaker: Whether to include speaker information

    Returns:
        Tuple of (srt_output, status_message); srt_output is empty on failure
    """
    # Validate input
    is_valid, data = validate_input(json_input)
    if not is_valid:
        return "", "Invalid JSON input: Missing required 'text' or 'chunks' fields"

    # Process chunks to SRT. This is the UI boundary, so any failure caused
    # by malformed chunks is reported in the status box instead of raised.
    try:
        srt_output = process_chunks_to_srt(
            data["chunks"],
            word_break_threshold,
            language,
            include_speaker
        )
    except Exception as e:
        return "", f"Error during conversion: {str(e)}"
    return srt_output, "Conversion successful"


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Transcription Format Converter")
    gr.Markdown("Convert Transcribe JSON format to SRT subtitle format with configurable options")
    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")

    with gr.Row():
        with gr.Column():
            # Input section
            json_input = gr.Textbox(
                label="Transcribe JSON Input",
                placeholder='{"text": "Full text", "chunks": [{"text": "Segment 1", "timestamp": [0, 2.5]}, ...]}',
                lines=10
            )

            # Parameters
            word_break_threshold = gr.Slider(
                minimum=5,
                maximum=20,
                value=10,
                step=1,
                label="Word Break Threshold"
            )

            language = gr.Dropdown(
                choices=["en", "es", "fr"],
                value="en",
                label="Language"
            )

            include_speaker = gr.Checkbox(
                label="Include Speaker Information",
                value=False
            )

            convert_btn = gr.Button("Convert to SRT", variant="primary")

        with gr.Column():
            # Output section
            srt_output = gr.Textbox(
                label="SRT Output",
                lines=15,
                placeholder="SRT formatted subtitles will appear here..."
            )

            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Examples
    examples = gr.Examples(
        examples=[
            [
                '{"text": "Hello world. This is a test. How are you today?", "chunks": [{"text": "Hello world.", "timestamp": [0, 1.5]}, {"text": "This is a test.", "timestamp": [1.5, 3.2]}, {"text": "How are you today?", "timestamp": [3.2, 5.0]}]}',
                10,
                "en",
                False
            ],
            [
                '{"text": "Hola mundo. Esto es una prueba. ¿Cómo estás hoy?", "chunks": [{"text": "Hola mundo.", "timestamp": [0, 1.5]}, {"text": "Esto es una prueba.", "timestamp": [1.5, 3.2]}, {"text": "¿Cómo estás hoy?", "timestamp": [3.2, 5.0]}]}',
                10,
                "es",
                False
            ]
        ],
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        fn=convert_transcription,
        cache_examples=True,
        label="Examples"
    )

    # Event listener
    convert_btn.click(
        fn=convert_transcription,
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        api_visibility="public"
    )

# Launch with modern theme and styling
demo.launch(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    ).set(
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
    ),
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Gradio Docs", "url": "https://www.gradio.app/docs"},
        {"label": "GitHub", "url": "https://github.com/gradio-app/gradio"}
    ],
    css="""
    .gradio-container {
        max-width: 1200px !important;
        margin: 0 auto !important;
    }
    .gr-box {
        border-radius: 8px !important;
    }
    """
)