Spaces: Sleeping
| import gradio as gr | |
| import json | |
| import re | |
| from collections import Counter | |
| from datetime import timedelta | |
| from typing import List, Dict, Any, Optional, Tuple | |
# Language-specific rules and dictionaries.
# Keyed by ISO language code. Each entry provides:
#   trigger_words       - words/phrases that should_break_line treats as a
#                         reason to break a line
#   forbidden_endings   - words a line should not end on (articles,
#                         conjunctions, prepositions)
#   sentence_boundaries - characters that terminate a sentence
# Lookups elsewhere in this file fall back to the "en" entry for unknown codes.
LANGUAGE_RULES = {
    "en": {
        "trigger_words": ["however", "but", "therefore", "meanwhile", "nevertheless"],
        "forbidden_endings": [
            "a", "an", "the", "and", "but", "or", "for", "nor",
            "on", "at", "to", "from", "by", "of", "in", "with",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
    "es": {
        "trigger_words": ["sin embargo", "pero", "por lo tanto", "mientras tanto", "no obstante"],
        # Duplicate "de" / "por" entries removed; membership tests are unaffected.
        "forbidden_endings": [
            "el", "la", "los", "las", "y", "o",
            "para", "por", "de", "en", "con", "a",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
    "fr": {
        "trigger_words": ["cependant", "mais", "donc", "pendant ce temps", "néanmoins"],
        # Duplicate "de" / "par" entries removed; membership tests are unaffected.
        "forbidden_endings": [
            "le", "la", "les", "et", "ou",
            "pour", "par", "de", "en", "avec", "à",
        ],
        "sentence_boundaries": [".", "?", "!"],
    },
}
def validate_input(json_input: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """
    Validate the input JSON structure.

    The input is accepted when it parses to a JSON object that has both a
    "text" key and a "chunks" key, and "chunks" is a non-empty list. The
    contents of the individual chunks are not inspected here.

    Args:
        json_input: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_data) where parsed_data is None if invalid
    """
    try:
        data = json.loads(json_input)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. None).
        # ValueError: covers json.JSONDecodeError for malformed JSON.
        return False, None

    if not isinstance(data, dict):
        return False, None
    if "text" not in data or "chunks" not in data:
        return False, None

    chunks = data["chunks"]
    if not isinstance(chunks, list) or not chunks:
        return False, None
    return True, data
def format_time(seconds: float) -> str:
    """
    Convert seconds to SRT time format (HH:MM:SS,mmm).

    Fractions below one millisecond are truncated. The hour field is not
    wrapped at 24, so durations of a day or more keep counting hours
    (e.g. 90000 seconds -> "25:00:00,000"). Negative inputs are clamped
    to zero.

    Args:
        seconds: Time in seconds

    Returns:
        Formatted time string
    """
    # Clamp negatives: a negative timedelta normalises to days=-1 plus a
    # large positive seconds field, which would print a bogus time.
    elapsed = timedelta(seconds=max(seconds, 0))
    # Whole milliseconds across the full duration (days included), unlike
    # timedelta.seconds which only holds the sub-day remainder.
    total_ms = elapsed // timedelta(milliseconds=1)
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    secs, millis = divmod(rest, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def count_words(text: str) -> int:
    """
    Count the whitespace-separated tokens in text.

    Punctuation attached to a word is part of that word's token; runs of
    whitespace separate tokens and are never counted themselves.

    Args:
        text: Text to count words in

    Returns:
        Number of tokens (0 for empty or whitespace-only text)
    """
    tokens = text.split()
    return len(tokens)
def get_majority_speaker(chunks: List[Dict[str, Any]]) -> Optional[str]:
    """
    Find the speaker who contributes the most words.

    Only chunks carrying a "speaker" key take part; each one adds the word
    count of its "text" to that speaker's tally.

    Args:
        chunks: List of chunk dictionaries

    Returns:
        Speaker ID with the highest word tally, or None when no chunk
        carries speaker information
    """
    words_by_speaker = Counter()
    for entry in chunks:
        if "speaker" not in entry:
            continue
        words_by_speaker[entry["speaker"]] += count_words(entry["text"])

    if not words_by_speaker:
        return None
    top_speaker, _tally = words_by_speaker.most_common(1)[0]
    return top_speaker
def should_break_line(
    line: str,
    language: str,
    word_break_threshold: int,
    max_chars: int = 11
) -> bool:
    """
    Determine if a line should break based on language rules.

    A break is requested when any of these holds:
    - the line has more than `word_break_threshold` words,
    - the line has more than `max_chars` non-whitespace characters,
    - the line contains one of the language's trigger words as a whole
      word or phrase (case-insensitive),
    - the last word of the line is one of the language's forbidden endings.

    Args:
        line: Text line to check
        language: ISO language code; unknown codes use the "en" rules
        word_break_threshold: Maximum words per line
        max_chars: Maximum characters per line, not counting whitespace

    Returns:
        True if line should break
    """
    # Check word count threshold
    if count_words(line) > word_break_threshold:
        return True

    # Check character limit (whitespace excluded)
    if len(re.sub(r'\s+', '', line)) > max_chars:
        return True

    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])

    # Check for trigger words. Word boundaries keep short triggers such as
    # "but" from matching inside longer words such as "button".
    lowered = line.lower()
    if any(
        re.search(r"\b" + re.escape(trigger.lower()) + r"\b", lowered)
        for trigger in rules["trigger_words"]
    ):
        return True

    # Check for forbidden endings
    words = line.split()
    last_word = words[-1].lower() if words else ""
    return last_word in rules["forbidden_endings"]
def format_speaker_change(speaker_id: str) -> str:
    """
    Build the speaker prefix placed in front of a subtitle's text.

    Args:
        speaker_id: Speaker identifier

    Returns:
        The identifier wrapped in square brackets, followed by one space
    """
    marker = "[{}] ".format(speaker_id)
    return marker
def _close_segment(
    texts: List[str],
    start: Any,
    end: Any,
    speaker: Any
) -> Dict[str, Any]:
    """Bundle the accumulated chunk texts and timing into one segment record."""
    return {
        "start": start,
        "end": end,
        "text": " ".join(texts).strip(),
        "speaker": speaker,
    }


def process_chunks_to_srt(
    chunks: List[Dict[str, Any]],
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> str:
    """
    Convert transcription chunks to SRT format.

    Consecutive chunks are merged into one subtitle block. A new block is
    started before a chunk when any of these holds:
    - the text gathered so far ends with a sentence boundary character of
      the selected language,
    - adding the chunk would push the block past `word_break_threshold`
      words,
    - `include_speaker` is set, the chunk has a "speaker" key, and its value
      differs from the speaker of the current block.

    Each chunk must provide "text" and a "timestamp" pair [start, end]; a
    missing key raises KeyError. A null end timestamp falls back to the
    chunk's start time.

    Args:
        chunks: List of chunk dictionaries
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code; unknown codes use the "en" rules
        include_speaker: Whether to include speaker information

    Returns:
        SRT formatted string (empty when `chunks` is empty)
    """
    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])
    boundaries = tuple(rules["sentence_boundaries"])

    srt_segments = []
    current_texts = []
    current_words = 0  # running total, so the block is not re-counted per chunk
    current_speaker = None
    current_start_time = None
    current_end_time = None

    # Group chunks into segments
    for chunk in chunks:
        text = chunk["text"]
        start_time = chunk["timestamp"][0]
        end_time = chunk["timestamp"][1]
        if end_time is None:
            # JSON null becomes None, which format_time cannot handle;
            # fall back to a zero-length cue at the chunk's start.
            end_time = start_time
        chunk_words = count_words(text)

        if current_texts:
            # Look at the END of what is already collected: the block is
            # complete once its last non-blank piece closes a sentence.
            last_spoken = next(
                (piece for piece in reversed(current_texts) if piece.strip()),
                "",
            )
            must_break = (
                last_spoken.rstrip().endswith(boundaries)
                or current_words + chunk_words > word_break_threshold
                or (
                    include_speaker
                    and "speaker" in chunk
                    and chunk["speaker"] != current_speaker
                )
            )
            if must_break:
                srt_segments.append(_close_segment(
                    current_texts,
                    current_start_time,
                    current_end_time,
                    current_speaker,
                ))
                current_texts = []
                current_words = 0

        if not current_texts:
            # First chunk of a block fixes its start time and speaker.
            current_start_time = start_time
            current_speaker = chunk.get("speaker")
        current_texts.append(text)
        current_words += chunk_words
        current_end_time = end_time

    # Add final segment
    if current_texts:
        srt_segments.append(_close_segment(
            current_texts,
            current_start_time,
            current_end_time,
            current_speaker,
        ))

    # Format segments as SRT
    srt_lines = []
    for index, segment in enumerate(srt_segments, 1):
        text = segment["text"]
        # Apply speaker marker if needed
        if include_speaker and segment["speaker"]:
            text = format_speaker_change(segment["speaker"]) + text
        srt_lines.append(str(index))
        srt_lines.append(
            f"{format_time(segment['start'])} --> {format_time(segment['end'])}"
        )
        srt_lines.append(text)
        srt_lines.append("")  # Blank line between segments
    return "\n".join(srt_lines).strip()
def convert_transcription(
    json_input: str,
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> Tuple[str, str]:
    """
    Turn a Transcribe JSON document into SRT text for the UI.

    The input is first checked with validate_input; a valid document has
    its chunks rendered by process_chunks_to_srt. Failures are reported
    through the status message instead of being raised.

    Args:
        json_input: JSON input string
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code
        include_speaker: Whether to include speaker information

    Returns:
        Tuple of (srt_output, status_message); srt_output is empty on failure
    """
    ok, payload = validate_input(json_input)
    if not ok:
        return "", "Invalid JSON input: Missing required 'text' or 'chunks' fields"

    try:
        subtitles = process_chunks_to_srt(
            payload["chunks"],
            word_break_threshold,
            language,
            include_speaker
        )
    except Exception as exc:
        # UI boundary: surface the problem in the status box.
        return "", "Error during conversion: " + str(exc)
    else:
        return subtitles, "Conversion successful"
# Create Gradio interface.
# Two-column layout: the left column holds the JSON input, the conversion
# parameters and the convert button; the right column holds the SRT output
# and a read-only status box.
with gr.Blocks() as demo:
    gr.Markdown("# Transcription Format Converter")
    gr.Markdown("Convert Transcribe JSON format to SRT subtitle format with configurable options")
    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
    with gr.Row():
        with gr.Column():
            # Input section
            json_input = gr.Textbox(
                label="Transcribe JSON Input",
                placeholder='{"text": "Full text", "chunks": [{"text": "Segment 1", "timestamp": [0, 2.5]}, ...]}',
                lines=10
            )
            # Parameters (passed to convert_transcription in this order)
            word_break_threshold = gr.Slider(
                minimum=5,
                maximum=20,
                value=10,
                step=1,
                label="Word Break Threshold"
            )
            # Choices mirror the keys of LANGUAGE_RULES
            language = gr.Dropdown(
                choices=["en", "es", "fr"],
                value="en",
                label="Language"
            )
            include_speaker = gr.Checkbox(
                label="Include Speaker Information",
                value=False
            )
            convert_btn = gr.Button("Convert to SRT", variant="primary")
        with gr.Column():
            # Output section
            srt_output = gr.Textbox(
                label="SRT Output",
                lines=15,
                placeholder="SRT formatted subtitles will appear here..."
            )
            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )
    # Examples: each row is [json_input, word_break_threshold, language,
    # include_speaker], matching the `inputs` list below.
    # NOTE(review): cache_examples=True presumably runs `fn` on the examples
    # ahead of time — confirm against the Gradio version in use.
    examples = gr.Examples(
        examples=[
            [
                '{"text": "Hello world. This is a test. How are you today?", "chunks": [{"text": "Hello world.", "timestamp": [0, 1.5]}, {"text": "This is a test.", "timestamp": [1.5, 3.2]}, {"text": "How are you today?", "timestamp": [3.2, 5.0]}]}',
                10,
                "en",
                False
            ],
            [
                '{"text": "Hola mundo. Esto es una prueba. ¿Cómo estás hoy?", "chunks": [{"text": "Hola mundo.", "timestamp": [0, 1.5]}, {"text": "Esto es una prueba.", "timestamp": [1.5, 3.2]}, {"text": "¿Cómo estás hoy?", "timestamp": [3.2, 5.0]}]}',
                10,
                "es",
                False
            ]
        ],
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        fn=convert_transcription,
        cache_examples=True,
        label="Examples"
    )
    # Event listener: the input order matches convert_transcription's
    # parameters and the output order matches its (srt_output,
    # status_message) return tuple.
    # NOTE(review): `api_visibility` is presumably only accepted by recent
    # Gradio releases — confirm against the pinned version.
    convert_btn.click(
        fn=convert_transcription,
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        api_visibility="public"
    )
# Launch with modern theme and styling.
# This runs at import time (there is no __main__ guard), so importing the
# module starts the app.
# NOTE(review): passing `theme`, `css` and `footer_links` to launch()
# presumably requires a Gradio release whose launch() accepts them —
# confirm against the pinned Gradio version.
demo.launch(
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    ).set(
        # Overrides applied on top of the Soft theme
        button_primary_background_fill="*primary_600",
        button_primary_background_fill_hover="*primary_700",
        block_title_text_weight="600",
    ),
    footer_links=[
        {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
        {"label": "Gradio Docs", "url": "https://www.gradio.app/docs"},
        {"label": "GitHub", "url": "https://github.com/gradio-app/gradio"}
    ],
    # Custom CSS: caps the container at 1200px and centres it, and rounds
    # the corners of elements with the .gr-box class.
    css="""
    .gradio-container {
        max-width: 1200px !important;
        margin: 0 auto !important;
    }
    .gr-box {
        border-radius: 8px !important;
    }
    """
)