# Uploaded by eubottura via huggingface_hub — commit c2c1bfd (verified)
import gradio as gr
import json
import re
from collections import Counter
from datetime import timedelta
from typing import List, Dict, Any, Optional, Tuple
# Language-specific segmentation rules, keyed by ISO 639-1 code.
#   trigger_words:       discourse connectors that suggest a line break
#   forbidden_endings:   words a subtitle line should not end on (articles,
#                        conjunctions, prepositions); only membership is
#                        tested, so order is irrelevant
#   sentence_boundaries: terminal punctuation marks
LANGUAGE_RULES = {
    "en": {
        "trigger_words": ["however", "but", "therefore", "meanwhile", "nevertheless"],
        "forbidden_endings": ["a", "an", "the", "and", "but", "or", "for", "nor", "on", "at", "to", "from", "by", "of", "in", "with"],
        "sentence_boundaries": [".", "?", "!"],
    },
    "es": {
        "trigger_words": ["sin embargo", "pero", "por lo tanto", "mientras tanto", "no obstante"],
        # duplicates removed ("a", "de", "por" each appeared twice)
        "forbidden_endings": ["el", "la", "los", "las", "y", "o", "para", "por", "de", "en", "con", "a"],
        "sentence_boundaries": [".", "?", "!"],
    },
    "fr": {
        "trigger_words": ["cependant", "mais", "donc", "pendant ce temps", "néanmoins"],
        # duplicates removed ("de", "par" each appeared twice)
        "forbidden_endings": ["le", "la", "les", "et", "ou", "pour", "par", "de", "en", "avec", "à"],
        "sentence_boundaries": [".", "?", "!"],
    },
}
def validate_input(json_input: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
    """
    Validate the input JSON structure.

    The payload must be a JSON object containing a "text" field and a
    non-empty "chunks" list.

    Args:
        json_input: JSON string to validate

    Returns:
        Tuple of (is_valid, parsed_data) where parsed_data is None if invalid
    """
    try:
        data = json.loads(json_input)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string input (e.g. None from the UI);
        # the original only caught JSONDecodeError and would crash.
        return False, None
    if not isinstance(data, dict):
        return False, None
    if "text" not in data or "chunks" not in data:
        return False, None
    chunks = data["chunks"]
    if not isinstance(chunks, list) or not chunks:
        return False, None
    return True, data
def format_time(seconds: float) -> str:
    """
    Convert seconds to SRT time format (HH:MM:SS,mmm).

    Works from total milliseconds so durations of 24 hours or more are
    correct; the original used ``timedelta.seconds``, which silently
    discards whole days (90000 s formatted as "01:00:00,000").

    Args:
        seconds: Non-negative time in seconds

    Returns:
        Formatted time string, e.g. "01:01:01,500"
    """
    total_ms = int(round(seconds * 1000))  # round to nearest millisecond
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def count_words(text: str) -> int:
    """
    Count whitespace-separated words in text.

    Args:
        text: Text whose words are counted

    Returns:
        Number of whitespace-delimited tokens (punctuation attached to
        a word counts as part of that word)
    """
    return sum(1 for _ in text.split())
def get_majority_speaker(chunks: List[Dict[str, Any]]) -> Optional[str]:
    """
    Determine the dominant speaker across chunks.

    Speakers are weighted by the number of words they spoke, so many
    short chunks can outweigh a single long one.

    Args:
        chunks: List of chunk dictionaries ("speaker" key optional)

    Returns:
        Speaker ID with the largest word total, or None when no chunk
        carries speaker information
    """
    word_totals = Counter()
    for entry in chunks:
        if "speaker" not in entry:
            continue
        word_totals[entry["speaker"]] += count_words(entry["text"])
    if not word_totals:
        return None
    return word_totals.most_common(1)[0][0]
def should_break_line(line: str, language: str, word_break_threshold: int) -> bool:
    """
    Determine whether a subtitle line should break, based on length and
    language-specific rules.

    A break is requested when any of the following holds:
      * the line exceeds ``word_break_threshold`` words,
      * the line exceeds 11 characters excluding whitespace,
      * the line contains a trigger word (whole-word, case-insensitive),
      * the line ends on a word that must not terminate a line.

    Args:
        line: Text line to check
        language: ISO language code (unknown codes fall back to English)
        word_break_threshold: Maximum words per line

    Returns:
        True if the line should break
    """
    # Word-count limit.
    if count_words(line) > word_break_threshold:
        return True
    # Character limit (11 chars excluding whitespace).
    if len(re.sub(r'\s+', '', line)) > 11:
        return True
    rules = LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])
    # Trigger words must match as whole words: the original substring
    # test fired on e.g. "but" inside "butter".
    for trigger in rules["trigger_words"]:
        if re.search(r'\b' + re.escape(trigger) + r'\b', line, re.IGNORECASE):
            return True
    # Never end a line on an article/conjunction/preposition.
    words = line.split()
    if words and words[-1].lower() in rules["forbidden_endings"]:
        return True
    return False
def format_speaker_change(speaker_id: str) -> str:
    """
    Render a speaker identifier as an SRT prefix marker.

    Args:
        speaker_id: Speaker identifier

    Returns:
        The identifier wrapped in brackets followed by one space,
        e.g. "SPEAKER_00" -> "[SPEAKER_00] "
    """
    return "[" + speaker_id + "] "
def process_chunks_to_srt(
    chunks: List[Dict[str, Any]],
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> str:
    """
    Convert transcription chunks to SRT format.

    Chunks are merged into subtitle segments; a new segment starts when
    the accumulated text ends a sentence, when adding the next chunk
    would exceed the word threshold, or when the speaker changes.

    Args:
        chunks: List of chunk dictionaries, each with "text" and a
            (start, end) "timestamp" pair; "speaker" is optional
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code (unknown codes fall back to English)
        include_speaker: Whether to include speaker information

    Returns:
        SRT formatted string
    """
    boundaries = tuple(
        LANGUAGE_RULES.get(language, LANGUAGE_RULES["en"])["sentence_boundaries"]
    )
    segments: List[Dict[str, Any]] = []
    current_texts: List[str] = []
    current_speaker = None
    current_start = None
    current_end = None

    def _flush() -> None:
        # Close the accumulated texts into one finished segment.
        segments.append({
            "start": current_start,
            "end": current_end,
            "text": " ".join(current_texts).strip(),
            "speaker": current_speaker,
        })

    for chunk in chunks:
        text = chunk["text"]
        start_time, end_time = chunk["timestamp"]
        if not current_texts:
            # First chunk of a fresh segment.
            current_texts = [text]
            current_start = start_time
            current_end = end_time
            current_speaker = chunk.get("speaker")
            continue
        # Break when the segment so far ends a sentence.  BUGFIX: the
        # original tested the *first* character of the incoming chunk
        # against the boundary set, which is almost never punctuation,
        # so sentence breaks effectively never happened.
        should_break = current_texts[-1].rstrip().endswith(boundaries)
        # Break when adding this chunk would exceed the word limit.
        accumulated = sum(count_words(t) for t in current_texts)
        if accumulated + count_words(text) > word_break_threshold:
            should_break = True
        # Break on a speaker change (only when speaker output is on).
        if include_speaker and "speaker" in chunk and chunk["speaker"] != current_speaker:
            should_break = True
        if should_break:
            _flush()
            current_texts = [text]
            current_start = start_time
            current_end = end_time
            current_speaker = chunk.get("speaker")
        else:
            current_texts.append(text)
            current_end = end_time
    # Final open segment, if any.
    if current_texts:
        _flush()

    # Render segments as numbered SRT blocks.
    srt_lines: List[str] = []
    for index, segment in enumerate(segments, 1):
        text = segment["text"]
        if include_speaker and segment["speaker"]:
            text = format_speaker_change(segment["speaker"]) + text
        srt_lines.append(str(index))
        srt_lines.append(f"{format_time(segment['start'])} --> {format_time(segment['end'])}")
        srt_lines.append(text)
        srt_lines.append("")  # blank line between segments
    return "\n".join(srt_lines).strip()
def convert_transcription(
    json_input: str,
    word_break_threshold: int,
    language: str,
    include_speaker: bool
) -> Tuple[str, str]:
    """
    Main conversion function from Transcribe JSON to SRT.

    Args:
        json_input: JSON input string
        word_break_threshold: Maximum words per subtitle block
        language: ISO language code
        include_speaker: Whether to include speaker information

    Returns:
        Tuple of (srt_output, status_message); srt_output is empty on
        failure and status_message explains the outcome.
    """
    # Reject structurally invalid payloads up front.
    is_valid, data = validate_input(json_input)
    if not is_valid:
        return "", "Invalid JSON input: Missing required 'text' or 'chunks' fields"
    # Any conversion failure is reported as a status string rather
    # than propagated to the UI.
    try:
        srt_output = process_chunks_to_srt(
            data["chunks"], word_break_threshold, language, include_speaker
        )
    except Exception as e:
        return "", f"Error during conversion: {str(e)}"
    return srt_output, "Conversion successful"
# --- Gradio interface -------------------------------------------------
# BUGFIX: theme and css are gr.Blocks() constructor arguments, not
# launch() arguments, and launch() has no "footer_links" parameter —
# the original launch() call would raise TypeError at startup.
_app_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)
_APP_CSS = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.gr-box {
    border-radius: 8px !important;
}
"""
with gr.Blocks(theme=_app_theme, css=_APP_CSS) as demo:
    gr.Markdown("# Transcription Format Converter")
    gr.Markdown("Convert Transcribe JSON format to SRT subtitle format with configurable options")
    gr.Markdown("Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")
    with gr.Row():
        with gr.Column():
            # Input section
            json_input = gr.Textbox(
                label="Transcribe JSON Input",
                placeholder='{"text": "Full text", "chunks": [{"text": "Segment 1", "timestamp": [0, 2.5]}, ...]}',
                lines=10
            )
            # Conversion parameters
            word_break_threshold = gr.Slider(
                minimum=5,
                maximum=20,
                value=10,
                step=1,
                label="Word Break Threshold"
            )
            language = gr.Dropdown(
                choices=["en", "es", "fr"],
                value="en",
                label="Language"
            )
            include_speaker = gr.Checkbox(
                label="Include Speaker Information",
                value=False
            )
            convert_btn = gr.Button("Convert to SRT", variant="primary")
        with gr.Column():
            # Output section
            srt_output = gr.Textbox(
                label="SRT Output",
                lines=15,
                placeholder="SRT formatted subtitles will appear here..."
            )
            status_message = gr.Textbox(
                label="Status",
                interactive=False
            )
    # Cached examples run through convert_transcription at build time.
    examples = gr.Examples(
        examples=[
            [
                '{"text": "Hello world. This is a test. How are you today?", "chunks": [{"text": "Hello world.", "timestamp": [0, 1.5]}, {"text": "This is a test.", "timestamp": [1.5, 3.2]}, {"text": "How are you today?", "timestamp": [3.2, 5.0]}]}',
                10,
                "en",
                False
            ],
            [
                '{"text": "Hola mundo. Esto es una prueba. ¿Cómo estás hoy?", "chunks": [{"text": "Hola mundo.", "timestamp": [0, 1.5]}, {"text": "Esto es una prueba.", "timestamp": [1.5, 3.2]}, {"text": "¿Cómo estás hoy?", "timestamp": [3.2, 5.0]}]}',
                10,
                "es",
                False
            ]
        ],
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        fn=convert_transcription,
        cache_examples=True,
        label="Examples"
    )
    # Event listener.  BUGFIX: "api_visibility" is not a click()
    # parameter; api_name exposes the endpoint on the API page instead.
    convert_btn.click(
        fn=convert_transcription,
        inputs=[json_input, word_break_threshold, language, include_speaker],
        outputs=[srt_output, status_message],
        api_name="convert_transcription"
    )
    # Footer links preserved from the original (invalid) footer_links arg.
    gr.Markdown(
        "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder) · "
        "[Gradio Docs](https://www.gradio.app/docs) · "
        "[GitHub](https://github.com/gradio-app/gradio)"
    )
demo.launch()