shortsrender / video /caption.py
sam12345324's picture
Upload 26 files
0c8f7e3 verified
import string
from typing import List, Dict, Tuple
from loguru import logger
from typing import Dict, List
class Caption:
    """Split timed captions into subtitle segments and render them as an ASS file.

    A caption is a dict with ``text``, ``start_ts`` and ``end_ts`` (seconds).
    A segment is a dict with ``text`` (a list of ``lines`` strings, padded
    with empty strings), ``start_ts`` and ``end_ts``.
    """

    # Gap, in seconds, kept between two consecutive segments.
    _SEGMENT_GAP = 0.05
    # On-screen time, in seconds, aimed for when one caption is split up.
    _MIN_SEGMENT_DURATION = 0.5
    # Vertical anchor of the text as a fraction of the frame height.
    _POSITION_FROM_TOP = {"center": 0.45, "bottom": 0.75}
    _DEFAULT_POSITION_FROM_TOP = 0.2

    def is_punctuation(self, text):
        """Return True if ``text`` is non-empty and made only of ASCII punctuation.

        Every character is checked individually, so runs such as ``"..."`` or
        ``"?!"`` count as punctuation while the empty string does not.
        """
        return bool(text) and all(char in string.punctuation for char in text)

    def _resolve_overlaps(self, segments):
        """Pull back, in place, the end of any segment that reaches its successor."""
        for current, following in zip(segments, segments[1:]):
            if current["end_ts"] >= following["start_ts"]:
                # Never let the adjusted end fall before the segment's own start.
                current["end_ts"] = max(
                    current["start_ts"],
                    following["start_ts"] - self._SEGMENT_GAP,
                )

    def create_subtitle_segments_english(
        self, captions: List[Dict], max_length=80, lines=2
    ):
        """
        Group word-level captions into segments of ``lines`` lines.

        Words are joined with single spaces; a line is closed as soon as the
        next word (including its joining space) would exceed ``max_length``.
        Punctuation-only captions are glued to the preceding word without a
        space and may therefore push a line slightly past ``max_length``.
        Punctuation or empty captions arriving before any word are dropped.

        Args:
            captions: List of caption dictionaries with text, start_ts, and end_ts
            max_length: Maximum number of characters per line
            lines: Number of lines per segment

        Returns:
            List of subtitle segments. Each segment starts at its first word
            (delayed by a small gap when it follows another segment) and ends
            with the last caption it contains.

        Raises:
            ValueError: If ``lines`` is smaller than 1.
        """
        if not captions:
            return []
        if lines < 1:
            raise ValueError("lines must be at least 1")

        segments = []
        line_texts = [""] * lines
        current_line = 0
        segment_start_ts = captions[0]["start_ts"]
        segment_end_ts = captions[0]["end_ts"]

        for caption in captions:
            text = caption["text"]
            start_ts = caption["start_ts"]
            end_ts = caption["end_ts"]

            # Punctuation (and empty captions) never start a line of their own.
            if not text or self.is_punctuation(text):
                if line_texts[current_line]:
                    line_texts[current_line] += text
                    segment_end_ts = end_ts
                continue

            # Wrap only when the line already holds text, so a single over-long
            # word stays on its line instead of leaving an empty line behind.
            line = line_texts[current_line]
            if line and len(line) + 1 + len(text) > max_length:
                current_line += 1

            # All lines are full: emit the segment before touching its end
            # timestamp, so it ends with its own last word.
            if current_line >= lines:
                segments.append(
                    {
                        "text": line_texts,
                        "start_ts": segment_start_ts,
                        "end_ts": segment_end_ts,
                    }
                )
                line_texts = [""] * lines
                current_line = 0
                segment_start_ts = start_ts + self._SEGMENT_GAP

            if line_texts[current_line]:
                line_texts[current_line] += " "
            line_texts[current_line] += text
            segment_end_ts = end_ts

        # Emit whatever is left in the last, partially filled segment.
        if any(line_texts):
            segments.append(
                {
                    "text": line_texts,
                    "start_ts": segment_start_ts,
                    "end_ts": segment_end_ts,
                }
            )

        self._resolve_overlaps(segments)
        return segments

    @staticmethod
    def _contains_cjk(text):
        """Return True if ``text`` holds CJK ideographs or Japanese kana."""
        return any(
            "\u4e00" <= char <= "\u9fff"  # CJK Unified Ideographs
            or "\u3040" <= char <= "\u30ff"  # Hiragana and Katakana
            for char in text
        )

    @staticmethod
    def _split_by_chars(text, max_length):
        """Cut ``text`` into consecutive chunks of at most ``max_length`` characters."""
        step = max(1, max_length)
        chunks = (text[i : i + step].strip() for i in range(0, len(text), step))
        return [chunk for chunk in chunks if chunk]

    def _split_by_words(self, text, max_length):
        """Greedily pack whitespace-separated words into parts of ``max_length``."""
        parts = []
        current = ""
        for word in text.split():
            if current and len(current) + 1 + len(word) > max_length:
                parts.append(current)
                current = ""
            if len(word) > max_length:
                # A single word that cannot fit on any line is hard-wrapped.
                pieces = self._split_by_chars(word, max_length)
                parts.extend(pieces[:-1])
                current = pieces[-1]
                continue
            current = f"{current} {word}" if current else word
        if current:
            parts.append(current)
        return parts

    def _segment_durations(self, group_chars, duration):
        """Share ``duration`` between groups in proportion to their character count.

        Each share is raised to the minimum segment duration; if that makes
        the shares exceed the caption's own duration (or the minimum duration,
        for very short captions) they are scaled back so the caption does not
        run into the following one.
        """
        total_chars = sum(group_chars)
        durations = [chars / total_chars * duration for chars in group_chars]
        if min(durations) < self._MIN_SEGMENT_DURATION:
            durations = [max(d, self._MIN_SEGMENT_DURATION) for d in durations]
            budget = max(duration, self._MIN_SEGMENT_DURATION)
            allocated = sum(durations)
            if allocated > budget:
                durations = [d * budget / allocated for d in durations]
        return durations

    def create_subtitle_segments_international(
        self, captions: List[Dict], max_length=80, lines=2
    ):
        """
        Breaks up international captions (full sentences) into smaller segments that fit
        within max_length characters per line, with proper timing distribution.

        Text containing CJK ideographs or Japanese kana is split by character;
        any other text is split on whitespace, hard-wrapping single words that
        are longer than ``max_length``. Captions with empty text produce no
        segment.

        Args:
            captions: List of caption dictionaries with text, start_ts, and end_ts
            max_length: Maximum number of characters per line
            lines: Number of lines per segment

        Returns:
            List of subtitle segments

        Raises:
            ValueError: If ``lines`` is smaller than 1.
        """
        if not captions:
            return []
        if lines < 1:
            raise ValueError("lines must be at least 1")

        segments = []
        for caption in captions:
            text = caption["text"].strip()
            start_ts = caption["start_ts"]
            duration = caption["end_ts"] - start_ts

            if self._contains_cjk(text):
                parts = self._split_by_chars(text, max_length)
            else:
                parts = self._split_by_words(text, max_length)
            if not parts:
                continue

            # Each segment shows up to ``lines`` consecutive parts.
            groups = [parts[i : i + lines] for i in range(0, len(parts), lines)]
            group_chars = [sum(len(part) for part in group) for group in groups]
            durations = self._segment_durations(group_chars, duration)

            current_time = start_ts
            for group, segment_duration in zip(groups, durations):
                segment_end = current_time + segment_duration
                segments.append(
                    {
                        # Pad so the renderer always receives ``lines`` entries.
                        "text": group + [""] * (lines - len(group)),
                        "start_ts": current_time,
                        "end_ts": segment_end,
                    }
                )
                current_time = segment_end

        self._resolve_overlaps(segments)
        return segments

    @staticmethod
    def hex_to_ass(hex_color: str, alpha: float = 1.0) -> str:
        """
        Convert a hex color + opacity to the ASS ``&HAABBGGRR`` form.

        :param hex_color: CSS-style color string, e.g. "#FFA07A" or "00ff00";
            the 3-digit shorthand ("f0a") is accepted too
        :param alpha: opacity from 0.0 (fully transparent) to 1.0 (opaque);
            values outside that range are clamped
        :return: ASS color string without a trailing ``&``, e.g.
            ``hex_to_ass("#FFA07A", 0.5)`` gives ``"&H7F7AA0FF"``
        :raises ValueError: if ``hex_color`` is not 3 or 6 hex digits
        """
        hex_color = hex_color.lstrip("#")
        # Expand 3-digit shorthand like 'f0a' to 'ff00aa'.
        if len(hex_color) == 3:
            hex_color = "".join(c * 2 for c in hex_color)
        if len(hex_color) != 6:
            raise ValueError("hex_color must be in 'RRGGBB' or 'RGB' format")

        r = int(hex_color[0:2], 16)
        g = int(hex_color[2:4], 16)
        b = int(hex_color[4:6], 16)

        # The ASS alpha byte counts transparency (00 = opaque, FF = invisible),
        # so the opacity passed in is inverted.
        a = max(0, min(255, int((1.0 - alpha) * 255)))

        return f"&H{a:02X}{b:02X}{g:02X}{r:02X}"

    def create_subtitle(
        self,
        segments,
        dimensions: Tuple[int, int],
        output_path: str,
        font_size=24,
        font_color="#fff",
        shadow_color="#000",
        shadow_transparency=0.1,
        shadow_blur=0,
        stroke_color="#000",
        stroke_size=0,
        font_name="Arial",
        font_bold=True,
        font_italic=False,
        subtitle_position="center",
    ):
        """
        Write ``segments`` to ``output_path`` as an ASS subtitle file.

        Every non-empty segment becomes a Dialogue event placed with ``\\pos``.
        When a shadow is requested, a second event carrying the same text is
        written just before it, offset by 2 px and tinted with the shadow color.

        Args:
            segments: Segments as produced by the ``create_subtitle_segments_*`` methods
            dimensions: (width, height) used as PlayResX / PlayResY
            output_path: Destination file, written as UTF-8
            font_size: Font size of the Default style
            font_color: Hex color of the text
            shadow_color: Hex color of the drop shadow
            shadow_transparency: 0.0 for an opaque shadow up to 1.0 for an
                invisible one; the shadow is skipped at 1.0 unless
                ``shadow_blur`` is set
            shadow_blur: Strength passed to the ``\\blur`` tag; 0 disables it
            stroke_color: Hex color of the outline
            stroke_size: Outline width of the Default style
            font_name: Font family of the Default style
            font_bold: Whether the text is bold
            font_italic: Whether the text is italic
            subtitle_position: "center" or "bottom"; any other value anchors
                the text near the top
        """
        width, height = dimensions
        bold_value = -1 if font_bold else 0
        italic_value = -1 if font_italic else 0
        position_from_top = self._POSITION_FROM_TOP.get(
            subtitle_position, self._DEFAULT_POSITION_FROM_TOP
        )
        pos_x = int(width / 2)
        pos_y = int(height * position_from_top)

        font_color_ass = self.hex_to_ass(font_color)
        stroke_color_ass = self.hex_to_ass(stroke_color)
        output_lines = [
            "[Script Info]",
            "ScriptType: v4.00+",
            f"PlayResX: {width}",
            f"PlayResY: {height}",
            "[V4+ Styles]",
            "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
            f"Style: Default,{font_name},{font_size},{font_color_ass},&H000000FF,{stroke_color_ass},&H00000000,{bold_value},{italic_value},0,0,100,100,0,0,1,{stroke_size},0,8,20,20,20,1",
            "[Events]",
            "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
        ]

        # The shadow tags do not depend on the segment, so build them once.
        shadow_tags = None
        if shadow_blur > 0 or shadow_transparency < 1.0:
            # \1c takes only the BBGGRR color; transparency travels in \1a.
            shadow_bbggrr = self.hex_to_ass(shadow_color)[4:]
            shadow_tags = (
                f"\\pos({pos_x + 2},{pos_y + 2})\\1c&H{shadow_bbggrr}&\\bord0"
            )
            if shadow_transparency > 0:
                # ASS alpha grows with transparency: 00 = opaque, FF = invisible.
                shadow_alpha = max(0, min(255, int(shadow_transparency * 255)))
                shadow_tags += f"\\1a&H{shadow_alpha:02X}&"
            if shadow_blur > 0:
                shadow_tags += f"\\blur{shadow_blur}"
        main_tags = f"\\pos({pos_x},{pos_y})"

        for segment in segments:
            # Join only the lines that carry text so no blank line is rendered.
            text = "\\N".join(line for line in segment["text"] if line)
            if not text:
                continue
            start_time = self.format_time(segment["start_ts"])
            end_time = self.format_time(segment["end_ts"])
            event_prefix = f"Dialogue: 0,{start_time},{end_time},Default,,0,0,0,,"
            # The shadow event goes first so the main text is drawn over it.
            if shadow_tags is not None:
                output_lines.append(event_prefix + "{" + shadow_tags + "}" + text)
            output_lines.append(event_prefix + "{" + main_tags + "}" + text)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(output_lines) + "\n")
        logger.debug("subtitle (ass) was created with drop shadow")

    def format_time(self, seconds):
        """
        Convert seconds to ASS time format (H:MM:SS.cc)

        The value is rounded to the nearest centisecond (so 1.15 gives ".15",
        not ".14") and negative values are clamped to zero.
        """
        total_centisecs = max(0, int(round(seconds * 100)))
        total_secs, centisecs = divmod(total_centisecs, 100)
        total_minutes, secs = divmod(total_secs, 60)
        hours, minutes = divmod(total_minutes, 60)
        return f"{hours}:{minutes:02d}:{secs:02d}.{centisecs:02d}"