Create srt_utils.py
Browse files- srt_utils.py +205 -0
srt_utils.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def srt_time_to_seconds(timestamp):
    """Convert an SRT timestamp (``HH:MM:SS,mmm``) to seconds.

    Parsing is best-effort: a value that cannot be parsed yields ``0.0``
    instead of raising.

    Args:
        timestamp: Timestamp string such as ``"00:01:02,500"``.

    Returns:
        The timestamp in seconds as a float, or ``0.0`` when ``timestamp``
        is not a string in the expected format.
    """
    try:
        time_part, ms_part = timestamp.split(",")
        h, m, s = map(int, time_part.split(":"))
        ms = int(ms_part)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the input is not a text string.
        # ValueError: wrong number of fields or a non-numeric field.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not caught.
        return 0.0
    return h * 3600 + m * 60 + s + ms / 1000.0
|
| 12 |
+
|
| 13 |
+
def seconds_to_srt_time(seconds):
    """Convert a time in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest millisecond before it is split into
    fields, so binary floating-point error cannot drop a millisecond (for
    example ``1.001`` formats as ``00:00:01,001``) and a value that rounds up
    to a full second carries into the seconds/minutes/hours fields.

    Args:
        seconds: Time offset in seconds (int or float). Negative values are
            clamped to zero because SRT has no negative timestamps.

    Returns:
        The formatted timestamp string.
    """
    # Work in whole milliseconds; the fields are then exact integer divisions.
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, ms = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_mins, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
|
| 20 |
+
|
| 21 |
+
def parse_srt(srt_content):
    """Parse SRT text into a list of cue dictionaries.

    Every cue matched by the pattern becomes a dict with the keys
    ``'start'`` and ``'end'`` (converted by ``srt_time_to_seconds``) and
    ``'text'`` (the cue text with surrounding whitespace stripped). The cue
    index number is matched but not kept. Text that the pattern does not
    match is skipped; no further validation is performed here.
    """
    cue_re = re.compile(
        r"(\d+)\s*\n([^-\n]+?) --> ([^-\n]+?)\s*\n((?:(?!^\d+\s*\n).+\n?)*)",
        re.MULTILINE,
    )
    return [
        {
            'start': srt_time_to_seconds(cue.group(2).strip()),
            'end': srt_time_to_seconds(cue.group(3).strip()),
            'text': cue.group(4).strip(),
        }
        for cue in cue_re.finditer(srt_content)
    ]
|
| 34 |
+
|
| 35 |
+
def format_text_lines(text, max_chars=42):
    """Format text into at most two lines.

    Text that already fits within ``max_chars`` is returned unchanged.
    Otherwise the text is broken at the word boundary that gives the most
    balanced pair of lines, both within ``max_chars``, with a preference for
    a second line that is at least as long as the first.

    Args:
        text: The subtitle text to lay out.
        max_chars: Maximum number of characters allowed per line.

    Returns:
        ``""`` for blank input, the text on one line when it fits (or when it
        is a single word that cannot be broken), otherwise two lines joined
        by ``"\\n"``. When no break keeps both lines within ``max_chars`` the
        words are simply split in half and the lines may exceed the limit.
    """
    words = text.split()
    if not words:
        return ""

    # Fits on one line: nothing to do.
    if len(text) <= max_chars:
        return text

    # A single over-long word has no space to break at. Return it as is;
    # the half-split fallback below would otherwise emit an empty first line.
    if len(words) == 1:
        return words[0]

    best_split_idx = -1
    best_balance = float('inf')

    # Try every word boundary as the line break.
    for i in range(1, len(words)):
        len1 = len(" ".join(words[:i]))
        len2 = len(" ".join(words[i:]))

        # Only breaks where both lines respect the limit are candidates.
        if len1 <= max_chars and len2 <= max_chars:
            balance = abs(len2 - len1)
            # Bonus for a bottom-heavy layout (second line >= first line).
            if len2 >= len1:
                balance -= 5

            # Strict comparison keeps the earliest break on ties.
            if balance < best_balance:
                best_balance = balance
                best_split_idx = i

    if best_split_idx == -1:
        # No break satisfies the limit (e.g. the text is longer than two
        # full lines): split by word count and let the player wrap or clip.
        best_split_idx = len(words) // 2

    line1 = " ".join(words[:best_split_idx])
    line2 = " ".join(words[best_split_idx:])
    return f"{line1}\n{line2}"
|
| 80 |
+
|
| 81 |
+
def fix_word_timing(words):
    """
    Make word timings strictly sequential while keeping the input order.

    Each word keeps its own duration (raised to at least 0.01 s). A word that
    starts before the previous word has ended is pushed forward to begin
    where that word ends; a word that starts later keeps its start, so
    silences are preserved. The word dictionaries are updated in place and
    the same objects are returned in a new list.
    """
    if not words:
        return []

    sequenced = []
    cursor = 0.0  # End time of the most recently placed word.

    for entry in words:
        # Duration is taken from the original timing, with a small floor.
        span = max(entry['end'] - entry['start'], 0.01)
        # Never start before the previous word has finished.
        begin = max(entry['start'], cursor)
        cursor = begin + span

        entry['start'] = begin
        entry['end'] = cursor
        sequenced.append(entry)

    return sequenced
|
| 113 |
+
|
| 114 |
+
def apply_netflix_style_filter(srt_content, *, max_chars_per_line=42,
                               max_duration=7.0, min_gap_for_split=0.5):
    """
    Group word-level subtitles into Netflix-style phrases.

    The input is parsed with ``parse_srt``, the timings are made sequential
    with ``fix_word_timing``, and consecutive words are merged into one cue
    until one of the following forces a new cue:

    - the silence before the next word exceeds ``min_gap_for_split``;
    - adding the word would exceed two lines of ``max_chars_per_line``
      characters, or make the cue longer than ``max_duration`` seconds;
    - the previous word ends a sentence (``.``, ``!`` or ``?``) and the
      merged text would no longer fit on a single line.

    Each cue's text is laid out with ``format_text_lines``.

    Args:
        srt_content: SRT text, typically with one word per cue.
        max_chars_per_line: Maximum characters per subtitle line.
        max_duration: Maximum cue duration in seconds.
        min_gap_for_split: Silence, in seconds, that always starts a new cue.

    Returns:
        The regrouped SRT text, or ``srt_content`` unchanged when no cue
        could be parsed from it.
    """
    words = parse_srt(srt_content)
    if not words:
        return srt_content

    # Fix timing issues first so gaps and durations below are meaningful.
    words = fix_word_timing(words)

    # format_text_lines lays text out on two lines, so the line count is fixed.
    max_total_chars = max_chars_per_line * 2

    def get_group_text(group):
        return " ".join(w['text'] for w in group)

    grouped_events = []
    current_group = []

    for word in words:
        if not current_group:
            current_group.append(word)
            continue

        last_word = current_group[-1]

        # 1. Silence: a long enough gap always starts a new cue.
        if word['start'] - last_word['end'] > min_gap_for_split:
            grouped_events.append(current_group)
            current_group = [word]
            continue

        # 2. Limits: projected text length and duration if the word is added.
        new_text_proj = get_group_text(current_group) + " " + word['text']
        new_duration_proj = word['end'] - current_group[0]['start']
        if (len(new_text_proj) > max_total_chars
                or new_duration_proj > max_duration):
            grouped_events.append(current_group)
            current_group = [word]
            continue

        # 3. Sentence endings: merge across a sentence break only while the
        #    result still fits on one line (e.g. "Yes. I do.").
        if (re.search(r'[.!?]$', last_word['text'])
                and len(new_text_proj) > max_chars_per_line):
            grouped_events.append(current_group)
            current_group = [word]
            continue

        current_group.append(word)

    if current_group:
        grouped_events.append(current_group)

    # Render the cues; groups are never empty, so numbering is contiguous.
    cues = []
    for index, group in enumerate(grouped_events, 1):
        start_time = seconds_to_srt_time(group[0]['start'])
        end_time = seconds_to_srt_time(group[-1]['end'])
        formatted_text = format_text_lines(
            get_group_text(group), max_chars_per_line
        )
        cues.append(f"{index}\n{start_time} --> {end_time}\n{formatted_text}")

    return "\n\n".join(cues).strip()
|