habulaj committed on
Commit
279f743
·
verified ·
1 Parent(s): 4620568

Create srt_utils.py

Browse files
Files changed (1) hide show
  1. srt_utils.py +205 -0
srt_utils.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def srt_time_to_seconds(timestamp):
    """Convert an SRT timestamp (``HH:MM:SS,mmm``) to seconds.

    A ``.`` is accepted in place of the ``,`` before the milliseconds, so
    ``"00:00:01.250"`` parses the same as ``"00:00:01,250"``.

    Args:
        timestamp: Timestamp string such as ``"00:01:02,500"``.

    Returns:
        The time in seconds as a float, or ``0.0`` when ``timestamp`` is not
        a string or cannot be parsed (best-effort: one malformed timestamp
        must not abort processing of the whole file).
    """
    try:
        time_part, ms_part = timestamp.strip().replace(".", ",").split(",")
        h, m, s = map(int, time_part.split(":"))
        ms = int(ms_part)
    except (AttributeError, TypeError, ValueError):
        # Only parsing failures are swallowed. A bare ``except`` would also
        # hide KeyboardInterrupt and SystemExit.
        return 0.0
    return h * 3600 + m * 60 + s + ms / 1000.0
12
+
13
def seconds_to_srt_time(seconds):
    """Convert a time in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    binary float error (e.g. ``1.001`` would render as ``00:00:01,000``).

    Args:
        seconds: Time in seconds (int or float). Negative values are clamped
            to zero because SRT has no representation for them.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, ms = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
20
+
21
# One cue: an index line, a timing line, then the text lines.
# The text stops at the first empty line, or at a digits-only line that is
# itself followed by a timing line (i.e. the next cue's index). Requiring the
# timing line keeps a purely numeric caption such as "2024" as text instead of
# mistaking it for the next cue's index.
_SRT_CUE_RE = re.compile(
    r"(\d+)\s*\n([^-\n]+?)[ \t]*-->[ \t]*([^-\n]+?)\s*\n"
    r"((?:(?!^\d+\s*\n[^\n]*-->).+\n?)*)",
    re.MULTILINE,
)


def parse_srt(srt_content):
    """Parse SRT text into a list of cue dictionaries.

    Args:
        srt_content: Full text of an SRT file. ``\\r\\n`` and ``\\r`` line
            endings are normalised to ``\\n`` before matching.

    Returns:
        A list with one dict per cue found, in file order, with keys
        ``'start'`` and ``'end'`` (seconds, as produced by
        ``srt_time_to_seconds``) and ``'text'`` (cue text with surrounding
        whitespace removed). Empty or ``None`` input yields ``[]``. No
        further validation of the cues is performed.
    """
    if not srt_content:
        return []

    normalized = srt_content.replace("\r\n", "\n").replace("\r", "\n")
    return [
        {
            'start': srt_time_to_seconds(start.strip()),
            'end': srt_time_to_seconds(end.strip()),
            'text': text.strip(),
        }
        # The cue index is matched only to anchor the pattern; it is not kept.
        for _index, start, end, text in _SRT_CUE_RE.findall(normalized)
    ]
34
+
35
def format_text_lines(text, max_chars=42):
    """Lay out caption text on at most two lines.

    Args:
        text: Caption text; words are separated on any whitespace.
        max_chars: Maximum number of characters allowed on one line.

    Returns:
        ``""`` when ``text`` has no words; ``text`` unchanged when it already
        fits on one line; a single word that is too long is returned alone
        (there is nowhere to break it). Otherwise two lines joined by
        ``"\\n"``: the split where both lines fit ``max_chars`` and are best
        balanced, with a preference for a longer second line. If no split
        makes both lines fit, the words are divided in half by count and the
        player is left to wrap or clip.
    """
    words = text.split()
    if not words:
        return ""

    if len(text) <= max_chars:
        return text

    if len(words) == 1:
        # Splitting "in half" would put zero words on the first line and
        # emit a leading empty line, which ends the cue early in SRT.
        return words[0]

    total_len = sum(map(len, words)) + len(words) - 1
    best_split_idx = -1
    best_balance = float('inf')

    # Running length of " ".join(words[:i]); starting at -1 cancels the
    # separator counted for the first word.
    len1 = -1
    for i in range(1, len(words)):
        len1 += len(words[i - 1]) + 1
        len2 = total_len - len1 - 1
        if len1 > max_chars or len2 > max_chars:
            continue

        balance = abs(len2 - len1)
        if len2 >= len1:
            # Bonus for a bottom-heavy layout.
            balance -= 5
        if balance < best_balance:
            best_balance = balance
            best_split_idx = i

    if best_split_idx == -1:
        # No split satisfies the limit (a very long word, or more text than
        # two lines can hold): fall back to halving by word count.
        best_split_idx = len(words) // 2

    line1 = " ".join(words[:best_split_idx])
    line2 = " ".join(words[best_split_idx:])
    return f"{line1}\n{line2}"
80
+
81
def fix_word_timing(words):
    """Make word timings sequential (no overlaps) while keeping word order.

    Each word keeps its own duration (raised to at least 0.01 s). A word
    that would start before the previous word ends is shifted to start at
    that end; a word that starts later (a silence) keeps its start.

    Args:
        words: List of dicts with at least ``'start'`` and ``'end'`` in
            seconds. The input dicts are not modified.

    Returns:
        A new list of adjusted copies of the input dicts, in the same order.
        Empty input yields ``[]``.
    """
    if not words:
        return []

    fixed_words = []
    last_end = 0.0

    for word in words:
        # Guard against zero or negative durations in the source timings.
        duration = max(word['end'] - word['start'], 0.01)
        start = max(word['start'], last_end)
        end = start + duration

        # Copy rather than assign into ``word`` so the caller's data is left
        # untouched.
        fixed_words.append({**word, 'start': start, 'end': end})
        last_end = end

    return fixed_words
113
+
114
def apply_netflix_style_filter(srt_content, *, max_chars_per_line=42,
                               max_duration=7.0, min_gap_for_split=0.5):
    """Group word-level subtitles into Netflix-style phrases.

    Cues are parsed with ``parse_srt``, made sequential with
    ``fix_word_timing``, and merged in order. A new phrase is started when:

    - the silence before the next word exceeds ``min_gap_for_split``;
    - adding the word would exceed two lines of ``max_chars_per_line``;
    - adding the word would make the phrase longer than ``max_duration``;
    - the previous word ends a sentence (``.``, ``!`` or ``?``) and the
      merged text would no longer fit on a single line.

    Each phrase is laid out with ``format_text_lines`` and renumbered from 1.

    Args:
        srt_content: SRT text, typically one word per cue.
        max_chars_per_line: Maximum characters per caption line.
        max_duration: Maximum phrase duration in seconds.
        min_gap_for_split: Silence, in seconds, that forces a new phrase.

    Returns:
        The regrouped SRT text, or ``srt_content`` unchanged when it
        contains no cue with text.
    """
    # A cue without text would produce an empty caption body, so skip it.
    words = [w for w in parse_srt(srt_content) if w['text']]
    if not words:
        return srt_content

    # Fix timing issues before grouping so gaps and durations are meaningful.
    words = fix_word_timing(words)

    max_lines = 2  # format_text_lines lays text out on at most two lines
    max_total_chars = max_chars_per_line * max_lines
    sentence_end = re.compile(r'[.!?]$')

    grouped_events = []
    current_group = []
    current_text = ""

    for word in words:
        if not current_group:
            current_group = [word]
            current_text = word['text']
            continue

        last_word = current_group[-1]
        projected_text = current_text + " " + word['text']
        projected_duration = word['end'] - current_group[0]['start']

        starts_new_group = (
            # 1. Silence between the words.
            word['start'] - last_word['end'] > min_gap_for_split
            # 2. Length and duration limits.
            or len(projected_text) > max_total_chars
            or projected_duration > max_duration
            # 3. Sentence end: merge only while everything fits on one line
            #    (e.g. "Yes. I do."); otherwise prefer the sentence break.
            or (
                sentence_end.search(last_word['text']) is not None
                and len(projected_text) > max_chars_per_line
            )
        )

        if starts_new_group:
            grouped_events.append(current_group)
            current_group = [word]
            current_text = word['text']
        else:
            current_group.append(word)
            current_text = projected_text

    grouped_events.append(current_group)

    # Generate the output SRT.
    cues = []
    for index, group in enumerate(grouped_events, 1):
        start_time = seconds_to_srt_time(group[0]['start'])
        end_time = seconds_to_srt_time(group[-1]['end'])
        text = " ".join(w['text'] for w in group)
        formatted_text = format_text_lines(text, max_chars_per_line)
        cues.append(f"{index}\n{start_time} --> {end_time}\n{formatted_text}")

    return "\n\n".join(cues)