Spaces:

habulaj
/

subapi

Running

App Files Files Community

habulaj committed 27 days ago

Commit

279f743

verified ·

1 Parent(s): 4620568

Create srt_utils.py

Browse files

Files changed (1) hide show

srt_utils.py +205 -0

srt_utils.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import re
def srt_time_to_seconds(timestamp):
    """Convert an SRT timestamp (``HH:MM:SS,mmm``) to seconds.

    A ``.`` is accepted in place of the ``,`` in front of the milliseconds.

    Args:
        timestamp: Timestamp string such as ``"00:01:02,500"``.

    Returns:
        The time in seconds as a float, or ``0.0`` when *timestamp* is not a
        string or cannot be parsed.
    """
    try:
        time_part, ms_part = timestamp.strip().replace(".", ",").split(",")
        h, m, s = map(int, time_part.split(":"))
        ms = int(ms_part)
    except (AttributeError, TypeError, ValueError):
        # Best-effort parser: malformed input maps to 0.0, but unrelated
        # exceptions (KeyboardInterrupt, SystemExit, ...) are not swallowed.
        return 0.0
    return h * 3600 + m * 60 + s + ms / 1000.0
def seconds_to_srt_time(seconds):
    """Convert a number of seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to the nearest millisecond before it is split into
    fields, so binary floating-point error cannot drop a millisecond
    (``1.001`` renders as ``00:00:01,001``). Negative values are clamped to
    zero because SRT has no representation for them.

    Args:
        seconds: Time offset in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, ms = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
def parse_srt(srt_content):
    """Parse SRT content into a list of cue dictionaries.

    Line endings are normalised to ``\\n`` first, so CRLF input does not
    leave stray ``\\r`` characters inside multi-line cue text. A cue's text
    ends at the first blank line, or at a line of digits that is directly
    followed by a timestamp line (the next cue's index). A text line that is
    merely numeric, such as the word "42", is kept as text.

    No validation of the timestamps is performed; an unparsable timestamp
    becomes ``0.0`` via ``srt_time_to_seconds``.

    Args:
        srt_content: The SRT document as a string.

    Returns:
        A list of ``{'start': float, 'end': float, 'text': str}`` dicts in
        document order; an empty list when *srt_content* is empty or ``None``.
    """
    if not srt_content:
        return []
    normalized = srt_content.replace("\r\n", "\n").replace("\r", "\n")
    pattern = re.compile(
        r"(\d+)\s*\n([^-\n]+?) --> ([^-\n]+?)\s*\n"
        # Text lines: stop only where a new "index + timestamp" header begins.
        r"((?:(?!^\d+\s*\n[^-\n]+? --> ).+\n?)*)",
        re.MULTILINE,
    )
    return [
        {
            'start': srt_time_to_seconds(start.strip()),
            'end': srt_time_to_seconds(end.strip()),
            'text': text.strip(),
        }
        for _index, start, end, text in pattern.findall(normalized)
    ]
def format_text_lines(text, max_chars=42):
    """Format *text* into at most two lines of roughly balanced length.

    Args:
        text: The subtitle text to lay out.
        max_chars: Maximum number of characters allowed per line.

    Returns:
        - ``""`` when *text* contains no words.
        - *text* unchanged when it already fits on one line.
        - The single word itself when *text* is one word longer than
          *max_chars* (there is nowhere to break it).
        - Otherwise two lines joined by ``"\\n"``. Among the word boundaries
          where both lines fit *max_chars*, the most balanced one is chosen,
          with a preference for a longer second line. When no boundary
          fits, the words are split in half by count and the player is left
          to wrap or clip.
    """
    words = text.split()
    if not words:
        return ""
    if len(text) <= max_chars:
        return text
    if len(words) == 1:
        # Splitting a lone word "in half" would yield an empty first line
        # ("\nword"), and a blank line terminates an SRT cue.
        return words[0]

    best_split_idx = -1
    best_balance = float('inf')
    for i in range(1, len(words)):
        len1 = len(" ".join(words[:i]))
        len2 = len(" ".join(words[i:]))
        if len1 > max_chars or len2 > max_chars:
            continue
        balance = abs(len2 - len1)
        # Bonus for a bottom-heavy layout (second line at least as long).
        if len2 >= len1:
            balance -= 5
        if balance < best_balance:
            best_balance = balance
            best_split_idx = i

    if best_split_idx == -1:
        # No boundary keeps both lines within max_chars (a very long word,
        # or more than two lines' worth of text): split by word count.
        best_split_idx = len(words) // 2
    line1 = " ".join(words[:best_split_idx])
    line2 = " ".join(words[best_split_idx:])
    return f"{line1}\n{line2}"
def fix_word_timing(words):
    """Make word timings strictly sequential while keeping the given order.

    Every word keeps its own duration (raised to 0.01 when shorter). A word
    that would begin before the previous one has ended is shifted to begin
    exactly at that end; a word that begins later (silence) is left where it
    is. The word dictionaries are updated in place and returned in a new
    list.

    Returns an empty list when *words* is empty or otherwise falsy.
    """
    if not words:
        return []

    cursor = 0.0  # end time of the previously placed word
    result = []
    for entry in words:
        begin = entry['start']
        length = entry['end'] - begin
        if length < 0.01:
            # Minimal sanity floor for zero or negative durations.
            length = 0.01
        if begin < cursor:
            # Overlap with the previous word: push this one forward.
            begin = cursor
        cursor = begin + length
        entry.update(start=begin, end=cursor)
        result.append(entry)
    return result
def apply_netflix_style_filter(srt_content):
    """Group word-level subtitles into Netflix-style phrases.

    Consecutive words are merged into one cue until one of these rules
    forces a new cue:

    - a silence longer than 0.5 seconds before the next word;
    - the merged text would exceed 2 lines of 42 characters (84 in total);
    - the merged cue would last longer than 7 seconds;
    - the previous word ends a sentence (``.``, ``!`` or ``?``) and the
      merged text would no longer fit on a single line.

    Each resulting cue is laid out with ``format_text_lines``.

    Args:
        srt_content: SRT text whose cues are individual words.

    Returns:
        The regrouped SRT text, or *srt_content* unchanged when no cue could
        be parsed from it.
    """
    words = parse_srt(srt_content)
    if not words:
        return srt_content
    # Resolve overlapping word timings before measuring gaps and durations.
    words = fix_word_timing(words)

    MAX_CHARS_PER_LINE = 42
    MAX_LINES = 2
    MAX_TOTAL_CHARS = MAX_CHARS_PER_LINE * MAX_LINES
    MAX_DURATION = 7.0
    MIN_GAP_FOR_SPLIT = 0.5  # seconds

    def get_group_text(group):
        return " ".join(w['text'] for w in group)

    grouped_events = []
    current_group = []
    for word in words:
        if not current_group:
            current_group.append(word)
            continue

        last_word = current_group[-1]
        gap = word['start'] - last_word['end']
        new_text_proj = get_group_text(current_group) + " " + word['text']
        new_duration_proj = word['end'] - current_group[0]['start']
        # Sentence breaks are preferred, but short sentences may share a cue
        # as long as the result still fits on one line (e.g. "Yes. I do.").
        breaks_after_sentence = (
            re.search(r'[.!?]$', last_word['text']) is not None
            and len(new_text_proj) > MAX_CHARS_PER_LINE
        )

        if (
            gap > MIN_GAP_FOR_SPLIT
            or len(new_text_proj) > MAX_TOTAL_CHARS
            or new_duration_proj > MAX_DURATION
            or breaks_after_sentence
        ):
            grouped_events.append(current_group)
            current_group = [word]
        else:
            current_group.append(word)

    if current_group:
        grouped_events.append(current_group)

    # Render the cues; every group holds at least one word by construction.
    entries = []
    for index, group in enumerate(grouped_events, 1):
        start_time = seconds_to_srt_time(group[0]['start'])
        end_time = seconds_to_srt_time(group[-1]['end'])
        formatted_text = format_text_lines(get_group_text(group), MAX_CHARS_PER_LINE)
        entries.append(f"{index}\n{start_time} --> {end_time}\n{formatted_text}")
    return "\n\n".join(entries).strip()