habulaj commited on
Commit
7463c33
·
verified ·
1 Parent(s): da37cf5

Create srt_utils.py

Browse files
Files changed (1) hide show
  1. srt_utils.py +445 -0
srt_utils.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def srt_time_to_seconds(timestamp):
    """Convert an SRT timestamp (HH:MM:SS,mmm) to seconds.

    Returns 0.0 for malformed input instead of raising, so callers can
    treat unparseable cues as starting at the origin.
    """
    try:
        time_part, ms_part = timestamp.split(",")
        h, m, s = map(int, time_part.split(":"))
        return h * 3600 + m * 60 + s + int(ms_part) / 1000.0
    except (ValueError, AttributeError):
        # ValueError: wrong field count or non-numeric parts.
        # AttributeError: non-string input (e.g. None).
        # Narrowed from a bare `except:` so real bugs / KeyboardInterrupt
        # are not silently swallowed.
        return 0.0
12
+
13
def seconds_to_srt_time(seconds):
    """Convert seconds to an SRT timestamp (HH:MM:SS,mmm).

    Works in whole milliseconds to avoid float truncation artifacts:
    the old `int((seconds % 1) * 1000)` turned e.g. 1.9995s into
    "...,999" instead of rounding up to 2s. Negative inputs are
    clamped to 00:00:00,000.
    """
    total_ms = max(0, round(seconds * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
20
+
21
def shift_srt_timestamps(srt_content, offset_seconds):
    """Shift every timestamp in SRT content by offset_seconds.

    Cues are renumbered sequentially from 1. Shifted start times are
    clamped to be non-negative and end times strictly positive.
    Returns the original content unchanged if it cannot be parsed.
    """
    subs = parse_srt(srt_content)
    if not subs:
        return srt_content

    blocks = []  # joined once at the end instead of quadratic `+=`
    for i, sub in enumerate(subs, 1):
        start = max(0.0, sub['start'] + offset_seconds)
        # Keep the end strictly > 0 so zero-length cues don't trip
        # downstream overlap logic.
        end = max(1e-3, sub['end'] + offset_seconds)

        start_str = seconds_to_srt_time(start)
        end_str = seconds_to_srt_time(end)
        blocks.append(f"{i}\n{start_str} --> {end_str}\n{sub['text']}\n")

    return "\n".join(blocks).strip()
42
+
43
def parse_srt(srt_content):
    """Parse SRT text into a validated list of cue dictionaries.

    Each entry has 'start' and 'end' in seconds plus the stripped
    'text'. Unparseable content yields an empty list.
    """
    pattern = re.compile(r"(\d+)\s*\n([^-\n]+?) --> ([^-\n]+?)\s*\n((?:(?!\d+\s*\n\d{1,2}:\d{2}).+\n?)*)", re.MULTILINE)
    return [
        {
            'start': srt_time_to_seconds(start.strip()),
            'end': srt_time_to_seconds(end.strip()),
            'text': body.strip(),
        }
        for _index, start, end, body in pattern.findall(srt_content)
    ]
56
+
57
def format_text_lines(text, max_chars=42):
    """Format text into at most 2 lines with balanced lengths.

    A single line is returned only when the text both fits max_chars
    AND is short enough to read comfortably; longer text is split at
    the word boundary that best balances the two lines, with a slight
    preference for a bottom-heavy (pyramid) shape.

    Args:
        text: The text to wrap.
        max_chars: Maximum characters allowed per line.

    Returns:
        The formatted text ("" for empty input), lines joined by a
        newline character.
    """
    words = text.split()
    if not words:
        return ""

    # Even text that fits max_chars is split once it exceeds this
    # threshold, for readability (single 42-char lines read too long).
    FORCE_SPLIT_THRESHOLD = 30

    if len(text) <= max_chars and len(text) <= FORCE_SPLIT_THRESHOLD:
        return text

    # Search every word boundary for the best 2-line split.
    best_split_idx = -1
    best_balance = float('inf')

    for i in range(1, len(words)):
        len1 = len(" ".join(words[:i]))
        len2 = len(" ".join(words[i:]))

        # Only splits where BOTH lines fit max_chars are candidates.
        if len1 <= max_chars and len2 <= max_chars:
            balance = abs(len2 - len1)
            # Prefer bottom-heavy splits (line2 >= line1): pyramid shape.
            if len2 >= len1:
                balance -= 5

            if balance < best_balance:
                best_balance = balance
                best_split_idx = i

    if best_split_idx != -1:
        # (A dead branch here used to recompute line1/line2 and discard
        # them; removed.)
        line1 = " ".join(words[:best_split_idx])
        line2 = " ".join(words[best_split_idx:])
        return f"{line1}\n{line2}"

    # No valid split found (e.g. a single very long word) but the whole
    # text still fits on one line.
    if len(text) <= max_chars:
        return text

    # Last resort for very long unsplittable text: split at the middle word.
    mid = len(words) // 2
    return " ".join(words[:mid]) + "\n" + " ".join(words[mid:])
117
+
118
def fix_word_timing(words):
    """
    Make word timings strictly sequential.

    When two consecutive words overlap, the previous word's end is
    trimmed back to the current word's start whenever that leaves the
    previous word with a non-trivial duration; otherwise the current
    word is delayed to start where the previous one ends. Every word
    is also guaranteed a minimum duration of 100ms. The list is
    modified in place and returned.
    """
    if not words:
        return []

    # Walk consecutive pairs; mutations carry forward because the
    # dicts are shared between iterations.
    for prev, curr in zip(words, words[1:]):
        if curr['start'] < prev['end']:
            # Overlap: prefer trimming prev's end to curr's start.
            trimmed_end = max(prev['start'], curr['start'])

            # Trimming would leave prev (near-)empty only when curr
            # starts at or before prev does — then delay curr instead.
            if trimmed_end <= prev['start'] + 0.01:
                curr['start'] = prev['end']
            else:
                prev['end'] = trimmed_end

        # Enforce a minimum 100ms duration on the current word.
        if curr['end'] <= curr['start']:
            curr['end'] = curr['start'] + 0.1

    return words
157
+
158
def apply_netflix_style_filter(srt_content):
    """
    Group word-level subtitles into Netflix-style phrase cues.

    Rules applied (in order, per word):
      - A silence gap > 0.5s between words always starts a new cue.
      - Crossing 42 chars starts a new cue if the current cue already
        lasts > 1s or the projected text exceeds 70 chars; otherwise a
        second line is allowed.
      - Hard limits: 84 chars total (2 x 42) and 7s duration per cue.
      - Sentence-ending punctuation (. ! ?) ends a cue unless the cue
        text is still tiny (<= 3 chars).
    A post-pass merges short "orphan" cues back into the previous cue
    when the gap is < 1s and the merged text still formats within the
    line limits.

    Returns a rebuilt SRT string; the input is returned unchanged when
    no words can be parsed from it.
    """
    words = parse_srt(srt_content)
    if not words:
        return srt_content

    # Repair overlapping/inverted word timings before any grouping.
    words = fix_word_timing(words)

    grouped_events = []   # list of word-groups, each becoming one cue
    current_group = []    # words accumulated for the cue being built

    MAX_CHARS_PER_LINE = 42
    MAX_LINES = 2
    MAX_TOTAL_CHARS = MAX_CHARS_PER_LINE * MAX_LINES
    MAX_DURATION = 7.0          # seconds, hard cap per cue
    MIN_GAP_FOR_SPLIT = 0.5     # seconds of silence that forces a new cue

    def get_group_text(group):
        # Plain space-join of the words' text fields.
        return " ".join(w['text'] for w in group)

    def get_group_duration(group):
        # NOTE(review): currently unused; durations below are computed inline.
        if not group: return 0
        return group[-1]['end'] - group[0]['start']

    for i, word in enumerate(words):
        if not current_group:
            current_group.append(word)
            continue

        last_word = current_group[-1]

        # 1. Silence gap: a pause longer than MIN_GAP_FOR_SPLIT ends the cue.
        gap = word['start'] - last_word['end']
        if gap > MIN_GAP_FOR_SPLIT:
            grouped_events.append(current_group)
            current_group = [word]
            continue

        # 2. Size/duration projections if this word were appended.
        current_text = get_group_text(current_group)
        new_text_proj = current_text + " " + word['text']
        current_duration = last_word['end'] - current_group[0]['start']
        new_duration_proj = word['end'] - current_group[0]['start']

        # Prefer single-line cues: crossing the one-line boundary is a
        # candidate break point, not an automatic one.
        if len(new_text_proj) > MAX_CHARS_PER_LINE:
            # Break now if the cue already has enough screen time...
            is_long_enough_dur = current_duration > 1.0

            # ...or the projected text is getting huge. Netflix allows up
            # to 84 chars over 2 lines, but we cap 2-line cues at 70 for
            # better separation.
            is_too_huge = len(new_text_proj) > 70

            if is_long_enough_dur or is_too_huge:
                grouped_events.append(current_group)
                current_group = [word]
                continue

            # Otherwise allow spilling into a 2nd line (fast speech).

        # Absolute limits (total chars for 2 lines, max duration).
        if len(new_text_proj) > MAX_TOTAL_CHARS or new_duration_proj > MAX_DURATION:
            grouped_events.append(current_group)
            current_group = [word]
            continue

        # 3. Sentence endings: if the previous word ends a sentence,
        # split — unless the cue is still tiny (e.g. just "No.").
        if re.search(r'[.!?]$', last_word['text']):
            if len(current_text) > 3:
                grouped_events.append(current_group)
                current_group = [word]
                continue

        current_group.append(word)

    if current_group:
        grouped_events.append(current_group)

    # --- Post-processing: merge orphan cues ---
    # Fold single-word / very short cues into the previous cue when they
    # are close in time and the merged text still fits the line limits.
    merged_events = []
    if grouped_events:
        merged_events.append(grouped_events[0])

    for i in range(1, len(grouped_events)):
        prev_group = merged_events[-1]
        curr_group = grouped_events[i]

        # Orphan candidate: a single word, or under 10 chars of text.
        curr_text = get_group_text(curr_group)
        is_orphan = len(curr_group) == 1 or len(curr_text) < 10

        if is_orphan:
            gap = curr_group[0]['start'] - prev_group[-1]['end']

            # < 1.0s gap counts as a "continuation" of the previous cue.
            if gap < 1.0:
                # Simulate the merge and check the formatted result.
                combined_text = get_group_text(prev_group + curr_group)
                formatted = format_text_lines(combined_text, MAX_CHARS_PER_LINE)
                lines = formatted.split('\n')

                # format_text_lines normally emits at most 2 lines; we
                # only sanity-check per-line length here (with a small
                # tolerance for its fallback paths).
                valid_merge = True
                for line in lines:
                    if len(line) > MAX_CHARS_PER_LINE + 5:  # tolerance
                        valid_merge = False
                        break

                if valid_merge:
                    # Merge into the previous cue and drop the orphan.
                    prev_group.extend(curr_group)
                    continue

        # Not merged: keep as its own cue.
        merged_events.append(curr_group)

    # --- Emit the rebuilt SRT ---
    output_srt = ""
    for i, group in enumerate(merged_events, 1):
        if not group: continue

        start_time = seconds_to_srt_time(group[0]['start'])
        end_time = seconds_to_srt_time(group[-1]['end'])

        text = get_group_text(group)
        formatted_text = format_text_lines(text, MAX_CHARS_PER_LINE)

        output_srt += f"{i}\n{start_time} --> {end_time}\n{formatted_text}\n\n"

    return output_srt.strip()
318
+
319
+ import subprocess
320
+ import shutil
321
+ import os
322
+
323
def process_audio_for_transcription(input_file: str, has_bg_music: bool = False, time_start: float = None, time_end: float = None) -> str:
    """
    Process audio to maximize speech clarity.

    Args:
        input_file: Path to the input audio file.
        has_bg_music: If True, run Demucs to strip background music (slow).
            If False, skip Demucs and rely only on FFmpeg voice filters (fast).
        time_start: Optional cut start in seconds (output seeking).
        time_end: Optional cut end in seconds (output seeking).

    Returns:
        Path to the processed .mp3 (vocals). Falls back to the Demucs
        vocals or the original file if FFmpeg fails or is missing.
    """
    # Output directory for processed files.
    output_dir = os.path.join("static", "processed")
    os.makedirs(output_dir, exist_ok=True)

    input_filename = os.path.basename(input_file)
    input_stem = os.path.splitext(input_filename)[0]

    # Suffix the output name with the cut window so different cuts of the
    # same source never collide in the cache.
    suffix = ""
    if time_start is not None:
        suffix += f"_s{int(time_start)}"
    if time_end is not None:
        suffix += f"_e{int(time_end)}"

    final_output = os.path.join(output_dir, f"{input_stem}{suffix}.processed.mp3")

    ffmpeg_cmd = shutil.which("ffmpeg")
    if not ffmpeg_cmd:
        print("⚠️ FFmpeg não encontrado!")
        return input_file

    vocals_path = input_file

    # 1. Background-music removal via Demucs — optional and slow.
    if has_bg_music:
        print(f"🔊 [Demucs] Iniciando isolamento de voz via AI (has_bg_music=True)...")
        demucs_output_dir = os.path.join("static", "separated")
        os.makedirs(demucs_output_dir, exist_ok=True)

        # Prefer the resolved binary; fall back to the bare PATH alias.
        demucs_cmd = shutil.which("demucs") or "demucs"

        try:
            model = "htdemucs"
            command = [
                demucs_cmd,
                "--two-stems=vocals",
                "-n", model,
                "-d", "cpu",
                "--mp3",
                "--mp3-bitrate", "128",
                input_file,
                "-o", demucs_output_dir,
            ]

            print(f"🔊 Executando Demucs...")
            result = subprocess.run(command, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

            # Demucs writes: <out>/<model>/<input_stem>/vocals.mp3
            demucs_vocals = os.path.join(demucs_output_dir, model, input_stem, "vocals.mp3")
            if result.returncode == 0 and os.path.exists(demucs_vocals):
                print(f"✅ Demucs sucesso: {demucs_vocals}")
                vocals_path = demucs_vocals
            else:
                # Covers both a non-zero exit code AND a zero exit with a
                # missing output file (the latter was previously silent).
                print(f"⚠️ Erro no Demucs (Code {result.returncode}), continuando com audio original.")

        except Exception as e:
            print(f"⚠️ Falha no Demucs: {e}")

    else:
        print(f"⏩ [Demucs] Pulando remoção de música (has_bg_music=False).")

    # 2. Voice enhancement (FFmpeg filters) — always runs.
    print(f"🔊 [FFmpeg] Aplicando filtros de melhoria de voz...")

    # Mono 16kHz mp3 with aggressive speech enhancement: highpass, FFT
    # denoising, compression, presence EQ boost, and loudness normalization.
    filter_chain = (
        "highpass=f=100,"
        "afftdn=nr=10:nf=-50:tn=1,"
        "compand=attacks=0:points=-80/-90|-45/-25|-27/-9|0/-7:gain=5,"
        "equalizer=f=3000:width_type=h:width=1000:g=5,"
        "loudnorm"
    )

    cmd_convert = [
        ffmpeg_cmd, "-y",
        "-i", vocals_path,
    ]

    # Cut window placed AFTER -i (output seeking) for frame accuracy.
    if time_start is not None:
        cmd_convert.extend(["-ss", str(time_start)])
    if time_end is not None:
        cmd_convert.extend(["-to", str(time_end)])

    cmd_convert.extend([
        "-ac", "1", "-ar", "16000",
        "-af", filter_chain,
        "-c:a", "libmp3lame", "-q:a", "2",
        final_output,
    ])

    try:
        subprocess.run(cmd_convert, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        # Best-effort cleanup of the Demucs song folder if it was used;
        # narrowed from a bare `except:` to filesystem errors only.
        if has_bg_music and "separated" in vocals_path:
            try:
                # vocals.mp3 lives inside the per-song folder — remove it.
                shutil.rmtree(os.path.dirname(vocals_path))
            except OSError:
                pass

        return final_output

    except Exception as e:
        print(f"⚠️ Erro no FFmpeg: {e}")
        return vocals_path