Spaces:

xTHExBEASTx
/

srt

Sleeping

App Files Files Community

xTHExBEASTx committed on Feb 12

Commit

0a1e2fb

verified ·

1 Parent(s): 82d594e

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -78

app.py CHANGED Viewed

@@ -5,11 +5,13 @@ import torch
 import os
 import math
 from datetime import timedelta
-import subprocess  # Needed to run FFMPEG commands
 # --- Configuration ---
 TRANSLATION_MODEL = "facebook/nllb-200-distilled-1.3B"
-WHISPER_MODEL = "distil-whisper/distil-large-v3"
 print("Loading Models...")
@@ -30,30 +32,89 @@ whisper_pipe = pipeline(
 print("Models Loaded Successfully!")
 # ---------------------------------------------------------
-# Helper: Extract Audio from Video
 # ---------------------------------------------------------
 def extract_audio(video_path):
-    """
-    Converts video to mp3 using ffmpeg.
-    Returns the path to the generated audio file.
-    """
     output_audio_path = "temp_audio.mp3"
-    # Check if previous temp file exists and remove it
     if os.path.exists(output_audio_path):
         os.remove(output_audio_path)
-    # Run ffmpeg command to extract audio
-    # -i: input, -vn: no video, -acodec: audio codec, -y: overwrite
     command = [
         "ffmpeg", "-i", video_path,
         "-vn", "-acodec", "libmp3lame",
         "-y", output_audio_path
     ]
     subprocess.run(command, check=True)
     return output_audio_path
 # ---------------------------------------------------------
 # Logic 1: Translation (NLLB)
 # ---------------------------------------------------------
@@ -61,31 +122,22 @@ def batch_translate(texts, src_lang, tgt_lang, batch_size=8, progress=gr.Progres
     results = []
     tokenizer_nllb.src_lang = src_lang
-    total_batches = (len(texts) + batch_size - 1) // batch_size
     for i, start_idx in enumerate(range(0, len(texts), batch_size)):
         batch = texts[start_idx : start_idx + batch_size]
         inputs = tokenizer_nllb(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
         forced_bos_token_id = tokenizer_nllb.convert_tokens_to_ids(tgt_lang)
         with torch.no_grad():
-            generated_tokens = model_nllb.generate(
-                **inputs,
-                forced_bos_token_id=forced_bos_token_id,
-                max_length=512
-            )
-        batch_results = tokenizer_nllb.batch_decode(generated_tokens, skip_special_tokens=True)
-        results.extend(batch_results)
     return results
 def process_translation(filepath, src_lang_code, tgt_lang_code):
     if filepath is None: return None
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
-            content = f.read()
-        subtitles = list(srt.parse(content))
     except Exception as e:
         return f"Error: {str(e)}"
@@ -106,15 +158,14 @@ def process_translation(filepath, src_lang_code, tgt_lang_code):
 def video_to_srt(video_path, progress=gr.Progress()):
     if video_path is None: return None
-    # 1. Extract Audio First (Fixes the ValueError)
-    progress(0.1, desc="Extracting Audio from Video...")
     try:
         audio_path = extract_audio(video_path)
     except Exception as e:
         return f"Error extracting audio: {str(e)}"
-    # 2. Run Transcription on the Audio file
-    progress(0.3, desc="Transcribing Audio (this may take a while)...")
     outputs = whisper_pipe(audio_path, return_timestamps=True, generate_kwargs={"language": "english"})
     chunks = outputs.get("chunks", [])
@@ -123,24 +174,8 @@ def video_to_srt(video_path, progress=gr.Progress()):
     progress(0.8, desc="Formatting SRT...")
-    srt_subtitles = []
-    for i, chunk in enumerate(chunks):
-        text = chunk['text'].strip()
-        timestamp = chunk['timestamp']
-        # Handle cases where timestamp might be None or single value
-        if isinstance(timestamp, (list, tuple)):
-            start, end = timestamp
-        else:
-            start = 0.0
-            end = 5.0
-        if end is None:
-            end = start + 5.0
-        srt_subtitles.append(
-            srt.Subtitle(index=i+1, start=timedelta(seconds=start), end=timedelta(seconds=end), content=text)
-        )
     out_path = "generated_captions.srt"
     with open(out_path, 'w', encoding='utf-8') as f:
@@ -151,45 +186,28 @@ def video_to_srt(video_path, progress=gr.Progress()):
 # ---------------------------------------------------------
 # Gradio Interface
 # ---------------------------------------------------------
-with gr.Blocks(title="The Ultimate Subtitler") as demo:
-    gr.Markdown("# 🎥 The Ultimate Subtitle Tool")
     with gr.Tabs():
-        # Tab 1: Video to SRT
-        with gr.TabItem("Step 1: Video to SRT (Whisper)"):
-            gr.Markdown("### Upload a video to generate English captions")
             with gr.Row():
                 video_input = gr.Video(label="Upload Video")
-                srt_output_gen = gr.File(label="Generated English SRT")
-            gen_btn = gr.Button("Generate SRT", variant="primary")
-            gen_btn.click(video_to_srt, inputs=video_input, outputs=srt_output_gen)
-        # Tab 2: Translate SRT
-        with gr.TabItem("Step 2: Translate SRT (NLLB)"):
-            gr.Markdown("### Translate any SRT file")
             with gr.Row():
-                srt_input = gr.File(label="Upload SRT File")
                 with gr.Column():
-                    src_lang = gr.Dropdown(
-                        ["eng_Latn", "spa_Latn", "fra_Latn", "deu_Latn"],
-                        label="Source Language", value="eng_Latn"
-                    )
-                    tgt_lang = gr.Dropdown(
-                        ["arb_Arab", "arz_Arab", "eng_Latn", "fra_Latn"],
-                        label="Target Language", value="arb_Arab"
-                    )
                 srt_output_trans = gr.File(label="Translated SRT")
-            trans_btn = gr.Button("Translate", variant="primary")
-            trans_btn.click(
-                process_translation,
-                inputs=[srt_input, src_lang, tgt_lang],
-                outputs=srt_output_trans
-            )
 if __name__ == "__main__":
     demo.launch()

 import os
 import math
 from datetime import timedelta
+import subprocess
 # --- Configuration ---
 TRANSLATION_MODEL = "facebook/nllb-200-distilled-1.3B"
+# We use OpenAI's original small model for better segmentation on CPU
+# It is often better at splitting sentences than Distil-Large for subtitles
+WHISPER_MODEL = "openai/whisper-small"
 print("Loading Models...")
 print("Models Loaded Successfully!")
 # ---------------------------------------------------------
+# Helper: Extract Audio
 # ---------------------------------------------------------
 def extract_audio(video_path):
     """Extract the audio track of a video into an MP3 file using ffmpeg.

     Args:
         video_path: Path of the input video handed to ffmpeg via ``-i``.

     Returns:
         The path of the written audio file (always ``"temp_audio.mp3"``).

     Raises:
         subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
     """
     mp3_path = "temp_audio.mp3"

     # Remove a leftover file from an earlier run before ffmpeg writes a new one.
     if os.path.exists(mp3_path):
         os.remove(mp3_path)

     # -vn drops the video stream, -acodec selects the MP3 encoder,
     # -y lets ffmpeg overwrite the output without prompting.
     ffmpeg_args = ["ffmpeg", "-i", video_path]
     ffmpeg_args += ["-vn", "-acodec", "libmp3lame"]
     ffmpeg_args += ["-y", mp3_path]

     subprocess.run(ffmpeg_args, check=True)
     return mp3_path
+# ---------------------------------------------------------
+# Helper: Smart SRT Splitter (The Fix!)
+# ---------------------------------------------------------
def split_text_into_lines(text, max_chars=80):
    """Break text into lines of at most ``max_chars`` characters on word boundaries.

    Words are never split: a single word longer than ``max_chars`` is placed
    on a line of its own, and that line exceeds the limit.

    Args:
        text: The text to wrap. Any run of whitespace counts as one separator.
        max_chars: Maximum length of a line, counting the spaces between words.

    Returns:
        A list of non-empty lines; an empty list when ``text`` has no words.
    """
    lines = []
    current_line = []
    current_length = 0
    for word in text.split():
        # A separating space is only needed when the line already holds a word.
        needed = len(word) + (1 if current_line else 0)
        # Only flush a line that has content, so an over-long word arriving
        # on an empty line never produces an empty "" entry.
        if current_line and current_length + needed > max_chars:
            lines.append(" ".join(current_line))
            current_line = [word]
            current_length = len(word)
        else:
            current_line.append(word)
            current_length += needed
    if current_line:
        lines.append(" ".join(current_line))
    return lines
def create_srt_segments(chunks):
    """Turn raw Whisper chunks into a list of ``srt.Subtitle`` entries.

    Each chunk's text is wrapped to lines of at most 80 characters, and the
    chunk's time span is shared among those lines in proportion to their
    character counts, so longer lines stay on screen longer.

    Args:
        chunks: Iterable of dicts with a ``'text'`` string and a
            ``'timestamp'`` entry expected to be a ``(start, end)`` pair of
            seconds.

    Returns:
        A list of ``srt.Subtitle`` objects indexed consecutively from 1.
        Chunks with an unusable timestamp or no text contribute nothing.
    """
    srt_subtitles = []
    index_counter = 1
    for chunk in chunks:
        text = chunk['text'].strip()
        timestamp = chunk['timestamp']

        # Skip chunks whose timestamp is not a (start, end) pair or has no start.
        if not isinstance(timestamp, (list, tuple)) or len(timestamp) != 2:
            continue
        start_time, end_time = timestamp
        if start_time is None:
            continue
        # A missing or backwards end time cannot be used; fall back to a
        # fixed 5-second duration.
        if end_time is None or end_time < start_time:
            end_time = start_time + 5.0

        # Drop empty lines so they never become empty subtitles.
        lines = [line for line in split_text_into_lines(text, max_chars=80) if line]
        if not lines:
            continue

        total_duration = end_time - start_time
        total_chars = sum(len(line) for line in lines)
        chars_so_far = 0
        current_start = start_time
        for line in lines:
            chars_so_far += len(line)
            # Derive each boundary from the cumulative share of characters
            # rather than summing per-line durations, which avoids
            # accumulating floating-point drift across lines.
            current_end = start_time + total_duration * chars_so_far / total_chars
            srt_subtitles.append(
                srt.Subtitle(
                    index=index_counter,
                    start=timedelta(seconds=current_start),
                    end=timedelta(seconds=current_end),
                    content=line,
                )
            )
            index_counter += 1
            current_start = current_end  # Next line starts where this one ended
    return srt_subtitles
 # ---------------------------------------------------------
 # Logic 1: Translation (NLLB)
 # ---------------------------------------------------------
     results = []
     tokenizer_nllb.src_lang = src_lang
     for i, start_idx in enumerate(range(0, len(texts), batch_size)):
         batch = texts[start_idx : start_idx + batch_size]
         inputs = tokenizer_nllb(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
         forced_bos_token_id = tokenizer_nllb.convert_tokens_to_ids(tgt_lang)
         with torch.no_grad():
+            generated_tokens = model_nllb.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=512)
+        results.extend(tokenizer_nllb.batch_decode(generated_tokens, skip_special_tokens=True))
     return results
 def process_translation(filepath, src_lang_code, tgt_lang_code):
     if filepath is None: return None
     try:
         with open(filepath, 'r', encoding='utf-8') as f:
+            subtitles = list(srt.parse(f.read()))
     except Exception as e:
         return f"Error: {str(e)}"
 def video_to_srt(video_path, progress=gr.Progress()):
     if video_path is None: return None
+    progress(0.1, desc="Extracting Audio...")
     try:
         audio_path = extract_audio(video_path)
     except Exception as e:
         return f"Error extracting audio: {str(e)}"
+    progress(0.3, desc="Transcribing...")
+    # We enable return_timestamps=True to get segment-level timing
     outputs = whisper_pipe(audio_path, return_timestamps=True, generate_kwargs={"language": "english"})
     chunks = outputs.get("chunks", [])
     progress(0.8, desc="Formatting SRT...")
+    # Use the new Smart Splitter function
+    srt_subtitles = create_srt_segments(chunks)
     out_path = "generated_captions.srt"
     with open(out_path, 'w', encoding='utf-8') as f:
 # ---------------------------------------------------------
 # Gradio Interface
 # ---------------------------------------------------------
+with gr.Blocks(title="SRT Master Tool") as demo:
+    gr.Markdown("# 🎬 Auto Subtitle & Translator")
     with gr.Tabs():
+        with gr.TabItem("Step 1: Video to SRT"):
+            gr.Markdown("### Convert Video to English Subtitles")
             with gr.Row():
                 video_input = gr.Video(label="Upload Video")
+                srt_output_gen = gr.File(label="Generated SRT")
+            btn1 = gr.Button("Generate SRT", variant="primary")
+            btn1.click(video_to_srt, inputs=video_input, outputs=srt_output_gen)
+        with gr.TabItem("Step 2: Translate SRT"):
+            gr.Markdown("### Translate Subtitles to Arabic")
             with gr.Row():
+                srt_input = gr.File(label="Upload SRT")
                 with gr.Column():
+                    src_l = gr.Dropdown(["eng_Latn", "fra_Latn"], label="From", value="eng_Latn")
+                    tgt_l = gr.Dropdown(["arb_Arab", "arz_Arab"], label="To", value="arb_Arab")
                 srt_output_trans = gr.File(label="Translated SRT")
+            btn2 = gr.Button("Translate", variant="primary")
+            btn2.click(process_translation, inputs=[srt_input, src_l, tgt_l], outputs=srt_output_trans)
 if __name__ == "__main__":
     demo.launch()