Update app.py

app.py CHANGED
@@ -1,54 +1,43 @@
 import gradio as gr
-from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+import whisper
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import srt
 import torch
 import os
-import math
 from datetime import timedelta
 import subprocess
 import re
 
 # --- Configuration ---
+# Translation Model (NLLB)
 TRANSLATION_MODEL = "facebook/nllb-200-distilled-1.3B"
+
+# Whisper Model Size: "medium" is the best balance for CPU.
+# You can change to "large" or "large-v3" but it will be 2x slower.
+WHISPER_MODEL_SIZE = "medium"
 
 print("Loading Models...")
 
-# --- Load Translation Model ---
+# --- Load Translation Model (NLLB) ---
 tokenizer_nllb = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)
 model_nllb = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)
 
-# --- Load Audio Model ---
-whisper_pipe = pipeline(
-    torch_dtype=torch.float32,
-    device="cpu",
-    chunk_length_s=30,
-    stride_length_s=5,
-)
+# --- Load Audio Model (Official OpenAI Whisper) ---
+# This downloads the model to the container
+print(f"Loading Whisper '{WHISPER_MODEL_SIZE}' model...")
+whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device="cpu")
 
 print("Models Loaded Successfully!")
 
 # ---------------------------------------------------------
 # Helper: Extract Audio
 # ---------------------------------------------------------
-def get_media_duration(filename):
-    try:
-        result = subprocess.run(
-            ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", filename],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT
-        )
-        return float(result.stdout)
-    except:
-        return 30.0
-
 def extract_audio(video_path):
     output_audio_path = "temp_audio.mp3"
     if os.path.exists(output_audio_path):
         os.remove(output_audio_path)
 
+    # Simple FFMPEG extraction
     command = [
         "ffmpeg", "-i", video_path,
         "-vn", "-acodec", "libmp3lame",
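The core of this commit is the swap from a `transformers` ASR pipeline to the native `openai-whisper` package (so `openai-whisper` must be present in the Space's requirements). A minimal sketch of the new loading path, assuming the package is installed and a `temp_audio.mp3` exists:

```python
import whisper

# Same call as the diff: download/load the "medium" checkpoint on CPU.
model = whisper.load_model("medium", device="cpu")

# transcribe() takes a file path and returns a dict: "text" holds the
# full transcript, "segments" a list of timed chunks.
result = model.transcribe("temp_audio.mp3", language="en")

for seg in result["segments"]:
    # Each segment carries "start"/"end" in seconds plus its "text".
    print(f'{seg["start"]:7.2f} -> {seg["end"]:7.2f}  {seg["text"].strip()}')
```

This is why `get_media_duration()` and the manual chunk/stride tuning could be deleted: native `transcribe()` already returns per-segment timestamps.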
@@ -66,85 +55,83 @@ def srt_to_vtt(srt_path):
     with open(srt_path, 'r', encoding='utf-8') as f:
         content = f.read()
 
-    # VTT Header
     vtt_content = "WEBVTT\n\n"
-
-    # Replace comma timestamps (00:00:01,000) with dot (00:00:01.000)
+    # Regex to convert SRT comma timestamps to VTT dot timestamps
     vtt_content += re.sub(r'(\d{2}:\d{2}:\d{2}),(\d{3})', r'\1.\2', content)
 
     with open(vtt_path, 'w', encoding='utf-8') as f:
         f.write(vtt_content)
-
     return vtt_path
 
 # ---------------------------------------------------------
-#
+# Logic 1: Video to SRT (Using Native Whisper)
 # ---------------------------------------------------------
-def split_text_into_lines(text, max_chars=80):
-def create_srt_segments(chunks, total_video_duration):
+def video_to_srt(video_path, progress=gr.Progress()):
+    if video_path is None: return None, None
+
+    # 1. Extract Audio
+    progress(0.1, desc="Extracting Audio...")
+    try:
+        audio_path = extract_audio(video_path)
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+
+    # 2. Transcribe using Native Whisper
+    progress(0.3, desc=f"Transcribing with Whisper {WHISPER_MODEL_SIZE}...")
+
+    # The native transcribe function handles segmentation automatically!
+    result = whisper_model.transcribe(audio_path, language="en")
+
+    # 3. Format to SRT
+    progress(0.8, desc="Formatting SRT...")
     srt_subtitles = []
-    for chunk in chunks:
-        text = chunk["text"].strip()
-        timestamp = chunk.get("timestamp")
-        if isinstance(timestamp, (list, tuple)):
-            start_time, end_time = timestamp
-        else:
-            start_time, end_time = 0.0, None
-
-        if end_time is None:
-            end_time = total_video_duration
-
-        lines = split_text_into_lines(text, max_chars=80)
-        duration = end_time - start_time
-        if duration <= 0: duration = 5.0
-
-        step = duration / len(lines) if lines else 0
-        current_start = start_time
+
+    for i, segment in enumerate(result["segments"]):
+        start_seconds = segment["start"]
+        end_seconds = segment["end"]
+        text = segment["text"].strip()
 
+        srt_subtitles.append(
+            srt.Subtitle(
+                index=i+1,
+                start=timedelta(seconds=start_seconds),
+                end=timedelta(seconds=end_seconds),
+                content=text
             )
+        )
+
+    srt_path = "generated_captions.srt"
+    with open(srt_path, 'w', encoding='utf-8') as f:
+        f.write(srt.compose(srt_subtitles))
+
+    # 4. Create Preview
+    vtt_path = srt_to_vtt(srt_path)
+
+    html_preview = f"""
+    <h3>Video Preview</h3>
+    <video controls width="100%" height="400px" style="background:black">
+        <source src="/file={video_path}" type="video/mp4">
+        <track kind="captions" src="/file={vtt_path}" srclang="en" label="English" default>
+        Your browser does not support the video tag.
+    </video>
+    """
+    return srt_path, html_preview
 
 # ---------------------------------------------------------
-# Logic
+# Logic 2: Translation (NLLB)
 # ---------------------------------------------------------
 def batch_translate(texts, src_lang, tgt_lang, batch_size=8):
     results = []
     tokenizer_nllb.src_lang = src_lang
 
     for i in range(0, len(texts), batch_size):
         batch = texts[i : i + batch_size]
         inputs = tokenizer_nllb(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
         forced_bos_token_id = tokenizer_nllb.convert_tokens_to_ids(tgt_lang)
+
         with torch.no_grad():
             generated_tokens = model_nllb.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=512)
+
        results.extend(tokenizer_nllb.batch_decode(generated_tokens, skip_special_tokens=True))
     return results
 
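The rewritten `video_to_srt` leans on the `srt` package to format timestamps, and `srt_to_vtt` then only has to swap the millisecond separator for browser `<track>` playback. A self-contained check with one made-up subtitle:

```python
import re
import srt
from datetime import timedelta

# One hypothetical subtitle, built the same way as in video_to_srt.
subs = [srt.Subtitle(index=1,
                     start=timedelta(seconds=0.0),
                     end=timedelta(seconds=2.5),
                     content="Hello there.")]

srt_text = srt.compose(subs)
print(srt_text)  # index line, "00:00:00,000 --> 00:00:02,500", then the text

# The same comma-to-dot rewrite srt_to_vtt applies:
vtt_text = "WEBVTT\n\n" + re.sub(r'(\d{2}:\d{2}:\d{2}),(\d{3})', r'\1.\2', srt_text)
print(vtt_text.splitlines()[3])  # "00:00:00.000 --> 00:00:02.500"
```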
@@ -165,58 +152,13 @@ def process_translation(filepath, src_lang_code, tgt_lang_code):
     out_path = "translated_subtitles.srt"
     with open(out_path, 'w', encoding='utf-8') as f:
         f.write(srt.compose(subtitles))
-
     return out_path
 
-# ---------------------------------------------------------
-# Logic 2: Video to SRT + Preview
-# ---------------------------------------------------------
-def video_to_srt(video_path, progress=gr.Progress()):
-    if video_path is None: return None, None
-
-    # 1. Audio & Duration
-    progress(0.1, desc="Extracting Audio...")
-    try:
-        audio_path = extract_audio(video_path)
-        duration = get_media_duration(audio_path)
-    except Exception as e:
-        return None, f"Error: {str(e)}"
-
-    # 2. Transcribe
-    progress(0.3, desc="Transcribing...")
-    outputs = whisper_pipe(audio_path, return_timestamps=True, generate_kwargs={"language": "english"})
-    chunks = outputs.get("chunks", [])
-    if not chunks: chunks = [{"text": outputs.get("text", ""), "timestamp": (0.0, None)}]
-
-    # 3. Format SRT
-    progress(0.8, desc="Formatting...")
-    srt_subtitles = create_srt_segments(chunks, duration)
-
-    srt_path = "generated_captions.srt"
-    with open(srt_path, 'w', encoding='utf-8') as f:
-        f.write(srt.compose(srt_subtitles))
-
-    # 4. Create Preview (HTML + VTT)
-    vtt_path = srt_to_vtt(srt_path)
-
-    # Create the HTML player
-    html_preview = f"""
-    <h3>Video Preview</h3>
-    <video controls width="100%" height="400px" style="background:black">
-        <source src="/file={video_path}" type="video/mp4">
-        <track kind="captions" src="/file={vtt_path}" srclang="en" label="English" default>
-        Your browser does not support the video tag.
-    </video>
-    <p style="margin-top:10px; color: #666;">Note: Subtitles are overlaid for preview only. They are not burned into the video.</p>
-    """
-
-    return srt_path, html_preview
-
 # ---------------------------------------------------------
 # Gradio Interface
 # ---------------------------------------------------------
 with gr.Blocks(title="SRT Master Tool") as demo:
-    gr.Markdown("# 🎬 Auto Subtitle & Translator")
+    gr.Markdown(f"# 🎬 Auto Subtitle (Whisper {WHISPER_MODEL_SIZE}) & Translator")
 
     with gr.Tabs():
         # --- TAB 1 ---
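`batch_translate` itself is unchanged apart from spacing; it still expects NLLB's FLORES-200 language codes, with the target code forced as the first generated token. A hypothetical call:

```python
# eng_Latn / spa_Latn are standard FLORES-200 codes used by NLLB.
lines = ["Hello, how are you?", "The movie starts at nine."]
translated = batch_translate(lines, src_lang="eng_Latn", tgt_lang="spa_Latn")
# Returns one translated string per input line, in order.
```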
@@ -224,7 +166,6 @@ with gr.Blocks(title="SRT Master Tool") as demo:
         gr.Markdown("### 1. Upload Video -> 2. Check Preview -> 3. Download SRT")
         with gr.Row():
             video_input = gr.Video(label="Upload Video", sources=["upload"])
-
             with gr.Column():
                 preview_output = gr.HTML(label="Preview Player")
                 srt_output_gen = gr.File(label="Download Generated SRT")
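Since the Gradio wiring is otherwise unchanged, the rewritten path can be smoke-tested without the UI; a hypothetical run, assuming a `sample.mp4` exists and ffmpeg is on PATH:

```python
srt_path, html_preview = video_to_srt("sample.mp4")
print(srt_path)  # "generated_captions.srt" on success; None plus an error string on failure
```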