import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import srt
import torch
import os
from datetime import timedelta
import subprocess
import re

# --- Configuration ---
# Translation Model (NLLB)
TRANSLATION_MODEL = "facebook/nllb-200-distilled-1.3B"
# Whisper Model Size: "medium" is the best balance for CPU.
# You can change to "large" or "large-v3" but it will be 2x slower.
WHISPER_MODEL_SIZE = "medium"

print("Loading Models...")

# --- Load Translation Model (NLLB) ---
tokenizer_nllb = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)
model_nllb = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)

# --- Load Audio Model (Official OpenAI Whisper) ---
# This downloads the model weights to the container on first run.
print(f"Loading Whisper '{WHISPER_MODEL_SIZE}' model...")
whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device="cpu")

print("Models Loaded Successfully!")


# ---------------------------------------------------------
# Helper: Extract Audio
# ---------------------------------------------------------
def extract_audio(video_path):
    """Extract the audio track of *video_path* to a temporary MP3.

    Returns the path of the written MP3 file.
    Raises subprocess.CalledProcessError if ffmpeg exits non-zero
    (check=True below).
    """
    output_audio_path = "temp_audio.mp3"
    # Remove any stale file from a previous run (ffmpeg -y would also
    # overwrite, but this keeps the pre-existing behavior explicit).
    if os.path.exists(output_audio_path):
        os.remove(output_audio_path)
    # Simple FFMPEG extraction: -vn drops the video stream, audio is
    # re-encoded with the LAME MP3 encoder.
    command = [
        "ffmpeg",
        "-i", video_path,
        "-vn",
        "-acodec", "libmp3lame",
        "-y",
        output_audio_path,
    ]
    subprocess.run(command, check=True)
    return output_audio_path


# ---------------------------------------------------------
# Helper: VTT Converter (For Browser Preview)
# ---------------------------------------------------------
def srt_to_vtt(srt_path):
    """Converts SRT to VTT format for the HTML5 video player.

    Returns the path of the written .vtt file (same basename as the
    input, with the final extension swapped to ".vtt").
    """
    # FIX: os.path.splitext swaps only the FINAL extension. The previous
    # srt_path.replace(".srt", ".vtt") would also corrupt a ".srt"
    # occurring earlier in the path (e.g. "my.srt.folder/out.srt").
    vtt_path = os.path.splitext(srt_path)[0] + ".vtt"
    with open(srt_path, 'r', encoding='utf-8') as f:
        content = f.read()
    vtt_content = "WEBVTT\n\n"
    # Regex to convert SRT comma timestamps (HH:MM:SS,mmm) to VTT dot
    # timestamps (HH:MM:SS.mmm).
    vtt_content += re.sub(r'(\d{2}:\d{2}:\d{2}),(\d{3})', r'\1.\2', content)
    with open(vtt_path, 'w', encoding='utf-8') as f:
        f.write(vtt_content)
    return vtt_path

# 
--------------------------------------------------------- # Logic 1: Video to SRT (Using Native Whisper) # --------------------------------------------------------- def video_to_srt(video_path, progress=gr.Progress()): if video_path is None: return None, None # 1. Extract Audio progress(0.1, desc="Extracting Audio...") try: audio_path = extract_audio(video_path) except Exception as e: return None, f"Error: {str(e)}" # 2. Transcribe using Native Whisper progress(0.3, desc=f"Transcribing with Whisper {WHISPER_MODEL_SIZE}...") # The native transcribe function handles segmentation automatically! result = whisper_model.transcribe(audio_path, language="en") # 3. Format to SRT progress(0.8, desc="Formatting SRT...") srt_subtitles = [] for i, segment in enumerate(result["segments"]): start_seconds = segment["start"] end_seconds = segment["end"] text = segment["text"].strip() srt_subtitles.append( srt.Subtitle( index=i+1, start=timedelta(seconds=start_seconds), end=timedelta(seconds=end_seconds), content=text ) ) srt_path = "generated_captions.srt" with open(srt_path, 'w', encoding='utf-8') as f: f.write(srt.compose(srt_subtitles)) # 4. Create Preview vtt_path = srt_to_vtt(srt_path) html_preview = f"""