import os
import time

import gradio as gr
import google.generativeai as genai
from phi.agent import Agent
from phi.model.google import Gemini
from google.generativeai import upload_file, get_file
from dotenv import load_dotenv
from phi.model.groq import Groq

# --- Load API Key ---
load_dotenv()
API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")

genai.configure(api_key=API_KEY)


# --- Fix SRT format ---
def _is_cue_index(lines, i):
    """Return True when ``lines[i]`` is an SRT cue number.

    A digit-only line is only a cue number if the next non-blank line is a
    timing line (contains ``-->``). Without this check a spoken word that is
    itself a number (e.g. "5") would be mistaken for a cue number.
    """
    if not lines[i].strip().isdigit():
        return False
    for j in range(i + 1, len(lines)):
        if lines[j].strip():
            return "-->" in lines[j]
    # Nothing follows: this is trailing numeric text, not the start of a cue.
    return False


def fix_srt_format(input_file, output_file):
    """Copy an SRT file, inserting the blank separator line between cues.

    Args:
        input_file: Path of the UTF-8 SRT text to read.
        output_file: Path the normalized UTF-8 SRT text is written to.

    A blank line is inserted before every cue number whose preceding line is
    not already blank. All other lines are copied through unchanged.
    """
    with open(input_file, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    fixed_lines = []
    for i, line in enumerate(lines):
        if i > 0 and lines[i - 1].strip() and _is_cue_index(lines, i):
            fixed_lines.append("\n")
        fixed_lines.append(line)

    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(fixed_lines)


# --- Initialize Gemini Agent ---
def initialize_agent():
    """Build the phi ``Agent`` backed by Gemini that produces the subtitles."""
    return Agent(
        name="Video AI Subtitle Generator",
        model=Gemini(id="gemini-2.0-flash-exp"),
        # model=Groq(id="llama-3.3-70b-versatile"),
        markdown=True,
    )


# --- Prompt Template ---
subtitle_prompt_tuned = '''
You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when each word is spoken.

Please follow these instructions strictly:

1. For **every single spoken word**, include:
   - A unique **line number**
   - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
   - The **spoken word** on the next line (exactly one word per block)

2. Do **not** include more or fewer than one word per timestamp.
3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
6. Maintain the exact **chronological order** as spoken in the video.

***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for each word**.

Example format:

1
00:00:01,000 --> 00:00:01,300
Hello

2
00:00:01,310 --> 00:00:01,600
everyone

3
00:00:01,610 --> 00:00:01,900
welcome

...

Only output the transcription in the above format (like **SRT** file format). Do not return any additional text.
'''


def _strip_code_fences(text):
    """Drop Markdown code-fence lines (```...) from the model output.

    The agent is created with ``markdown=True``, so the reply may come back
    wrapped in a fenced block; those fence lines are not valid SRT content.
    """
    kept = [ln for ln in text.splitlines() if not ln.strip().startswith("```")]
    return "\n".join(kept).strip()


# --- Gradio Interface Function ---
def generate_subtitles(video):
    """Generate an SRT subtitle file for an uploaded video.

    Args:
        video: Path of the video file handed over by the Gradio component;
            a falsy value means nothing was uploaded.

    Returns:
        Path of the normalized ``.srt`` file, or ``None`` when no video was
        provided.

    Raises:
        gr.Error: If the uploaded file ends in the ``FAILED`` state or the
            model returns no content.
    """
    if not video:
        return None

    video_path = video
    # NOTE(review): fixed file names in the working directory are shared by
    # all requests; concurrent users could overwrite each other's output.
    output_txt = "raw_subtitles.srt"
    output_fixed = "output_subtitles.srt"

    agent = initialize_agent()

    print("[INFO] Uploading video...")
    uploaded_video = upload_file(video_path)
    # The uploaded file is not usable until it leaves the PROCESSING state.
    while uploaded_video.state.name == "PROCESSING":
        time.sleep(1)
        uploaded_video = get_file(uploaded_video.name)
    if uploaded_video.state.name == "FAILED":
        raise gr.Error("Video processing failed; please try a different file.")

    print("[INFO] Generating subtitles...")
    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
    if not response.content:
        raise gr.Error("The model returned no transcription for this video.")
    raw_text = _strip_code_fences(response.content.strip())

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(raw_text)

    fix_srt_format(output_txt, output_fixed)
    return output_fixed


# --- Launch Gradio App ---
demo = gr.Interface(
    fn=generate_subtitles,
    inputs=gr.Video(label="Upload MP4 Video"),
    outputs=gr.File(label="Download .srt Subtitle File"),
    title="Subtitle Generator",
    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
)

if __name__ == "__main__":
    demo.launch(share=True)