subashpoudel's picture
Update app.py
8e937fe verified
import os
import tempfile
import time

import gradio as gr
import google.generativeai as genai
from phi.agent import Agent
from phi.model.google import Gemini
from google.generativeai import upload_file, get_file
from dotenv import load_dotenv
from phi.model.groq import Groq
# --- Load API Key ---
# load_dotenv() runs first so the lookup below can see whatever it loads.
load_dotenv()

API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY:
    # Hand the key to the google.generativeai client once, at import time.
    genai.configure(api_key=API_KEY)
else:
    # Fail fast: nothing below can work without a key.
    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
# --- Fix SRT format ---
def fix_srt_format(input_file, output_file):
    """Ensure every SRT cue is separated from the previous one by a blank line.

    The file is read completely, then written back with a blank line inserted
    in front of each cue index that directly follows non-blank text.

    A line counts as a cue index only when it consists solely of digits AND
    the line after it is a timestamp line (contains ``-->``). The second
    condition matters because the subtitle text itself may be a number
    (e.g. the spoken word "5"); without it a blank line would be inserted
    between a timestamp and its text, corrupting the cue.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path the corrected text is written to. It may be the
            same path as ``input_file`` because the input is fully read
            before the output is opened.
    """
    with open(input_file, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    fixed_lines = []
    for i, line in enumerate(lines):
        is_cue_index = (
            line.strip().isdigit()
            and i + 1 < len(lines)
            and "-->" in lines[i + 1]
        )
        # Only add a separator when the previous line is not already blank,
        # so correctly formatted input passes through unchanged.
        if is_cue_index and i > 0 and lines[i - 1].strip() != "":
            fixed_lines.append("\n")
        fixed_lines.append(line)

    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(fixed_lines)
# --- Initialize Gemini Agent ---
def initialize_agent():
    """Create and return a new Agent backed by the Gemini model.

    Returns:
        An ``Agent`` named "Video AI Subtitle Generator" configured with
        ``markdown=True`` and the ``gemini-2.0-flash-exp`` model.
    """
    gemini_model = Gemini(id="gemini-2.0-flash-exp")
    # Alternative backend left by the author for reference:
    #   Groq(id="llama-3.3-70b-versatile")
    subtitle_agent = Agent(
        name="Video AI Subtitle Generator",
        model=gemini_model,
        markdown=True,
    )
    return subtitle_agent
# --- Prompt Template ---
# Instruction text passed verbatim to agent.run() by generate_subtitles().
# It asks for a word-level transcription: one spoken word per numbered block,
# each with an "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamp line, laid out like
# an SRT file and with no extra commentary.
# The text is runtime data sent to the model; edit it only deliberately.
subtitle_prompt_tuned = '''
You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when each word is spoken.
Please follow these instructions strictly:
1. For **every single spoken word**, include:
- A unique **line number**
- The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
- The **spoken word** on the next line (exactly one word per block)
2. Do **not** include more or fewer than one word per timestamp.
3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
6. Maintain the exact **chronological order** as spoken in the video.
***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for each word**.
Example format:
1
00:00:01,000 --> 00:00:01,300
Hello
2
00:00:01,310 --> 00:00:01,600
everyone
3
00:00:01,610 --> 00:00:01,900
welcome
...
Only output the transcription in the above format (like **SRT** file format). Do not return any additional text.
'''
# --- Gradio Interface Function ---
def _strip_code_fence(text):
    """Return *text* stripped of a surrounding Markdown code fence, if any."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the opening fence line (which may carry a language tag such as
    # "```srt") and, when present, the closing fence line.
    body = stripped.splitlines()[1:]
    if body and body[-1].strip() == "```":
        body = body[:-1]
    return "\n".join(body).strip()


def generate_subtitles(video):
    """Gradio callback: transcribe *video* into an SRT file and return its path.

    The video is uploaded with ``upload_file``, polled with ``get_file`` until
    it leaves the ``PROCESSING`` state, and then passed to the agent together
    with ``subtitle_prompt_tuned``. The model's reply is written to disk and
    normalised with ``fix_srt_format``.

    Args:
        video: Path of the uploaded video file, or a falsy value when the
            user submitted nothing.

    Returns:
        Path of the generated ``output_subtitles.srt`` file, or ``None`` when
        no video was supplied.

    Raises:
        gr.Error: If the uploaded file ends in the ``FAILED`` state.
    """
    if not video:
        return None

    agent = initialize_agent()

    print("[INFO] Uploading video...")
    uploaded_video = upload_file(video)
    while uploaded_video.state.name == "PROCESSING":
        time.sleep(1)
        uploaded_video = get_file(uploaded_video.name)
    # The loop also exits when processing failed; surface that to the user
    # instead of handing an unusable file to the model.
    if uploaded_video.state.name == "FAILED":
        raise gr.Error("Video processing failed. Please try a different file.")

    print("[INFO] Generating subtitles...")
    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
    # The content may be missing, and a Markdown code fence around the reply
    # would otherwise end up inside the .srt file.
    raw_text = _strip_code_fence(response.content or "")

    # Write into a per-request directory: fixed names in the working directory
    # would let concurrent requests overwrite each other's results.
    work_dir = tempfile.mkdtemp(prefix="subtitles_")
    output_txt = os.path.join(work_dir, "raw_subtitles.srt")
    output_fixed = os.path.join(work_dir, "output_subtitles.srt")

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(raw_text)
    fix_srt_format(output_txt, output_fixed)
    return output_fixed
# --- Launch Gradio App ---
# One video input, one downloadable file output, wired to generate_subtitles.
demo = gr.Interface(
    title="Subtitle Generator",
    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
    fn=generate_subtitles,
    inputs=gr.Video(label="Upload MP4 Video"),
    outputs=gr.File(label="Download .srt Subtitle File"),
)

# Start the app only when this file is executed directly, not on import.
if __name__ == "__main__":
    demo.launch(share=True)