Spaces:

subashpoudel
/

Subtitle-Generator

Sleeping

File size: 3,692 Bytes

b4ba70c
 
 
 
 
 
 
 
9017ae5
b4ba70c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1281f7f
 
b4ba70c
 
 
 
 
8e937fe
b4ba70c
 
 
8e937fe
b4ba70c
 
8e937fe
 
b4ba70c
 
 
 
 
8e937fe
b4ba70c
 
 
 
8e937fe
 
b4ba70c
 
8e937fe
 
 
 
 
 
b4ba70c
 
 
9017ae5
b4ba70c

import os
import time
import gradio as gr
import google.generativeai as genai
from phi.agent import Agent
from phi.model.google import Gemini
from google.generativeai import upload_file, get_file
from dotenv import load_dotenv
from phi.model.groq import Groq

# --- Load API Key ---
# Let python-dotenv populate the environment first, then read the key from it.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY is None or API_KEY == "":
    # Fail at import time: nothing below can work without credentials.
    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")

genai.configure(api_key=API_KEY)

# --- Fix SRT format ---
def fix_srt_format(input_file, output_file):
    """Insert the blank separator line SRT requires between consecutive cues.

    A line is treated as a cue index only when it is purely numeric AND the
    line right after it is a timestamp line (contains ``-->``). Checking the
    following line keeps numeric subtitle text (e.g. the spoken word "5")
    from being mistaken for an index, which would otherwise split the cue
    between its timestamp and its text.

    Args:
        input_file: Path of the raw subtitle text to read (UTF-8).
        output_file: Path the corrected subtitle text is written to (UTF-8).
            May be the same path as ``input_file``; the input is read fully
            before the output is opened.

    Returns:
        None. The result is written to ``output_file``.
    """
    with open(input_file, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    fixed_lines = []
    for i, line in enumerate(lines):
        is_cue_index = (
            line.strip().isdigit()
            and i + 1 < len(lines)
            and "-->" in lines[i + 1]
        )
        # Only add a separator when the previous line is not already blank;
        # the very first cue never needs one.
        if is_cue_index and i > 0 and lines[i - 1].strip() != "":
            fixed_lines.append("\n")
        fixed_lines.append(line)

    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(fixed_lines)

# --- Initialize Gemini Agent ---
def initialize_agent():
    """Create the agent used to transcribe videos.

    Returns:
        An ``Agent`` named "Video AI Subtitle Generator" that uses the
        ``gemini-2.0-flash-exp`` Gemini model with ``markdown=True``.
    """
    agent_name = "Video AI Subtitle Generator"
    # Alternative backend kept for reference: Groq(id="llama-3.3-70b-versatile")
    gemini_model = Gemini(id="gemini-2.0-flash-exp")
    subtitle_agent = Agent(name=agent_name, model=gemini_model, markdown=True)
    return subtitle_agent

# --- Prompt Template ---
# Instruction text passed to the agent together with the uploaded video in
# generate_subtitles(). It asks for one spoken word per numbered block in an
# SRT-like layout (index, "start --> end" timestamp, word).
# Everything between the triple quotes, including trailing spaces, is part of
# the prompt sent at runtime.
subtitle_prompt_tuned = '''
You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when each word is spoken.

Please follow these instructions strictly:

1. For **every single spoken word**, include:
   - A unique **line number**
   - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
   - The **spoken word** on the next line (exactly one word per block)
2. Do **not** include more or fewer than one word per timestamp.
3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
6. Maintain the exact **chronological order** as spoken in the video.

***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for each word**.

Example format:

1  
00:00:01,000 --> 00:00:01,300  
Hello  

2  
00:00:01,310 --> 00:00:01,600  
everyone  

3  
00:00:01,610 --> 00:00:01,900  
welcome  

...

Only output the transcription in the above format (like **SRT** file format). Do not return any additional text.
'''

# --- Gradio Interface Function ---
def generate_subtitles(video):
    """Transcribe an uploaded video into a word-level .srt file.

    The video is uploaded through the Gemini file API, the agent is prompted
    with ``subtitle_prompt_tuned``, and the reply is written to disk and run
    through ``fix_srt_format``.

    Args:
        video: Path of the uploaded video as handed over by the Gradio
            ``Video`` input; falsy when nothing was uploaded.

    Returns:
        Path of the generated ``output_subtitles.srt`` file, or ``None`` when
        no video was supplied.

    Raises:
        gr.Error: If the uploaded file ends in the ``FAILED`` state or the
            model reply contains no text.
    """
    import tempfile  # local import: only this function needs it

    if not video:
        return None

    agent = initialize_agent()

    print("[INFO] Uploading video...")
    uploaded_video = upload_file(video)

    # The uploaded file is not usable while the service is still processing it.
    while uploaded_video.state.name == "PROCESSING":
        time.sleep(1)
        uploaded_video = get_file(uploaded_video.name)

    # Leaving the loop does not imply success; stop before prompting the model
    # with a file that failed processing.
    if uploaded_video.state.name == "FAILED":
        raise gr.Error("Video processing failed. Please try another file.")

    print("[INFO] Generating subtitles...")
    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
    # content may be None when the model produced no reply.
    raw_text = (response.content or "").strip()

    # The agent is created with markdown=True, so the reply may arrive wrapped
    # in a Markdown code fence; drop the fence lines so they don't end up in
    # the subtitle file.
    reply_lines = raw_text.splitlines()
    if reply_lines and reply_lines[0].lstrip().startswith("```"):
        reply_lines = reply_lines[1:]
    if reply_lines and reply_lines[-1].strip() == "```":
        reply_lines = reply_lines[:-1]
    raw_text = "\n".join(reply_lines).strip()

    if not raw_text:
        raise gr.Error("The model returned no transcription for this video.")

    # Use a fresh directory per request: fixed file names in the working
    # directory would let concurrent requests overwrite each other's output.
    # The directory is intentionally not removed here because the returned
    # file must still exist after this function returns.
    work_dir = tempfile.mkdtemp(prefix="subtitles_")
    output_txt = os.path.join(work_dir, "raw_subtitles.srt")
    output_fixed = os.path.join(work_dir, "output_subtitles.srt")

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(raw_text)

    fix_srt_format(output_txt, output_fixed)
    return output_fixed

# --- Launch Gradio App ---
# One video input wired to generate_subtitles, one downloadable file output.
demo = gr.Interface(
    title="Subtitle Generator",
    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
    fn=generate_subtitles,
    inputs=gr.Video(label="Upload MP4 Video"),
    outputs=gr.File(label="Download .srt Subtitle File"),
)

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch(share=True)