import os
import time
import gradio as gr
import google.generativeai as genai
from phi.agent import Agent
from phi.model.google import Gemini
from google.generativeai import upload_file, get_file
from dotenv import load_dotenv
from phi.model.groq import Groq

# --- Load API Key ---
# Run python-dotenv first so the lookup below sees whatever it loaded.
load_dotenv()
API_KEY = os.environ.get("GOOGLE_API_KEY")
if API_KEY:
    # Key is present and non-empty: hand it to the Google Generative AI SDK.
    genai.configure(api_key=API_KEY)
else:
    # Fail at import time rather than on the first request.
    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")

# --- Fix SRT format ---
def fix_srt_format(input_file, output_file):
    """Insert the blank separator lines that SRT needs between cues.

    Reads ``input_file`` as UTF-8 and writes a copy to ``output_file`` in
    which every cue index line is preceded by an empty line, unless it is
    the first line of the file or the previous line is already blank.

    A line counts as a cue index only when it consists solely of digits
    AND the line right after it is a timing line (contains ``-->``).
    Checking the following line keeps a purely numeric subtitle text
    (for example the spoken word "5") attached to its own timing line
    instead of being split off by a spurious blank line.

    Args:
        input_file: Path of the raw subtitle text to read.
        output_file: Path the corrected subtitle text is written to.
    """
    with open(input_file, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    fixed_lines = []
    for i, line in enumerate(lines):
        # Digits alone are ambiguous (index vs. spoken number); the timing
        # line that must follow a real index disambiguates.
        is_cue_index = (
            line.strip().isdigit()
            and i + 1 < len(lines)
            and "-->" in lines[i + 1]
        )
        if is_cue_index and i > 0 and lines[i - 1].strip():
            fixed_lines.append("\n")
        fixed_lines.append(line)

    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.writelines(fixed_lines)

# --- Initialize Gemini Agent ---
def initialize_agent():
    """Build a fresh agent for subtitle extraction.

    Returns:
        An ``Agent`` named "Video AI Subtitle Generator" that uses the
        Gemini model "gemini-2.0-flash-exp" and has ``markdown`` enabled.
    """
    agent_name = "Video AI Subtitle Generator"
    # Alternative backend kept for reference:
    # Groq(id="llama-3.3-70b-versatile")
    gemini_model = Gemini(id="gemini-2.0-flash-exp")
    subtitle_agent = Agent(
        name=agent_name,
        model=gemini_model,
        markdown=True,
    )
    return subtitle_agent

# --- Prompt Template ---
# Instruction text passed unchanged as the message to agent.run() in
# generate_subtitles(). It asks the model for one spoken word per numbered
# block with an "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line, i.e. an
# SRT-like layout. The example blocks below end each line with two trailing
# spaces; they are part of the string, so keep them when editing.
subtitle_prompt_tuned = '''
You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when each word is spoken.

Please follow these instructions strictly:

1. For **every single spoken word**, include:
   - A unique **line number**
   - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
   - The **spoken word** on the next line (exactly one word per block)
2. Do **not** include more or fewer than one word per timestamp.
3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
6. Maintain the exact **chronological order** as spoken in the video.

***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for each word**.

Example format:

1  
00:00:01,000 --> 00:00:01,300  
Hello  

2  
00:00:01,310 --> 00:00:01,600  
everyone  

3  
00:00:01,610 --> 00:00:01,900  
welcome  

...

Only output the transcription in the above format (like **SRT** file format). Do not return any additional text.
'''

# --- Gradio Interface Function ---
def generate_subtitles(video):
    """Generate an .srt subtitle file for an uploaded video.

    Uploads the video through the Gemini Files API, waits until it is no
    longer in the PROCESSING state, asks the agent for a word-level
    transcription, writes the reply to ``raw_subtitles.srt`` and returns
    the path of the cleaned-up copy produced by ``fix_srt_format``.

    Args:
        video: Path of the video file as supplied by the Gradio input, or
            a falsy value when nothing was uploaded.

    Returns:
        ``"output_subtitles.srt"`` on success, or ``None`` when ``video``
        is falsy.

    Raises:
        gr.Error: If the uploaded file ends in the FAILED state or the
            model returns no content.
    """
    if not video:
        return None

    raw_srt_path = "raw_subtitles.srt"
    fixed_srt_path = "output_subtitles.srt"

    agent = initialize_agent()

    print("[INFO] Uploading video...")
    uploaded_video = upload_file(video)

    try:
        while uploaded_video.state.name == "PROCESSING":
            time.sleep(1)
            uploaded_video = get_file(uploaded_video.name)

        # Leaving the PROCESSING state does not imply success; a FAILED
        # file cannot be used as model input.
        if uploaded_video.state.name == "FAILED":
            raise gr.Error("Video processing failed; please try another file.")

        print("[INFO] Generating subtitles...")
        response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
    finally:
        # Best-effort cleanup so uploads do not pile up in remote storage;
        # a failure here must not mask the real outcome of the request.
        try:
            genai.delete_file(uploaded_video.name)
        except Exception as exc:
            print(f"[WARN] Could not delete uploaded video: {exc}")

    if response.content is None:
        raise gr.Error("The model returned no subtitle text.")

    # The agent is configured with markdown=True, so the reply may be
    # wrapped in ``` fences; those lines are never valid SRT content.
    raw_text = "\n".join(
        line
        for line in response.content.strip().splitlines()
        if not line.strip().startswith("```")
    ).strip()

    with open(raw_srt_path, "w", encoding="utf-8") as f:
        f.write(raw_text)

    fix_srt_format(raw_srt_path, fixed_srt_path)
    return fixed_srt_path

# --- Launch Gradio App ---
# Single-function UI: one video input mapped to one downloadable file.
demo = gr.Interface(
    title="Subtitle Generator",
    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
    fn=generate_subtitles,
    # The input component is built before the output component.
    inputs=gr.Video(label="Upload MP4 Video"),
    outputs=gr.File(label="Download .srt Subtitle File"),
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(share=True)