Spaces:
Sleeping
Sleeping
| import os | |
| import time | |
| import gradio as gr | |
| import google.generativeai as genai | |
| from phi.agent import Agent | |
| from phi.model.google import Gemini | |
| from google.generativeai import upload_file, get_file | |
| from dotenv import load_dotenv | |
| from phi.model.groq import Groq | |
# --- Load API Key ---
# Module-level side effect: reads .env and configures the Gemini client at import time.
load_dotenv()  # pull variables from a local .env file into os.environ
API_KEY = os.getenv("GOOGLE_API_KEY")
if not API_KEY:
    # Fail fast: upload_file and the Gemini model below are unusable without the key.
    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
genai.configure(api_key=API_KEY)  # register the key with google.generativeai
| # --- Fix SRT format --- | |
| def fix_srt_format(input_file, output_file): | |
| with open(input_file, "r", encoding="utf-8") as infile: | |
| lines = infile.readlines() | |
| fixed_lines = [] | |
| for i, line in enumerate(lines): | |
| if line.strip().isdigit(): | |
| if i > 0 and lines[i - 1].strip() != "": | |
| fixed_lines.append("\n") | |
| fixed_lines.append(line) | |
| with open(output_file, "w", encoding="utf-8") as outfile: | |
| outfile.writelines(fixed_lines) | |
# --- Initialize Gemini Agent ---
def initialize_agent():
    """Build and return the phi Agent used for video transcription."""
    transcription_model = Gemini(id="gemini-2.0-flash-exp")
    # Alternative backend kept for experimentation:
    # model=Groq(id="llama-3.3-70b-versatile")
    return Agent(
        name="Video AI Subtitle Generator",
        model=transcription_model,
        markdown=True,
    )
# --- Prompt Template ---
# Sent verbatim to the model together with the uploaded video; demands a
# word-level, SRT-formatted transcription with per-word timestamps.
# NOTE: this string is runtime behavior — edits here change model output.
subtitle_prompt_tuned = '''
You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when each word is spoken.
Please follow these instructions strictly:
1. For **every single spoken word**, include:
- A unique **line number**
- The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
- The **spoken word** on the next line (exactly one word per block)
2. Do **not** include more or fewer than one word per timestamp.
3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
6. Maintain the exact **chronological order** as spoken in the video.
***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for each word**.
Example format:
1
00:00:01,000 --> 00:00:01,300
Hello
2
00:00:01,310 --> 00:00:01,600
everyone
3
00:00:01,610 --> 00:00:01,900
welcome
...
Only output the transcription in the above format (like **SRT** file format). Do not return any additional text.
'''
# --- Gradio Interface Function ---
def generate_subtitles(video):
    """Generate a word-level .srt subtitle file for an uploaded video.

    Args:
        video: Filesystem path to the uploaded video (as provided by
            gr.Video), or a falsy value when nothing was uploaded.

    Returns:
        Path to the repaired .srt file, or None when no video was supplied.

    Raises:
        RuntimeError: If the Gemini file upload ends in a FAILED state.
    """
    if not video:
        return None

    raw_path = "raw_subtitles.srt"
    fixed_path = "output_subtitles.srt"

    agent = initialize_agent()

    print("[INFO] Uploading video...")
    uploaded_video = upload_file(video)
    # Poll until Gemini finishes server-side processing of the upload.
    while uploaded_video.state.name == "PROCESSING":
        time.sleep(1)
        uploaded_video = get_file(uploaded_video.name)
    # Bug fix: the loop also exits when processing FAILED; previously the
    # failed file was passed straight to agent.run, causing an opaque
    # downstream error instead of a clear one.
    if uploaded_video.state.name == "FAILED":
        raise RuntimeError("Video upload failed during Gemini processing.")

    print("[INFO] Generating subtitles...")
    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
    raw_text = response.content.strip()

    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(raw_text)

    # Repair missing blank lines between cue blocks before handing back.
    fix_srt_format(raw_path, fixed_path)
    return fixed_path
# --- Launch Gradio App ---
# Single-input / single-output UI: a video file in, a downloadable .srt out.
demo = gr.Interface(
    fn=generate_subtitles,
    inputs=gr.Video(label="Upload MP4 Video"),
    outputs=gr.File(label="Download .srt Subtitle File"),
    title="Subtitle Generator",
    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
)
if __name__ == "__main__":
    # share=True exposes a temporary public URL (useful on hosted notebooks/Spaces).
    demo.launch(share=True)