Spaces:

subashpoudel
/

Subtitle-Generator

Sleeping

App Files Community

subashpoudel committed on May 11, 2025

Commit

b4ba70c

verified ·

1 Parent(s): f479ceb

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -111

app.py CHANGED Viewed

@@ -1,111 +1,111 @@
-import os
-import time
-import gradio as gr
-import google.generativeai as genai
-from phi.agent import Agent
-from phi.model.google import Gemini
-from google.generativeai import upload_file, get_file
-from dotenv import load_dotenv
-# --- Load API Key ---
-load_dotenv()
-API_KEY = os.getenv("GOOGLE_API_KEY")
-if not API_KEY:
-    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
-genai.configure(api_key=API_KEY)
-# --- Fix SRT format ---
-def fix_srt_format(input_file, output_file):
-    with open(input_file, "r", encoding="utf-8") as infile:
-        lines = infile.readlines()
-    fixed_lines = []
-    for i, line in enumerate(lines):
-        if line.strip().isdigit():
-            if i > 0 and lines[i - 1].strip() != "":
-                fixed_lines.append("\n")
-        fixed_lines.append(line)
-    with open(output_file, "w", encoding="utf-8") as outfile:
-        outfile.writelines(fixed_lines)
-# --- Initialize Gemini Agent ---
-def initialize_agent():
-    return Agent(
-        name="Video AI Subtitle Generator",
-        model=Gemini(id="gemini-2.0-flash-exp"),
-        markdown=True,
-    )
-# --- Prompt Template ---
-subtitle_prompt_tuned = '''
-You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when they are spoken.
-Please follow these instructions strictly:
-1. For **every three consecutive spoken words**, include:
-   - A unique **line number**
-   - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
-   - The **three spoken words** on the next line (exactly three words per block, separated by spaces)
-2. Do **not** include more or fewer than three words per timestamp.
-3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
-4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
-5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
-6. Maintain the exact **chronological order** as spoken in the video.
-***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for every group of three words**.
-Example format:
-1
-00:00:01,000 --> 00:00:01,800
-Hello everyone welcome
-2
-00:00:01,810 --> 00:00:02,600
-to this tutorial
-...
-Only output the transcription in the above format. Do not return any additional text.
-'''
-# --- Gradio Interface Function ---
-def generate_subtitles(video):
-    if not video:
-        return None
-    video_path = video
-    output_txt = "raw_subtitles.srt"
-    output_fixed = "output_subtitles.srt"
-    agent = initialize_agent()
-    print("[INFO] Uploading video...")
-    uploaded_video = upload_file(video_path)
-    while uploaded_video.state.name == "PROCESSING":
-        time.sleep(1)
-        uploaded_video = get_file(uploaded_video.name)
-    print("[INFO] Generating subtitles...")
-    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
-    raw_text = response.content.strip()
-    with open(output_txt, "w", encoding="utf-8") as f:
-        f.write(raw_text)
-    fix_srt_format(output_txt, output_fixed)
-    return output_fixed
-# --- Launch Gradio App ---
-demo = gr.Interface(
-    fn=generate_subtitles,
-    inputs=gr.Video(label="Upload MP4 Video"),
-    outputs=gr.File(label="Download .srt Subtitle File"),
-    title="Gemini Subtitle Generator",
-    description="Upload a video to extract precise subtitles using Gemini API. Output is a .srt file with exact timestamps.",
-)
-if __name__ == "__main__":
-    demo.launch(share=True)

+import os
+import time
+import gradio as gr
+import google.generativeai as genai
+from phi.agent import Agent
+from phi.model.google import Gemini
+from google.generativeai import upload_file, get_file
+from dotenv import load_dotenv
+# --- Load API Key ---
+load_dotenv()
+API_KEY = os.getenv("GOOGLE_API_KEY")
+if not API_KEY:
+    raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
+genai.configure(api_key=API_KEY)
+# --- Fix SRT format ---
+def fix_srt_format(input_file, output_file):
+    with open(input_file, "r", encoding="utf-8") as infile:
+        lines = infile.readlines()
+    fixed_lines = []
+    for i, line in enumerate(lines):
+        if line.strip().isdigit():
+            if i > 0 and lines[i - 1].strip() != "":
+                fixed_lines.append("\n")
+        fixed_lines.append(line)
+    with open(output_file, "w", encoding="utf-8") as outfile:
+        outfile.writelines(fixed_lines)
+# --- Initialize Gemini Agent ---
+def initialize_agent():
+    return Agent(
+        name="Video AI Subtitle Generator",
+        model=Gemini(id="gemini-2.0-flash-exp"),
+        markdown=True,
+    )
+# --- Prompt Template ---
+subtitle_prompt_tuned = '''
+You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when they are spoken.
+Please follow these instructions strictly:
+1. For **every three consecutive spoken words**, include:
+   - A unique **line number**
+   - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
+   - The **three spoken words** on the next line (exactly three words per block, separated by spaces)
+2. Do **not** include more or fewer than three words per timestamp.
+3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
+4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
+5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
+6. Maintain the exact **chronological order** as spoken in the video.
+***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for every group of three words**.
+Example format:
+1
+00:00:01,000 --> 00:00:01,800
+Hello everyone welcome
+2
+00:00:01,810 --> 00:00:02,600
+to this tutorial
+...
+Only output the transcription in the above format. Do not return any additional text.
+'''
+# --- Gradio Interface Function ---
+def generate_subtitles(video):
+    if not video:
+        return None
+    video_path = video
+    output_txt = "raw_subtitles.srt"
+    output_fixed = "output_subtitles.srt"
+    agent = initialize_agent()
+    print("[INFO] Uploading video...")
+    uploaded_video = upload_file(video_path)
+    while uploaded_video.state.name == "PROCESSING":
+        time.sleep(1)
+        uploaded_video = get_file(uploaded_video.name)
+    print("[INFO] Generating subtitles...")
+    response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
+    raw_text = response.content.strip()
+    with open(output_txt, "w", encoding="utf-8") as f:
+        f.write(raw_text)
+    fix_srt_format(output_txt, output_fixed)
+    return output_fixed
+# --- Launch Gradio App ---
+demo = gr.Interface(
+    fn=generate_subtitles,
+    inputs=gr.Video(label="Upload MP4 Video"),
+    outputs=gr.File(label="Download .srt Subtitle File"),
+    title="Subtitle Generator",
+    description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
+)
+if __name__ == "__main__":
+    demo.launch(share=True)