subashpoudel committed on
Commit
b4ba70c
·
verified ·
1 Parent(s): f479ceb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -111
app.py CHANGED
@@ -1,111 +1,111 @@
1
- import os
2
- import time
3
- import gradio as gr
4
- import google.generativeai as genai
5
- from phi.agent import Agent
6
- from phi.model.google import Gemini
7
- from google.generativeai import upload_file, get_file
8
- from dotenv import load_dotenv
9
-
10
- # --- Load API Key ---
11
- load_dotenv()
12
- API_KEY = os.getenv("GOOGLE_API_KEY")
13
- if not API_KEY:
14
- raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
15
- genai.configure(api_key=API_KEY)
16
-
17
- # --- Fix SRT format ---
18
- def fix_srt_format(input_file, output_file):
19
- with open(input_file, "r", encoding="utf-8") as infile:
20
- lines = infile.readlines()
21
-
22
- fixed_lines = []
23
- for i, line in enumerate(lines):
24
- if line.strip().isdigit():
25
- if i > 0 and lines[i - 1].strip() != "":
26
- fixed_lines.append("\n")
27
- fixed_lines.append(line)
28
-
29
- with open(output_file, "w", encoding="utf-8") as outfile:
30
- outfile.writelines(fixed_lines)
31
-
32
- # --- Initialize Gemini Agent ---
33
- def initialize_agent():
34
- return Agent(
35
- name="Video AI Subtitle Generator",
36
- model=Gemini(id="gemini-2.0-flash-exp"),
37
- markdown=True,
38
- )
39
-
40
- # --- Prompt Template ---
41
- subtitle_prompt_tuned = '''
42
- You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when they are spoken.
43
-
44
- Please follow these instructions strictly:
45
-
46
- 1. For **every three consecutive spoken words**, include:
47
- - A unique **line number**
48
- - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
49
- - The **three spoken words** on the next line (exactly three words per block, separated by spaces)
50
- 2. Do **not** include more or fewer than three words per timestamp.
51
- 3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
52
- 4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
53
- 5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
54
- 6. Maintain the exact **chronological order** as spoken in the video.
55
-
56
- ***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for every group of three words**.
57
-
58
- Example format:
59
-
60
- 1
61
- 00:00:01,000 --> 00:00:01,800
62
- Hello everyone welcome
63
-
64
- 2
65
- 00:00:01,810 --> 00:00:02,600
66
- to this tutorial
67
-
68
- ...
69
-
70
- Only output the transcription in the above format. Do not return any additional text.
71
- '''
72
-
73
- # --- Gradio Interface Function ---
74
- def generate_subtitles(video):
75
- if not video:
76
- return None
77
-
78
- video_path = video
79
- output_txt = "raw_subtitles.srt"
80
- output_fixed = "output_subtitles.srt"
81
-
82
- agent = initialize_agent()
83
-
84
- print("[INFO] Uploading video...")
85
- uploaded_video = upload_file(video_path)
86
-
87
- while uploaded_video.state.name == "PROCESSING":
88
- time.sleep(1)
89
- uploaded_video = get_file(uploaded_video.name)
90
-
91
- print("[INFO] Generating subtitles...")
92
- response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
93
- raw_text = response.content.strip()
94
-
95
- with open(output_txt, "w", encoding="utf-8") as f:
96
- f.write(raw_text)
97
-
98
- fix_srt_format(output_txt, output_fixed)
99
- return output_fixed
100
-
101
- # --- Launch Gradio App ---
102
- demo = gr.Interface(
103
- fn=generate_subtitles,
104
- inputs=gr.Video(label="Upload MP4 Video"),
105
- outputs=gr.File(label="Download .srt Subtitle File"),
106
- title="Gemini Subtitle Generator",
107
- description="Upload a video to extract precise subtitles using Gemini API. Output is a .srt file with exact timestamps.",
108
- )
109
-
110
- if __name__ == "__main__":
111
- demo.launch(share=True)
 
1
+ import os
2
+ import time
3
+ import gradio as gr
4
+ import google.generativeai as genai
5
+ from phi.agent import Agent
6
+ from phi.model.google import Gemini
7
+ from google.generativeai import upload_file, get_file
8
+ from dotenv import load_dotenv
9
+
10
+ # --- Load API Key ---
11
+ load_dotenv()
12
+ API_KEY = os.getenv("GOOGLE_API_KEY")
13
+ if not API_KEY:
14
+ raise EnvironmentError("GOOGLE_API_KEY not found in environment.")
15
+ genai.configure(api_key=API_KEY)
16
+
17
+ # --- Fix SRT format ---
18
+ def fix_srt_format(input_file, output_file):
19
+ with open(input_file, "r", encoding="utf-8") as infile:
20
+ lines = infile.readlines()
21
+
22
+ fixed_lines = []
23
+ for i, line in enumerate(lines):
24
+ if line.strip().isdigit():
25
+ if i > 0 and lines[i - 1].strip() != "":
26
+ fixed_lines.append("\n")
27
+ fixed_lines.append(line)
28
+
29
+ with open(output_file, "w", encoding="utf-8") as outfile:
30
+ outfile.writelines(fixed_lines)
31
+
32
+ # --- Initialize Gemini Agent ---
33
+ def initialize_agent():
34
+ return Agent(
35
+ name="Video AI Subtitle Generator",
36
+ model=Gemini(id="gemini-2.0-flash-exp"),
37
+ markdown=True,
38
+ )
39
+
40
+ # --- Prompt Template ---
41
+ subtitle_prompt_tuned = '''
42
+ You are given a video. Your task is to extract the **spoken words** along with the **exact timestamps** of when they are spoken.
43
+
44
+ Please follow these instructions strictly:
45
+
46
+ 1. For **every three consecutive spoken words**, include:
47
+ - A unique **line number**
48
+ - The **exact start time** and **end time** in the format: HH:MM:SS,mmm --> HH:MM:SS,mmm
49
+ - The **three spoken words** on the next line (exactly three words per block, separated by spaces)
50
+ 2. Do **not** include more or fewer than three words per timestamp.
51
+ 3. Do not summarize, paraphrase, or skip any spoken content — include **all spoken words verbatim**.
52
+ 4. Do **not** include any sound effects or non-verbal cues like [Music], [Laughter], etc.
53
+ 5. Your output must be a **raw transcription** — no extra formatting, no explanations, no commentary.
54
+ 6. Maintain the exact **chronological order** as spoken in the video.
55
+
56
+ ***FINAL AND CRITICAL REMINDER***: The **timestamp accuracy is the highest priority**. Focus on getting the **precise start and end time for every group of three words**.
57
+
58
+ Example format:
59
+
60
+ 1
61
+ 00:00:01,000 --> 00:00:01,800
62
+ Hello everyone welcome
63
+
64
+ 2
65
+ 00:00:01,810 --> 00:00:02,600
66
+ to this tutorial
67
+
68
+ ...
69
+
70
+ Only output the transcription in the above format. Do not return any additional text.
71
+ '''
72
+
73
+ # --- Gradio Interface Function ---
74
+ def generate_subtitles(video):
75
+ if not video:
76
+ return None
77
+
78
+ video_path = video
79
+ output_txt = "raw_subtitles.srt"
80
+ output_fixed = "output_subtitles.srt"
81
+
82
+ agent = initialize_agent()
83
+
84
+ print("[INFO] Uploading video...")
85
+ uploaded_video = upload_file(video_path)
86
+
87
+ while uploaded_video.state.name == "PROCESSING":
88
+ time.sleep(1)
89
+ uploaded_video = get_file(uploaded_video.name)
90
+
91
+ print("[INFO] Generating subtitles...")
92
+ response = agent.run(subtitle_prompt_tuned, videos=[uploaded_video])
93
+ raw_text = response.content.strip()
94
+
95
+ with open(output_txt, "w", encoding="utf-8") as f:
96
+ f.write(raw_text)
97
+
98
+ fix_srt_format(output_txt, output_fixed)
99
+ return output_fixed
100
+
101
+ # --- Launch Gradio App ---
102
+ demo = gr.Interface(
103
+ fn=generate_subtitles,
104
+ inputs=gr.Video(label="Upload MP4 Video"),
105
+ outputs=gr.File(label="Download .srt Subtitle File"),
106
+ title="Subtitle Generator",
107
+ description="Upload a video to extract precise subtitles using AI. Output is a .srt file with exact timestamps.",
108
+ )
109
+
110
+ if __name__ == "__main__":
111
+ demo.launch(share=True)