dure-waseem committed on
Commit
99c978a
·
1 Parent(s): a73cb84

initial code

Browse files
Files changed (2) hide show
  1. main.py +283 -0
  2. requirements.txt +6 -0
main.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # #!/usr/bin/env python3
2
+ # import cv2
3
+ # import base64
4
+ # import time
5
+ # import os
6
+ # import json
7
+ # import argparse
8
+ # import sys
9
+ # from openai import OpenAI
10
+ # from dotenv import load_dotenv
11
+ # from gtts import gTTS
12
+
13
+ # def main():
14
+ # # Set up argument parsing
15
+ # parser = argparse.ArgumentParser(description='Video Explanation Agent')
16
+ # parser.add_argument('--video-file', required=True, help='Path to the video file to process')
17
+ # parser.add_argument('--prompt-file', required=True, help='Path to the file containing the explanation prompt')
18
+ # args = parser.parse_args()
19
+
20
+ # # Check if files exist
21
+ # if not os.path.exists(args.video_file):
22
+ # print(json.dumps({
23
+ # "error": f"Video file not found: {args.video_file}"
24
+ # }))
25
+ # sys.exit(1)
26
+
27
+ # if not os.path.exists(args.prompt_file):
28
+ # print(json.dumps({
29
+ # "error": f"Prompt file not found: {args.prompt_file}"
30
+ # }))
31
+ # sys.exit(1)
32
+
33
+ # # Load environment variables from .env file
34
+ # load_dotenv()
35
+
36
+ # # Get the OpenAI API key from the environment
37
+ # api_key = os.getenv("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>")
38
+ # client = OpenAI(api_key=api_key)
39
+
40
+ # # Read the custom prompt from the file
41
+ # with open(args.prompt_file, 'r') as f:
42
+ # custom_prompt = f.read().strip()
43
+
44
+ # # Open the video file
45
+ # video = cv2.VideoCapture(args.video_file)
46
+ # if not video.isOpened():
47
+ # print(json.dumps({
48
+ # "error": f"Failed to open video file: {args.video_file}"
49
+ # }))
50
+ # sys.exit(1)
51
+
52
+ # # Extract frames from video
53
+ # base64Frames = []
54
+ # frame_count = 0
55
+ # total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
56
+
57
+ # # Calculate sampling rate to get around 20-30 frames total
58
+ # if total_frames > 0:
59
+ # sampling_rate = max(1, total_frames // 25)
60
+ # else:
61
+ # sampling_rate = 50 # Default fallback
62
+
63
+ # while video.isOpened():
64
+ # success, frame = video.read()
65
+ # if not success:
66
+ # break
67
+
68
+ # # Only take every Nth frame to reduce processing
69
+ # if frame_count % sampling_rate == 0:
70
+ # _, buffer = cv2.imencode(".jpg", frame)
71
+ # base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
72
+
73
+ # frame_count += 1
74
+
75
+ # video.release()
76
+ # print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.", file=sys.stderr)
77
+
78
+ # # Create the timestamp for unique filenames
79
+ # timestamp = int(time.time())
80
+ # # Create data directory inside the vidExp-agent directory
81
+ # output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
82
+ # os.makedirs(output_dir, exist_ok=True)
83
+
84
+ # # Generate explanation based on the custom prompt
85
+ # PROMPT_MESSAGES = [
86
+ # {
87
+ # "role": "user",
88
+ # "content": [
89
+ # f"{custom_prompt}",
90
+ # *map(lambda x: {"image": x, "resize": 768}, base64Frames),
91
+ # ],
92
+ # },
93
+ # ]
94
+
95
+ # params = {
96
+ # "model": "gpt-4o-mini",
97
+ # "messages": PROMPT_MESSAGES,
98
+ # "max_tokens": 500,
99
+ # }
100
+
101
+ # try:
102
+ # result = client.chat.completions.create(**params)
103
+ # explanation = result.choices[0].message.content
104
+ # print(f"Generated explanation based on provided prompt.", file=sys.stderr)
105
+ # except Exception as e:
106
+ # print(json.dumps({
107
+ # "error": f"Error generating explanation: {str(e)}"
108
+ # }))
109
+ # sys.exit(1)
110
+
111
+ # # Save the explanation as a text file
112
+ # explanation_file = os.path.join(output_dir, f"explanation_{timestamp}.txt")
113
+ # with open(explanation_file, "w") as f:
114
+ # f.write(explanation)
115
+
116
+ # # Generate audio from the explanation
117
+ # audio_filename = f"explanation_{timestamp}.mp3"
118
+ # audio_path = os.path.join(output_dir, audio_filename)
119
+
120
+ # try:
121
+ # # Default to English for TTS
122
+ # tts = gTTS(text=explanation, lang='en')
123
+ # tts.save(audio_path)
124
+ # print(f"Generated audio file.", file=sys.stderr)
125
+ # except Exception as e:
126
+ # print(json.dumps({
127
+ # "error": f"Error generating audio: {str(e)}"
128
+ # }))
129
+ # sys.exit(1)
130
+
131
+ # # Return the results as JSON
132
+ # result = {
133
+ # "success": True,
134
+ # "explanation": explanation,
135
+ # "explanationFilePath": explanation_file,
136
+ # "audioFilename": audio_filename,
137
+ # "audioFilePath": audio_path
138
+ # }
139
+
140
+ # print(json.dumps(result))
141
+
142
+ # if __name__ == "__main__":
143
+ # main()
144
+
145
+
146
+
147
+ import gradio as gr
148
+ import cv2
149
+ import base64
150
+ import time
151
+ import os
152
+ import json
153
+ import sys
154
+ from openai import OpenAI
155
+ from dotenv import load_dotenv
156
+ from gtts import gTTS
157
+ import tempfile # To handle temporary files for Gradio uploads
158
+
159
+ # Load environment variables from .env file (for local testing)
160
+ load_dotenv()
161
+
162
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Generate a text explanation and spoken audio for an uploaded video.

    Gradio callback: samples roughly 25 evenly spaced frames from the video,
    sends them (base64-encoded JPEG) together with the user's prompt to the
    OpenAI chat-completions vision API, then converts the reply to an MP3
    with gTTS.

    Args:
        video_file_path: Filesystem path to the uploaded video (from gr.File).
        prompt_text: User-supplied instruction describing what to explain.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        Tuple of (explanation_text, audio_file_path) on success, or
        (error_message, None) on any failure — Gradio renders both outputs.
    """
    # Prioritize the environment variable (e.g. a Hugging Face Space secret);
    # fall back to the key entered in the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    client = OpenAI(api_key=api_key)
    print(f"Video file path: {video_file_path}")
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return f"Error: Failed to open video file: {video_file_path}", None

    # Sample roughly 25 evenly spaced frames to keep the request small.
    base64Frames = []
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # CAP_PROP_FRAME_COUNT can report 0 for some containers/streams; fall
    # back to a fixed stride in that case.
    sampling_rate = max(1, total_frames // 25) if total_frames > 0 else 50

    frame_count = 0
    while True:
        success, frame = video.read()
        if not success:
            break
        # Only keep every Nth frame to reduce upload size and token cost.
        if frame_count % sampling_rate == 0:
            ok, buffer = cv2.imencode(".jpg", frame)
            # Bug fix: the original ignored the imencode success flag and
            # could append garbage for an unencodable frame.
            if ok:
                base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
        frame_count += 1
    video.release()
    print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.")

    # Bug fix: a video yielding no decodable frames previously produced an
    # image-free API call; fail fast with a clear message instead.
    if not base64Frames:
        return f"Error: Failed to open video file: {video_file_path}", None

    # One user message: the instruction text followed by every sampled frame
    # as a low-detail data-URL image.
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]

    params = {
        "model": "gpt-4o-mini",
        "messages": PROMPT_MESSAGES,
        "max_tokens": 500,
    }

    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    # Write the MP3 to a *closed* temp file: gTTS opens the path by name,
    # which fails on Windows while the NamedTemporaryFile handle is open.
    # delete=False is required so Gradio can serve the file afterwards.
    try:
        temp_audio = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        audio_path = temp_audio.name
        temp_audio.close()
        tts = gTTS(text=explanation, lang='en')
        tts.save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path
264
+
265
# Create the Gradio Interface.
# Inputs map positionally onto generate_explanation's parameters:
# (video_file_path, prompt_text, openai_api_key_input); outputs map onto its
# (explanation, audio_path) return tuple — on error the second slot is None.
iface = gr.Interface(
    fn=generate_explanation,
    inputs=[
        # type="filepath" hands the callback a path string, not file bytes.
        gr.File(label="Upload Video File", type="filepath", file_count="single", file_types=[".mp4", ".avi", ".mov", ".webm"]),

        gr.Textbox(label="Explanation Prompt", placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'", lines=5),
        # Masked field; only used when OPENAI_API_KEY is not set in the env.
        gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    ],
    outputs=[
        gr.Textbox(label="Generated Explanation", lines=10),
        # type="filepath" lets Gradio serve the temp MP3 written by gTTS.
        gr.Audio(label="Explanation Audio", type="filepath")
    ],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

# Launch the app only when run as a script (Spaces also executes this path).
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+
2
+ gradio
3
+ opencv-python
4
+ openai
5
+ python-dotenv
6
+ gTTS