|
|
|
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import cv2 |
|
|
import base64 |
|
|
import time |
|
|
import os |
|
|
import json |
|
|
import sys |
|
|
from openai import OpenAI |
|
|
from dotenv import load_dotenv |
|
|
from gtts import gTTS |
|
|
import tempfile |
|
|
|
|
|
|
|
|
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file, if present.
load_dotenv()
|
|
|
|
|
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Generate a text explanation of a video and a spoken (MP3) version of it.

    Frames are sampled from the uploaded video, sent together with the user's
    prompt to OpenAI's vision-capable chat endpoint, and the resulting text is
    converted to speech with gTTS. Designed to be called by Gradio.

    Args:
        video_file_path: Path to the uploaded video file (from gr.File).
        prompt_text: Instruction describing what explanation to produce.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        (explanation_text, audio_file_path) on success, or
        (error_message, None) on any failure.
    """
    # Environment variable takes precedence over the UI field.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    print(f"Video file path: {video_file_path}")
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    # Construct the client only after all inputs are validated, so bad input
    # never triggers network/client setup.
    client = OpenAI(api_key=api_key)

    frames = _extract_frames(video_file_path)
    if frames is None:
        return f"Error: Failed to open video file: {video_file_path}", None

    params = {
        "model": "gpt-4o-mini",
        "messages": _build_messages(prompt_text, frames),
        "max_tokens": 500,
    }

    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    # message.content may be None (e.g. refusals/filtered output); gTTS would
    # crash on it with a misleading "audio" error otherwise.
    if not explanation:
        return "Error: The model returned an empty explanation.", None

    try:
        audio_path = _synthesize_audio(explanation)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path


def _extract_frames(video_file_path, max_frames=25):
    """Sample roughly evenly spaced frames, returned as base64-encoded JPEGs.

    Returns None when the video cannot be opened.
    """
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return None

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # CAP_PROP_FRAME_COUNT can be 0 or unreliable for some containers/streams;
    # fall back to a fixed stride in that case.
    if total_frames > 0:
        sampling_rate = max(1, total_frames // max_frames)
    else:
        sampling_rate = 50

    frames = []
    frame_index = 0
    while True:
        success, frame = video.read()
        if not success:
            break
        if frame_index % sampling_rate == 0:
            ok, buffer = cv2.imencode(".jpg", frame)
            if ok:  # skip frames that fail to encode instead of sending junk
                frames.append(base64.b64encode(buffer).decode("utf-8"))
        frame_index += 1

    video.release()
    print(f"Processed {len(frames)} frames from {total_frames} total frames.")
    return frames


def _build_messages(prompt_text, frames):
    """Build the chat payload: the user's instruction plus all sampled frames."""
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in frames
                ]
            ],
        },
    ]


def _synthesize_audio(text):
    """Render *text* to a temporary MP3 file via gTTS and return its path.

    The caller (Gradio) is responsible for the file's eventual cleanup.
    """
    # mkstemp + close instead of NamedTemporaryFile(delete=False): the latter
    # keeps the handle open while gTTS writes to the same path, which fails
    # on Windows (file locked by the open handle).
    fd, audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    gTTS(text=text, lang='en').save(audio_path)
    return audio_path
|
|
|
|
|
|
|
|
# Gradio interface wiring: three inputs (video file, explanation prompt,
# optional API key) mapped to generate_explanation, which returns a
# (explanation text, audio file path) pair rendered as Textbox + Audio.
iface = gr.Interface(
    fn=generate_explanation,
    inputs=[
        gr.File(label="Upload Video File", type="filepath", file_count="single", file_types=[".mp4", ".avi", ".mov", ".webm"]),
        gr.Textbox(label="Explanation Prompt", placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'", lines=5),
        # Only used when OPENAI_API_KEY is not set in the environment.
        gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    ],
    outputs=[
        gr.Textbox(label="Generated Explanation", lines=10),
        # type="filepath" matches the temp-file path returned by the handler.
        gr.Audio(label="Explanation Audio", type="filepath")
    ],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

# Launch the UI only when run as a script, not when imported as a module.
if __name__ == "__main__":
    iface.launch()
|
|
|