#!/usr/bin/env python3
import gradio as gr
import cv2
import base64
import time
import os
import json
import sys
from openai import OpenAI
from dotenv import load_dotenv
from gtts import gTTS
import tempfile # To handle temporary files for Gradio uploads
# Load environment variables from .env file (for local testing)
load_dotenv()
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Generate a text explanation of a video plus a spoken (mp3) version.

    Frames are sampled from the uploaded video, sent to the OpenAI vision
    model together with the user's prompt, and the resulting explanation is
    converted to speech with gTTS.

    Args:
        video_file_path: Path to the uploaded video file (from gr.File).
        prompt_text: The user's instruction describing what to explain.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        Tuple of (explanation_text, audio_file_path). On any failure the
        first element is an error message and the second is None.
    """
    # Prioritize the key from the environment (Hugging Face Secrets);
    # fall back to the value entered in the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None
    # Validate inputs before constructing the client or touching the video.
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None
    client = OpenAI(api_key=api_key)
    print(f"Video file path: {video_file_path}")
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return f"Error: Failed to open video file: {video_file_path}", None
    # --- Frame extraction --------------------------------------------------
    base64Frames = []
    frame_count = 0
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Sample roughly 25 evenly spaced frames to keep the request small.
    # Fall back to every-50th frame when the backend reports no frame count.
    sampling_rate = max(1, total_frames // 25) if total_frames > 0 else 50
    try:
        while True:
            success, frame = video.read()
            if not success:
                break
            # Only take every Nth frame to reduce processing.
            if frame_count % sampling_rate == 0:
                ok, buffer = cv2.imencode(".jpg", frame)
                # Skip frames that fail to JPEG-encode instead of sending garbage.
                if ok:
                    base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_count += 1
    finally:
        # Always free the capture handle, even if decoding raises.
        video.release()
    if not base64Frames:
        return f"Error: Failed to open video file: {video_file_path}", None
    print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.")
    # --- Explanation via the OpenAI vision model ---------------------------
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"  # low detail keeps token usage down
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]
    params = {
        "model": "gpt-4o-mini",
        "messages": PROMPT_MESSAGES,
        "max_tokens": 500,
    }
    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None
    # --- Text-to-speech ----------------------------------------------------
    # delete=False so the file survives past the context manager: Gradio
    # serves it from disk. Save AFTER the handle is closed — writing to the
    # still-open NamedTemporaryFile by name fails on Windows.
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
            audio_path = temp_audio_file.name
        gTTS(text=explanation, lang='en').save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None
    return explanation, audio_path
# ---------------------------------------------------------------------------
# Gradio UI: wires the three inputs (video, prompt, API key) to
# generate_explanation and renders the result as text plus audio.
# ---------------------------------------------------------------------------
_video_input = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_input = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_key_input = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)
_text_output = gr.Textbox(label="Generated Explanation", lines=10)
_audio_output = gr.Audio(label="Explanation Audio", type="filepath")

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_input, _prompt_input, _key_input],
    outputs=[_text_output, _audio_output],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

if __name__ == "__main__":
    iface.launch()