# Source: Hugging Face Space by geekphoenix (commit 1a67653)
#!/usr/bin/env python3
import gradio as gr
import cv2
import base64
import time
import os
import json
import sys
from openai import OpenAI
from dotenv import load_dotenv
from gtts import gTTS
import tempfile # To handle temporary files for Gradio uploads
# Load environment variables from .env file (for local testing)
load_dotenv()
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """
    Process a video, generate an explanation with OpenAI, and synthesize audio.

    Designed to be called by Gradio.

    Args:
        video_file_path: Filesystem path of the uploaded video (Gradio filepath).
        prompt_text: Free-form instruction describing what to explain.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        Tuple of (explanation_text, audio_file_path) on success, or
        (error_message, None) on any failure.
    """
    # Prioritize API key from environment variables (Hugging Face Secrets).
    # If not found, use the key provided in the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None
    client = OpenAI(api_key=api_key)
    print(f"Video file path: {video_file_path}")
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None
    # Open the video file
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return f"Error: Failed to open video file: {video_file_path}", None
    # Extract frames from video as base64-encoded JPEGs.
    base64Frames = []
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # Sample roughly 25 frames across the clip to bound API payload size;
        # fall back to every-50th frame if OpenCV can't report a frame count.
        sampling_rate = max(1, total_frames // 25) if total_frames > 0 else 50
        frame_count = 0
        while True:
            success, frame = video.read()
            if not success:
                break
            # Only take every Nth frame to reduce processing
            if frame_count % sampling_rate == 0:
                ok, buffer = cv2.imencode(".jpg", frame)
                # Skip frames OpenCV fails to encode rather than appending garbage.
                if ok:
                    base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_count += 1
    finally:
        # Always release the capture handle, even if encoding raises.
        video.release()
    print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.")
    if not base64Frames:
        # Don't spend an API call on a video no frames could be decoded from.
        return "Error: No frames could be extracted from the video.", None
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]
    params = {
        "model": "gpt-4o-mini",
        "messages": PROMPT_MESSAGES,
        "max_tokens": 500,
    }
    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None
    if not explanation:
        # gTTS raises on empty text; surface a clear error instead.
        return "Error: The model returned an empty explanation.", None
    # Generate audio from the explanation.
    # Use tempfile to create a temporary audio file that Gradio can serve
    # (delete=False so the file survives for Gradio to stream it).
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
            tts = gTTS(text=explanation, lang='en')
            tts.save(temp_audio_file.name)
            audio_path = temp_audio_file.name
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None
    return explanation, audio_path
# Create the Gradio Interface
# Gradio UI: video upload + prompt + optional API key in; explanation text
# and a spoken rendition out.
_video_in = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_in = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_key_in = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)
_text_out = gr.Textbox(label="Generated Explanation", lines=10)
_audio_out = gr.Audio(label="Explanation Audio", type="filepath")

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_in, _prompt_in, _key_in],
    outputs=[_text_out, _audio_out],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)
# Launch the Gradio app only when this file is run directly (not on import).
if __name__ == "__main__":
    iface.launch()