#!/usr/bin/env python3
"""Gradio app: explain an uploaded video with OpenAI vision and read the answer aloud.

The user uploads a video and a free-form prompt; the app samples ~25 frames,
sends them to a GPT-4o-mini vision request, and returns the text explanation
plus a gTTS-generated MP3 of it.
"""

import base64
import os
import tempfile  # To handle temporary files for Gradio uploads

import cv2
import gradio as gr
from dotenv import load_dotenv
from gtts import gTTS
from openai import OpenAI

# Load environment variables from .env file (for local testing).
load_dotenv()


def _extract_frames(video_file_path):
    """Sample evenly spaced frames from the video as base64-encoded JPEG strings.

    Returns (frames, total_frames). Raises ValueError if OpenCV cannot open
    the file.
    """
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        raise ValueError(f"Error: Failed to open video file: {video_file_path}")

    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        # Aim for roughly 25 sampled frames overall; if the backend cannot
        # report a frame count (total_frames == 0), fall back to every 50th.
        if total_frames > 0:
            sampling_rate = max(1, total_frames // 25)
        else:
            sampling_rate = 50

        frames = []
        frame_count = 0
        while True:
            success, frame = video.read()
            if not success:
                break
            # Only take every Nth frame to reduce processing.
            if frame_count % sampling_rate == 0:
                _, buffer = cv2.imencode(".jpg", frame)
                frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_count += 1
    finally:
        # Release the capture even if encoding raises mid-loop.
        video.release()

    return frames, total_frames


def _request_explanation(client, frames, prompt_text):
    """Send the sampled frames plus the user's prompt to OpenAI; return the text reply."""
    prompt_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            # Low detail keeps token usage manageable for many frames.
                            "detail": "low"
                        }
                    }
                    for frame in frames
                ],
            ],
        },
    ]
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=prompt_messages,
        max_tokens=500,
    )
    return result.choices[0].message.content


def _synthesize_audio(text):
    """Convert *text* to speech and return the path of a temporary MP3 file."""
    # Create the temp file, then close the handle before gTTS writes to it —
    # writing while the handle is open fails on Windows. delete=False keeps
    # the file around so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
        audio_path = temp_audio_file.name
    gTTS(text=text, lang='en').save(audio_path)
    return audio_path


def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Process a video, generate an explanation via OpenAI, and convert it to audio.

    Designed to be called by Gradio. Returns a (text, audio_path) pair; on any
    failure the text carries an "Error: ..." message and audio_path is None.
    """
    # Prioritize the API key from environment variables (Hugging Face Secrets);
    # fall back to the key typed into the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key:
        return ("Error: OpenAI API key is missing. Please provide it in the input"
                " field or set it as an environment variable (OPENAI_API_KEY)."), None

    client = OpenAI(api_key=api_key)

    print(f"Video file path: {video_file_path}")
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    try:
        base64_frames, total_frames = _extract_frames(video_file_path)
    except ValueError as e:
        return str(e), None
    print(f"Processed {len(base64_frames)} frames from {total_frames} total frames.")

    try:
        explanation = _request_explanation(client, base64_frames, prompt_text)
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    try:
        audio_path = _synthesize_audio(explanation)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path


# Create the Gradio Interface.
iface = gr.Interface(
    fn=generate_explanation,
    inputs=[
        gr.File(label="Upload Video File", type="filepath", file_count="single",
                file_types=[".mp4", ".avi", ".mov", ".webm"]),
        gr.Textbox(label="Explanation Prompt",
                   placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
                   lines=5),
        gr.Textbox(label="OpenAI API Key", type="password",
                   placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    ],
    outputs=[
        gr.Textbox(label="Generated Explanation", lines=10),
        gr.Audio(label="Explanation Audio", type="filepath")
    ],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

if __name__ == "__main__":
    iface.launch()