# Hugging Face Spaces app (page scrape showed build status: "Runtime error")
#!/usr/bin/env python3
"""Gradio app: generate an AI text explanation (plus spoken audio) for an uploaded video."""

# Standard library
import base64
import json
import os
import sys
import tempfile  # temporary files for the generated audio that Gradio serves
import time

# Third-party
import cv2
import gradio as gr
from dotenv import load_dotenv
from gtts import gTTS
from openai import OpenAI

# Load environment variables from a local .env file (useful for local testing;
# on Hugging Face Spaces the API key is injected via repository secrets instead).
load_dotenv()
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Generate a text explanation of a video and a spoken (mp3) version of it.

    Gradio callback: samples roughly 25 frames from the uploaded video, sends
    them with the user's prompt to the OpenAI vision model, then converts the
    returned explanation to speech with gTTS.

    Args:
        video_file_path: Filesystem path of the uploaded video, or None.
        prompt_text: The user's instruction describing what to explain.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        A ``(explanation_text, audio_file_path)`` tuple. On failure the first
        element is an error message and the second is None.
    """
    # Prefer the key from the environment (Hugging Face Secrets); fall back to
    # the value typed into the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    # Validate inputs before constructing the API client so bad input fails fast.
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    client = OpenAI(api_key=api_key)
    print(f"Video file path: {video_file_path}")

    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return f"Error: Failed to open video file: {video_file_path}", None

    # Sample every Nth frame so we send ~25 frames regardless of video length,
    # bounding request size and API cost.
    base64_frames = []
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    sampling_rate = max(1, total_frames // 25) if total_frames > 0 else 50
    try:
        frame_index = 0
        while True:
            success, frame = video.read()
            if not success:
                break
            if frame_index % sampling_rate == 0:
                _, buffer = cv2.imencode(".jpg", frame)
                base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_index += 1
    finally:
        # Release the capture even if JPEG encoding raises (avoids a handle leak).
        video.release()
    print(f"Processed {len(base64_frames)} frames from {total_frames} total frames.")

    if not base64_frames:
        # A corrupt/empty video yields no frames; calling the API would be pointless.
        return f"Error: Could not extract any frames from video file: {video_file_path}", None

    prompt_messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            # "low" detail keeps token usage per image small.
                            "detail": "low"
                        }
                    } for frame in base64_frames
                ],
            ],
        },
    ]
    params = {
        "model": "gpt-4o-mini",
        "messages": prompt_messages,
        "max_tokens": 500,
    }

    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    # Convert the explanation to speech. delete=False keeps the file on disk so
    # Gradio can serve it after this function returns.
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
            tts = gTTS(text=explanation, lang='en')
            tts.save(temp_audio_file.name)
            audio_path = temp_audio_file.name
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path
# Build the Gradio UI: video upload + prompt + optional API key in,
# explanation text + spoken audio out.
iface = gr.Interface(
    fn=generate_explanation,
    inputs=[
        gr.File(
            label="Upload Video File",
            type="filepath",
            file_count="single",
            file_types=[".mp4", ".avi", ".mov", ".webm"],
        ),
        gr.Textbox(
            label="Explanation Prompt",
            placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
            lines=5,
        ),
        gr.Textbox(
            label="OpenAI API Key",
            type="password",
            placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
        ),
    ],
    outputs=[
        gr.Textbox(label="Generated Explanation", lines=10),
        gr.Audio(label="Explanation Audio", type="filepath"),
    ],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

# Launch the app only when executed as a script (e.g. `python app.py` locally
# or as the Space entry point).
if __name__ == "__main__":
    iface.launch()