#!/usr/bin/env python3
import gradio as gr
import cv2
import base64
import time
import os
import json
import sys
from openai import OpenAI
from dotenv import load_dotenv
from gtts import gTTS
import tempfile # To handle temporary files for Gradio uploads
# Load environment variables from .env file (for local testing)
load_dotenv()
def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """Generate a text explanation of a video plus a spoken (mp3) version.

    Frames are sampled from the uploaded video, sent to the OpenAI vision
    model together with the user's prompt, and the resulting explanation is
    converted to speech with gTTS.

    Args:
        video_file_path: Path to the uploaded video file (from gr.File).
        prompt_text: The user's instruction describing what to explain.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        Tuple of (explanation_text, audio_file_path). On any failure the
        first element is an error message and the second is None.
    """
    # Prioritize the key from the environment (Hugging Face Secrets);
    # fall back to the value entered in the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY") or openai_api_key_input
    if not api_key or api_key == "<your OpenAI API key if not set as env var>":
        return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None
    # Validate inputs before constructing the client or touching the video.
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None
    client = OpenAI(api_key=api_key)
    print(f"Video file path: {video_file_path}")
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return f"Error: Failed to open video file: {video_file_path}", None
    # --- Frame extraction --------------------------------------------------
    base64Frames = []
    frame_count = 0
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Sample roughly 25 evenly spaced frames to keep the request small.
    # Fall back to every-50th frame when the backend reports no frame count.
    sampling_rate = max(1, total_frames // 25) if total_frames > 0 else 50
    try:
        while True:
            success, frame = video.read()
            if not success:
                break
            # Only take every Nth frame to reduce processing.
            if frame_count % sampling_rate == 0:
                ok, buffer = cv2.imencode(".jpg", frame)
                # Skip frames that fail to JPEG-encode instead of sending garbage.
                if ok:
                    base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_count += 1
    finally:
        # Always free the capture handle, even if decoding raises.
        video.release()
    if not base64Frames:
        return f"Error: Failed to open video file: {video_file_path}", None
    print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.")
    # --- Explanation via the OpenAI vision model ---------------------------
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"  # low detail keeps token usage down
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]
    params = {
        "model": "gpt-4o-mini",
        "messages": PROMPT_MESSAGES,
        "max_tokens": 500,
    }
    try:
        result = client.chat.completions.create(**params)
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None
    # --- Text-to-speech ----------------------------------------------------
    # delete=False so the file survives past the context manager: Gradio
    # serves it from disk. Save AFTER the handle is closed — writing to the
    # still-open NamedTemporaryFile by name fails on Windows.
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
            audio_path = temp_audio_file.name
        gTTS(text=explanation, lang='en').save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None
    return explanation, audio_path
# ---------------------------------------------------------------------------
# Gradio UI: wires the three inputs (video, prompt, API key) to
# generate_explanation and renders the result as text plus audio.
# ---------------------------------------------------------------------------
_video_input = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_input = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_key_input = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)
_text_output = gr.Textbox(label="Generated Explanation", lines=10)
_audio_output = gr.Audio(label="Explanation Audio", type="filepath")

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_input, _prompt_input, _key_input],
    outputs=[_text_output, _audio_output],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

if __name__ == "__main__":
    iface.launch()