File size: 4,887 Bytes
99c978a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3


import gradio as gr
import cv2
import base64
import time
import os
import json
import sys
from openai import OpenAI
from dotenv import load_dotenv
from gtts import gTTS
import tempfile # To handle temporary files for Gradio uploads

# Load environment variables from .env file (for local testing)
load_dotenv()

def _extract_frames(video_file_path, target_frames=25):
    """Sample roughly *target_frames* JPEG frames from a video as base64 strings.

    Args:
        video_file_path: Path to a video file readable by OpenCV.
        target_frames: Approximate number of frames to keep (default 25,
            matching the previous hard-coded sampling behavior).

    Returns:
        (frames, error_message) — ``frames`` is a list of base64-encoded JPEG
        strings and ``error_message`` is ``None`` on success, or a user-facing
        error string when the file cannot be opened.
    """
    video = cv2.VideoCapture(video_file_path)
    if not video.isOpened():
        return [], f"Error: Failed to open video file: {video_file_path}"

    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    # Keep every Nth frame so ~target_frames frames survive; fall back to a
    # sparse fixed rate when the container reports an unknown (0) frame count.
    if total_frames > 0:
        sampling_rate = max(1, total_frames // target_frames)
    else:
        sampling_rate = 50

    frames = []
    frame_index = 0
    try:
        while True:
            success, frame = video.read()
            if not success:
                break
            if frame_index % sampling_rate == 0:
                # Check the encode flag (previously discarded) so a failed
                # JPEG encode does not append a garbage buffer.
                ok, buffer = cv2.imencode(".jpg", frame)
                if ok:
                    frames.append(base64.b64encode(buffer).decode("utf-8"))
            frame_index += 1
    finally:
        # Release the capture handle even if reading/encoding raises.
        video.release()

    print(f"Processed {len(frames)} frames from {total_frames} total frames.")
    return frames, None


def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
    """
    Generate a text explanation of an uploaded video plus an audio rendering.

    Designed as a Gradio callback: every failure path returns an
    ``(error_message, None)`` tuple instead of raising, so the UI can
    display the problem to the user.

    Args:
        video_file_path: Filesystem path of the uploaded video (Gradio filepath).
        prompt_text: User instruction describing what to produce from the video.
        openai_api_key_input: API key typed into the UI; used only when the
            OPENAI_API_KEY environment variable is not set.

    Returns:
        (explanation_text, audio_file_path) on success, or
        (error_message, None) on any failure.
    """
    # Prioritize the API key from the environment (e.g. Hugging Face Secrets);
    # fall back to the value typed into the Gradio UI.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        api_key = openai_api_key_input
        if not api_key or api_key == "<your OpenAI API key if not set as env var>":
            return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None

    # Validate inputs BEFORE constructing the client (previously the client
    # was built even when the video or prompt was missing).
    print(f"Video file path: {video_file_path}")
    if not video_file_path:
        return "Error: Please upload a video file.", None
    if not prompt_text:
        return "Error: Please provide an explanation prompt.", None

    client = OpenAI(api_key=api_key)

    base64Frames, frame_error = _extract_frames(video_file_path)
    if frame_error:
        return frame_error, None
    if not base64Frames:
        # Bug fix: previously an openable-but-unreadable video sent an API
        # request containing no images at all.
        return "Error: No frames could be extracted from the video.", None

    # One user message: the instruction text followed by every sampled frame
    # as a low-detail data-URL image.
    PROMPT_MESSAGES = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
                },
                *[
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{frame}",
                            "detail": "low"
                        }
                    } for frame in base64Frames
                ]
            ],
        },
    ]

    try:
        result = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=PROMPT_MESSAGES,
            max_tokens=500,
        )
        explanation = result.choices[0].message.content
        print("Generated explanation based on provided prompt.")
    except Exception as e:
        return f"Error generating explanation: {str(e)}", None

    # Render the explanation to speech in a temp file Gradio can serve.
    # delete=False is deliberate: Gradio reads the file after this returns.
    try:
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
            audio_path = temp_audio_file.name
        gTTS(text=explanation, lang='en').save(audio_path)
        print("Generated audio file.")
    except Exception as e:
        return f"Error generating audio: {str(e)}", None

    return explanation, audio_path

# Assemble the Gradio UI: three inputs (video file, prompt, API key) wired
# to generate_explanation, which yields explanation text and an audio file.
_video_input = gr.File(
    label="Upload Video File",
    type="filepath",
    file_count="single",
    file_types=[".mp4", ".avi", ".mov", ".webm"],
)
_prompt_input = gr.Textbox(
    label="Explanation Prompt",
    placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'",
    lines=5,
)
_api_key_input = gr.Textbox(
    label="OpenAI API Key",
    type="password",
    placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
)

iface = gr.Interface(
    fn=generate_explanation,
    inputs=[_video_input, _prompt_input, _api_key_input],
    outputs=[
        gr.Textbox(label="Generated Explanation", lines=10),
        gr.Audio(label="Explanation Audio", type="filepath"),
    ],
    title="Video Explanation Agent ",
    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()