Spaces:

BluescarfAI
/

Video-Explaination-Agent

Runtime error

App Files Files Community

dure-waseem committed on Jul 17, 2025

Commit

99c978a

1 Parent(s): a73cb84

initial code

Browse files

Files changed (2) hide show

main.py +283 -0
requirements.txt +6 -0

main.py ADDED Viewed

	@@ -0,0 +1,283 @@

+# #!/usr/bin/env python3
+# import cv2
+# import base64
+# import time
+# import os
+# import json
+# import argparse
+# import sys
+# from openai import OpenAI
+# from dotenv import load_dotenv
+# from gtts import gTTS
+# def main():
+#     # Set up argument parsing
+#     parser = argparse.ArgumentParser(description='Video Explanation Agent')
+#     parser.add_argument('--video-file', required=True, help='Path to the video file to process')
+#     parser.add_argument('--prompt-file', required=True, help='Path to the file containing the explanation prompt')
+#     args = parser.parse_args()
+#     # Check if files exist
+#     if not os.path.exists(args.video_file):
+#         print(json.dumps({
+#             "error": f"Video file not found: {args.video_file}"
+#         }))
+#         sys.exit(1)
+#     if not os.path.exists(args.prompt_file):
+#         print(json.dumps({
+#             "error": f"Prompt file not found: {args.prompt_file}"
+#         }))
+#         sys.exit(1)
+#     # Load environment variables from .env file
+#     load_dotenv()
+#     # Get the OpenAI API key from the environment
+#     api_key = os.getenv("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>")
+#     client = OpenAI(api_key=api_key)
+#     # Read the custom prompt from the file
+#     with open(args.prompt_file, 'r') as f:
+#         custom_prompt = f.read().strip()
+#     # Open the video file
+#     video = cv2.VideoCapture(args.video_file)
+#     if not video.isOpened():
+#         print(json.dumps({
+#             "error": f"Failed to open video file: {args.video_file}"
+#         }))
+#         sys.exit(1)
+#     # Extract frames from video
+#     base64Frames = []
+#     frame_count = 0
+#     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+#     # Calculate sampling rate to get around 20-30 frames total
+#     if total_frames > 0:
+#         sampling_rate = max(1, total_frames // 25)
+#     else:
+#         sampling_rate = 50  # Default fallback
+#     while video.isOpened():
+#         success, frame = video.read()
+#         if not success:
+#             break
+#         # Only take every Nth frame to reduce processing
+#         if frame_count % sampling_rate == 0:
+#             _, buffer = cv2.imencode(".jpg", frame)
+#             base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+#         frame_count += 1
+#     video.release()
+#     print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.", file=sys.stderr)
+#     # Create the timestamp for unique filenames
+#     timestamp = int(time.time())
+#     # Create data directory inside the vidExp-agent directory
+#     output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+#     os.makedirs(output_dir, exist_ok=True)
+#     # Generate explanation based on the custom prompt
+#     PROMPT_MESSAGES = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 f"{custom_prompt}",
+#                 *map(lambda x: {"image": x, "resize": 768}, base64Frames),
+#             ],
+#         },
+#     ]
+#     params = {
+#         "model": "gpt-4o-mini",
+#         "messages": PROMPT_MESSAGES,
+#         "max_tokens": 500,
+#     }
+#     try:
+#         result = client.chat.completions.create(**params)
+#         explanation = result.choices[0].message.content
+#         print(f"Generated explanation based on provided prompt.", file=sys.stderr)
+#     except Exception as e:
+#         print(json.dumps({
+#             "error": f"Error generating explanation: {str(e)}"
+#         }))
+#         sys.exit(1)
+#     # Save the explanation as a text file
+#     explanation_file = os.path.join(output_dir, f"explanation_{timestamp}.txt")
+#     with open(explanation_file, "w") as f:
+#         f.write(explanation)
+#     # Generate audio from the explanation
+#     audio_filename = f"explanation_{timestamp}.mp3"
+#     audio_path = os.path.join(output_dir, audio_filename)
+#     try:
+#         # Default to English for TTS
+#         tts = gTTS(text=explanation, lang='en')
+#         tts.save(audio_path)
+#         print(f"Generated audio file.", file=sys.stderr)
+#     except Exception as e:
+#         print(json.dumps({
+#             "error": f"Error generating audio: {str(e)}"
+#         }))
+#         sys.exit(1)
+#     # Return the results as JSON
+#     result = {
+#         "success": True,
+#         "explanation": explanation,
+#         "explanationFilePath": explanation_file,
+#         "audioFilename": audio_filename,
+#         "audioFilePath": audio_path
+#     }
+#     print(json.dumps(result))
+# if __name__ == "__main__":
+#     main()
+import gradio as gr
+import cv2
+import base64
+import time
+import os
+import json
+import sys
+from openai import OpenAI
+from dotenv import load_dotenv
+from gtts import gTTS
+import tempfile # To handle temporary files for Gradio uploads
+# Load environment variables from .env file (for local testing)
+load_dotenv()
+def generate_explanation(video_file_path, prompt_text, openai_api_key_input):
+    """
+    Processes a video, generates an explanation using OpenAI, and converts it to audio.
+    This function is designed to be called by Gradio.
+    """
+    # Prioritize API key from environment variables (Hugging Face Secrets)
+    # If not found, use the key provided in the Gradio UI.
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        api_key = openai_api_key_input
+        if not api_key or api_key == "<your OpenAI API key if not set as env var>":
+            return "Error: OpenAI API key is missing. Please provide it in the input field or set it as an environment variable (OPENAI_API_KEY).", None
+    client = OpenAI(api_key=api_key)
+    print(f"Video file path: {video_file_path}")
+    if not video_file_path:
+        return "Error: Please upload a video file.", None
+    if not prompt_text:
+        return "Error: Please provide an explanation prompt.", None
+    # Open the video file
+    video = cv2.VideoCapture(video_file_path)
+    if not video.isOpened():
+        return f"Error: Failed to open video file: {video_file_path}", None
+    # Extract frames from video
+    base64Frames = []
+    frame_count = 0
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    # Calculate sampling rate to get around 20-30 frames total
+    if total_frames > 0:
+        sampling_rate = max(1, total_frames // 25)
+    else:
+        sampling_rate = 50  # Default fallback if total_frames is 0
+    while video.isOpened():
+        success, frame = video.read()
+        if not success:
+            break
+        # Only take every Nth frame to reduce processing
+        if frame_count % sampling_rate == 0:
+            _, buffer = cv2.imencode(".jpg", frame)
+            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+        frame_count += 1
+    video.release()
+    print(f"Processed {len(base64Frames)} frames from {total_frames} total frames.")
+    PROMPT_MESSAGES = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Create what is asked for in {prompt_text} for the images provided to you. Do not ask any questions. Just do what the user asks for in {prompt_text} for the images provided to you"
+                },
+                *[
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{frame}",
+                            "detail": "low"
+                        }
+                    } for frame in base64Frames
+                ]
+            ],
+        },
+    ]
+    params = {
+        "model": "gpt-4o-mini",
+        "messages": PROMPT_MESSAGES,
+        "max_tokens": 500,
+    }
+    explanation = ""
+    try:
+        result = client.chat.completions.create(**params)
+        explanation = result.choices[0].message.content
+        print("Generated explanation based on provided prompt.")
+    except Exception as e:
+        return f"Error generating explanation: {str(e)}", None
+    # Generate audio from the explanation
+    # Use tempfile to create a temporary audio file that Gradio can serve
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio_file:
+            tts = gTTS(text=explanation, lang='en')
+            tts.save(temp_audio_file.name)
+            audio_path = temp_audio_file.name
+        print("Generated audio file.")
+    except Exception as e:
+        return f"Error generating audio: {str(e)}", None
+    return explanation, audio_path
+# Create the Gradio Interface
+iface = gr.Interface(
+    fn=generate_explanation,
+    inputs=[
+        gr.File(label="Upload Video File", type="filepath", file_count="single", file_types=[".mp4", ".avi", ".mov", ".webm"]),
+        gr.Textbox(label="Explanation Prompt", placeholder="e.g., 'What is happening in this video? Describe the main actions and objects.'", lines=5),
+        gr.Textbox(label="OpenAI API Key", type="password", placeholder="sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
+    ],
+    outputs=[
+        gr.Textbox(label="Generated Explanation", lines=10),
+        gr.Audio(label="Explanation Audio", type="filepath")
+    ],
+    title="Video Explanation Agent ",
+    description="Upload a video and provide a prompt to get an AI-generated explanation and an audio version of the explanation.",
+)
+if __name__ == "__main__":
+    iface.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+opencv-python
+openai
+python-dotenv
+gTTS