# Multimodal_Gemini / gemini_video_analysis.py
# (renamed from app.py; author: Rahatara)
import cv2
import gradio as gr
import google.generativeai as genai
import os
import PIL.Image
# SECURITY FIX: the original hard-coded a Google API key in source (and bound
# it to an unused YOUR_API_KEY variable). That key is exposed in the repo
# history and should be revoked. Read the key from the environment instead.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

# Multimodal Gemini model used to describe video frames.
model = genai.GenerativeModel('gemini-pro-vision')
# Function to capture frames from a video
def frame_capture(video_path, num_frames=5):
    """Sample up to ``num_frames`` evenly spaced frames from a video.

    Args:
        video_path: Path to a video file readable by OpenCV.
        num_frames: Maximum number of frames to sample (default 5).

    Returns:
        A list of BGR frames (numpy arrays); empty if the video yields
        no readable frames.
    """
    capture = cv2.VideoCapture(video_path)
    try:
        total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Guard against short or unreadable videos: the original divided by
        # (total // num_frames), which raises ZeroDivisionError whenever the
        # video has fewer frames than num_frames (or the count is unknown).
        step = max(total // num_frames, 1)
        frames = []
        index = 0
        # Stop once we have num_frames samples; the original could overshoot.
        while len(frames) < num_frames:
            success, frame = capture.read()
            if not success:
                break
            if index % step == 0:
                frames.append(frame)
            index += 1
        return frames
    finally:
        # Release the capture handle — the original leaked it.
        capture.release()
# Format description text for display.
def format_descriptions(descriptions):
    """Collapse a list of description strings into one cleaned string.

    Joins the pieces with single spaces, trims surrounding whitespace,
    then substitutes a space for every character that is neither
    alphanumeric nor whitespace.
    """
    combined = " ".join(descriptions).strip()
    cleaned_chars = []
    for ch in combined:
        if ch.isalnum() or ch.isspace():
            cleaned_chars.append(ch)
        else:
            # Special characters become spaces rather than being dropped.
            cleaned_chars.append(" ")
    return "".join(cleaned_chars)
# Function to generate text descriptions for frames
def generate_descriptions_for_frames(video_path):
    """Describe sampled frames of a video with the Gemini vision model.

    Args:
        video_path: Path to the uploaded video file.

    Returns:
        A single cleaned string describing the sampled frames.
    """
    # Capture frames from the video
    frames = frame_capture(video_path)
    # OpenCV returns BGR arrays; the model expects RGB PIL images.
    images = [PIL.Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in frames]
    # Prepare the prompt with images and instructions
    instructions = "Instructions: Consider the following frames from the video:"
    prompt = "What is shown in each of the frames?"
    images_with_prompt = [prompt] + [instructions] + images
    # BUG FIX: generate_content (non-streaming) returns a single
    # GenerateContentResponse, not an iterable of per-frame responses.
    # The original iterated over it and read .text on the pieces, which is
    # only valid for streaming calls. Use the aggregated .text directly.
    response = model.generate_content(images_with_prompt)
    descriptions = [response.text]
    return format_descriptions(descriptions)
# --- Gradio UI -----------------------------------------------------------
# Input: an uploaded video (autoplays in the preview).
uploaded_video = gr.Video(label="Upload Video", autoplay=True)
# Output: a textbox holding the generated frame descriptions.
descriptions_box = gr.Textbox(label="Frame Descriptions")

# Wire the description generator into an app and start serving it.
demo = gr.Interface(
    fn=generate_descriptions_for_frames,
    inputs=uploaded_video,
    outputs=descriptions_box,
    title="Frame Description Generator",
)
demo.launch()