import cv2
import gradio as gr
import google.generativeai as genai
import os
import PIL.Image

# Configure the API key for Google Generative AI
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
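# The key is read from the environment, so export it before running, e.g.:
#   export GOOGLE_API_KEY="your-key-here"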

# Define the Generative AI model
model = genai.GenerativeModel('gemini-1.5-flash')
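# Other vision-capable variants (e.g. 'gemini-1.5-pro') accept the same
# mixed image-and-text prompts if higher-quality output is needed.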

# Function to capture frames from a video
def frame_capture(video_path, num_frames=5):
    vidObj = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(vidObj.get(cv2.CAP_PROP_FRAME_COUNT))
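    # Step size chosen so the sampled frames are spread evenly across the video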
    frame_step = max(1, total_frames // num_frames)
    count = 0
    
    while len(frames) < num_frames:
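        # Seek directly to the target frame index, then grab that frame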
        vidObj.set(cv2.CAP_PROP_POS_FRAMES, count)
        success, image = vidObj.read()
        if not success:
            break
        frames.append(image)
        count += frame_step
    
    vidObj.release()
    return frames
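
# Example (hypothetical file): frame_capture("track_inspection.mp4") returns
# up to five evenly spaced BGR frames sampled across the clip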

# Function to generate text descriptions for frames
def generate_descriptions_for_frames(video_path):
    if not video_path:
        return "Please upload a video first."
    frames = frame_capture(video_path)
    # OpenCV returns frames in BGR order; convert to RGB before handing to PIL/Gemini
    images = [PIL.Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in frames]
    
    prompt = "Describe what is happening in each of these frames. Identify any potential railway defect or risk."
    images_with_prompt = [prompt] + images
    
    # generate_content returns a single response object, not an iterable of
    # responses; its .text property aggregates the generated parts
    response = model.generate_content(images_with_prompt)
    
    return format_descriptions(response.text)

# Function to handle chat interaction
def chat_interaction(video_path, chatbot, user_message):
    chatbot = chatbot or []  # Gradio may pass None before the first message
    if not video_path:
        chatbot.append((user_message, "Please upload a video first."))
        return "", chatbot
    frames = frame_capture(video_path)
    images = [PIL.Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) for frame in frames]
    
    prompt = f"Based on these video frames, {user_message}"
    images_with_prompt = [prompt] + images
    
    # As above, generate_content returns a single response; read its text
    response = model.generate_content(images_with_prompt)
    response_text = response.text
    
    chatbot.append((user_message, response_text))
    return "", chatbot

# Helper function to tidy the model's output text
def format_descriptions(description_text):
    return description_text.strip()

# Define the Gradio interfaces for each tab
# Tab 1: Video Analysis System with Set Prompt
with gr.Blocks() as tab1:
    with gr.Column():
        gr.Markdown("### Video Analysis System")
        video_input_1 = gr.Video(label="Upload Video", autoplay=True)
        output_text = gr.Textbox(label="What's in this video?")
        analyze_button = gr.Button("Analyze Video")
        analyze_button.click(fn=generate_descriptions_for_frames, inputs=video_input_1, outputs=output_text)

# Tab 2: Interactive Chat Mode
with gr.Blocks() as tab2:
    with gr.Column():
        gr.Markdown("### Interactive Chat Mode")
        video_input_2 = gr.Video(label="Upload Video", autoplay=True)
        chatbot = gr.Chatbot(label="Video Analysis Chatbot")
        user_input = gr.Textbox(label="Ask something specific about the video", placeholder="E.g., Are there any cars in this video?")
        user_input.submit(fn=chat_interaction, inputs=[video_input_2, chatbot, user_input], outputs=[user_input, chatbot])

# Combine the two tabs into a single interface; TabbedInterface is itself a
# Blocks, so it should not be nested inside another gr.Blocks context
demo = gr.TabbedInterface([tab1, tab2], ["Video Analysis", "Interactive Chat"])

demo.launch()
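
# Pass share=True to launch() if a temporary public URL is needed, e.g.:
#   demo.launch(share=True)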