Safetensors
English
llava
video-retrieval
text-to-video-search
multimodal-embedding
File size: 8,321 Bytes
7daf628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python3
"""
Video Grid Visualizer (imageio-based with Grid)

Creates a grid layout of multiple videos with visual separators and saves it
as an MP4 or GIF. This version uses the imageio library for robust video
writing, adds customizable grid gaps, and intelligently resizes cells for a
balanced look.
"""

import cv2
import numpy as np
import argparse
import os
import imageio.v2 as imageio # Use imageio.v2 to avoid deprecation warnings

def get_video_info(video_path):
    """Get video information using imageio with a fallback to OpenCV."""
    try:
        with imageio.get_reader(video_path) as reader:
            meta = reader.get_meta_data()
            fps = meta.get('fps', 30)
            duration = meta.get('duration', 0)
            size = meta.get('size', (0, 0))
            if duration == 0 and fps > 0:
                duration = reader.count_frames() / fps
            frame_count = int(duration * fps) if duration and fps else reader.count_frames()
            
            return {
                'duration': duration, 'fps': fps, 'frame_count': frame_count,
                'width': size[0], 'height': size[1]
            }
    except Exception:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError(f"Cannot open video: {video_path}")
        
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration = frame_count / fps if fps > 0 else 0
        cap.release()
        return {
            'duration': duration, 'fps': fps, 'frame_count': frame_count,
            'width': width, 'height': height
        }

def resize_frame(frame, target_width, target_height):
    """Resize frame to fit inside target dimensions, maintaining aspect ratio."""
    h, w = frame.shape[:2]
    aspect = w / h
    target_aspect = target_width / target_height

    if aspect > target_aspect:
        new_w = target_width
        new_h = int(new_w / aspect)
    else:
        new_h = target_height
        new_w = int(new_h * aspect)
        
    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
    
    canvas = np.zeros((target_height, target_width, 3), dtype=np.uint8)
    y_offset = (target_height - new_h) // 2
    x_offset = (target_width - new_w) // 2
    canvas[y_offset:y_offset + new_h, x_offset:x_offset + new_w] = resized
    
    return canvas

def create_video_grid(video_paths, n_rows=1, n_cols=None, save_path="output.mp4", 
                      gap=10, grid_color_name="white", max_cell_width=640):
    """
    Create a grid of videos with separators and save as MP4 or GIF.
    """
    if not video_paths:
        raise ValueError("No video paths provided")
    
    if n_cols is None:
        n_cols = len(video_paths)
    
    if n_rows * n_cols < len(video_paths):
        raise ValueError(f"Grid size ({n_rows}x{n_cols}) is too small for {len(video_paths)} videos")

    grid_color = (255, 255, 255) if grid_color_name.lower() == "white" else (0, 0, 0)

    print("Gathering video information...")
    video_infos = [get_video_info(path) for path in video_paths]
    
    # --- Intelligent Resizing and Dimension Calculation ---
    valid_infos = [info for info in video_infos if info['width'] > 0 and info['height'] > 0]
    if not valid_infos:
        raise ValueError("Could not get valid dimensions from any input video.")
        
    avg_aspect_ratio = sum(info['width'] / info['height'] for info in valid_infos) / len(valid_infos)
    
    cell_width = min(max_cell_width, 1920 // n_cols) # Don't let cells get too big
    cell_height = int(cell_width / avg_aspect_ratio)

    output_width = (cell_width * n_cols) + (gap * (n_cols + 1))
    output_height = (cell_height * n_rows) + (gap * (n_rows + 1))
    
    # Ensure dimensions are even, as required by many video codecs
    output_width += output_width % 2
    output_height += output_height % 2
    # ---

    max_duration = max(info['duration'] for info in video_infos if info)
    target_fps = max(info['fps'] for info in video_infos if info and info['fps'])
    if not target_fps or target_fps <= 0:
        target_fps = 30
    
    total_frames = int(max_duration * target_fps)
    
    print(f"Grid: {n_rows}x{n_cols} with {gap}px {grid_color_name} gaps")
    print(f"Calculated Cell Size: {cell_width}x{cell_height}")
    print(f"Final Output Size: {output_width}x{output_height}")
    print(f"Max duration: {max_duration:.2f}s | Target FPS: {target_fps} | Total frames: {total_frames}")

    caps = [cv2.VideoCapture(path) for path in video_paths]
    last_frames = [None] * len(video_paths)

    writer = imageio.get_writer(save_path, fps=target_fps, codec='libx264', macro_block_size=None)

    try:
        for frame_idx in range(total_frames):
            # Initialize the master frame with the grid color
            output_frame = np.full((output_height, output_width, 3), grid_color, dtype=np.uint8)
            
            for i, cap in enumerate(caps):
                ret, frame = cap.read()
                if ret:
                    last_frames[i] = frame
                else:
                    if last_frames[i] is None:
                        info = video_infos[i]
                        h = info['height'] if info['height'] > 0 else cell_height
                        w = info['width'] if info['width'] > 0 else cell_width
                        last_frames[i] = np.zeros((h, w, 3), dtype=np.uint8)
                    frame = last_frames[i]
                
                resized_frame = resize_frame(frame, cell_width, cell_height)
                
                row = i // n_cols
                col = i % n_cols
                
                # Calculate position with gaps
                y_start = gap + row * (cell_height + gap)
                x_start = gap + col * (cell_width + gap)
                
                output_frame[y_start : y_start + cell_height, x_start : x_start + cell_width] = resized_frame
            
            rgb_frame = cv2.cvtColor(output_frame, cv2.COLOR_BGR2RGB)
            writer.append_data(rgb_frame)
            
            if frame_idx % int(target_fps) == 0 or frame_idx == total_frames - 1:
                progress = (frame_idx + 1) / total_frames * 100
                print(f"Processing... {progress:.1f}% complete", end='\r')

    finally:
        print("\nCleaning up resources...")
        for cap in caps:
            cap.release()
        writer.close()
    
    print(f"Video grid successfully saved to: {save_path}")

def main():
    parser = argparse.ArgumentParser(
        description="Create a grid visualization of multiple videos with separators.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("video_paths", nargs="+", help="One or more paths to input videos.")
    parser.add_argument("--n_rows", type=int, default=1, help="Number of rows in the grid.")
    parser.add_argument("--n_cols", type=int, help="Number of columns. Defaults to the number of videos if n_rows is 1.")
    parser.add_argument("--save_path", default="output.mp4", help="Output path for the video (e.g., 'output.mp4' or 'output.gif').")
    parser.add_argument("--gap", type=int, default=10, help="Size of the gap between videos in pixels.")
    parser.add_argument("--grid_color", default="white", choices=["white", "black"], help="Color of the grid gaps.")
    parser.add_argument("--max_cell_width", type=int, default=640, help="Maximum width for each video cell in the grid.")

    args = parser.parse_args()
    
    # Set default n_cols if not provided
    if args.n_cols is None:
        args.n_cols = len(args.video_paths) // args.n_rows
        if len(args.video_paths) % args.n_rows != 0:
            args.n_cols += 1

    try:
        create_video_grid(
            video_paths=args.video_paths,
            n_rows=args.n_rows,
            n_cols=args.n_cols,
            save_path=args.save_path,
            gap=args.gap,
            grid_color_name=args.grid_color,
            max_cell_width=args.max_cell_width
        )
    except Exception as e:
        print(f"\nAn error occurred: {e}")

if __name__ == "__main__":
    main()