Update app.py
app.py CHANGED
@@ -11,86 +11,144 @@ import gradio as gr
 import tempfile
 import os
 import shutil
+from tqdm import tqdm
+import torch.nn as nn
+import math
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
 
 class VideoRAGTool:
     def __init__(self, clip_model_name: str = "openai/clip-vit-base-patch32",
                  blip_model_name: str = "Salesforce/blip-image-captioning-base"):
-        """
-        Initialize the Video RAG Tool with CLIP and BLIP models for frame analysis and captioning.
-        """
+        """Initialize with performance optimizations."""
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-        # Initialize
+        # Initialize models with optimization flags
         self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)
         self.clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
-
-        # Initialize BLIP for image captioning
         self.blip_processor = BlipProcessor.from_pretrained(blip_model_name)
         self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(self.device)
 
+        # Enable eval mode for inference
+        self.clip_model.eval()
+        self.blip_model.eval()
+
+        # Batch processing settings
+        self.batch_size = 8  # Adjust based on your GPU memory
+
         self.frame_index = None
         self.frame_data = []
         self.logger = self._setup_logger()
 
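The constructor pins self.batch_size to 8 and leaves tuning to a comment. A minimal sketch of deriving the batch size from free GPU memory instead — pick_batch_size and its frames-per-free-gigabyte factor are hypothetical, for illustration only:

import torch

def pick_batch_size(default: int = 8, frames_per_free_gb: int = 4) -> int:
    # Hypothetical heuristic: grow the batch with free GPU memory.
    if not torch.cuda.is_available():
        return default
    free_bytes, _total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes
    return max(default, int(free_bytes / 1024**3) * frames_per_free_gb)
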
-    def _setup_logger(self):
-        logger = logging.getLogger('VideoRAGTool')
-        logger.setLevel(logging.INFO)
-        handler = logging.StreamHandler()
-        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-        handler.setFormatter(formatter)
-        logger.addHandler(handler)
-        return logger
-
+    @torch.no_grad()  # Disable gradient computation for inference
     def generate_caption(self, image: Image.Image) -> str:
-        """
+        """Optimized caption generation."""
         inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
-        out = self.blip_model.generate(**inputs)
-
-
+        out = self.blip_model.generate(**inputs, max_length=30, num_beams=2)
+        return self.blip_processor.decode(out[0], skip_special_tokens=True)
+
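Applied as a decorator, @torch.no_grad() turns off autograd for the whole call, trimming memory and latency for pure inference. It is equivalent to the context-manager form, as this small sketch shows:

import torch

@torch.no_grad()
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2  # gradient tracking is off for the whole body

def double_explicit(x: torch.Tensor) -> torch.Tensor:
    # Equivalent context-manager form
    with torch.no_grad():
        return x * 2
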
+    def get_video_info(self, video_path: str) -> Tuple[int, float]:
+        """Get video frame count and FPS."""
+        cap = cv2.VideoCapture(video_path)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        cap.release()
+        return total_frames, fps
+
+    def preprocess_frame(self, frame: np.ndarray, target_size: Tuple[int, int] = (224, 224)) -> Image.Image:
+        """Preprocess frame with resizing for efficiency."""
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        image = Image.fromarray(frame_rgb)
+        return image.resize(target_size, Image.LANCZOS)
+
+    @torch.no_grad()
+    def process_batch(self, frames: List[Image.Image]) -> Tuple[np.ndarray, List[str]]:
+        """Process a batch of frames efficiently."""
+        # CLIP processing
+        clip_inputs = self.clip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
+        image_features = self.clip_model.get_image_features(**clip_inputs)
+
+        # BLIP processing
+        captions = []
+        blip_inputs = self.blip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
+        out = self.blip_model.generate(**blip_inputs, max_length=30, num_beams=2)
+
+        for o in out:
+            caption = self.blip_processor.decode(o, skip_special_tokens=True)
+            captions.append(caption)
+
+        return image_features.cpu().numpy(), captions
 
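process_batch is where the speedup comes from: one CLIP forward pass and one BLIP generate call per batch of frames instead of per frame. A self-contained sketch of the same batched CLIP call on stand-in frames, using the default checkpoint from __init__ (projection width 512 for ViT-B/32):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

frames = [Image.new("RGB", (224, 224)) for _ in range(8)]  # stand-in batch
with torch.no_grad():
    inputs = processor(images=frames, return_tensors="pt")
    features = model.get_image_features(**inputs)  # shape (8, 512)
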
     def process_video(self, video_path: str, frame_interval: int = 30) -> None:
-        """
+        """Optimized video processing with batching and progress tracking."""
         self.logger.info(f"Processing video: {video_path}")
+
+        total_frames, fps = self.get_video_info(video_path)
         cap = cv2.VideoCapture(video_path)
-
+
+        # Calculate total batches for progress bar
+        frames_to_process = total_frames // frame_interval
+        total_batches = math.ceil(frames_to_process / self.batch_size)
+
+        current_batch = []
         features_list = []
+        frame_count = 0
 
-        frame_count = 0
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            if frame_count % frame_interval == 0:
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                image = Image.fromarray(frame_rgb)
-
-                # Generate caption for the frame
-                caption = self.generate_caption(image)
+        with tqdm(total=frames_to_process, desc="Processing frames") as pbar:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
 
-                # Extract features using CLIP
-                inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
-                image_features = self.clip_model.get_image_features(**inputs)
+                if frame_count % frame_interval == 0:
+                    # Preprocess frame
+                    processed_frame = self.preprocess_frame(frame)
+                    current_batch.append(processed_frame)
+
+                    # Process batch when it reaches batch_size
+                    if len(current_batch) == self.batch_size:
+                        batch_features, batch_captions = self.process_batch(current_batch)
+
+                        # Store results
+                        for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                            batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
+                            self.frame_data.append({
+                                'frame_number': batch_frame_number,
+                                'timestamp': batch_frame_number / fps,
+                                'caption': caption
+                            })
+                            features_list.append(features)
+
+                        current_batch = []
+                        pbar.update(self.batch_size)
 
-                self.frame_data.append({
-                    'frame_number': frame_count,
-                    'timestamp': frame_count / cap.get(cv2.CAP_PROP_FPS),
-                    'caption': caption
-                })
-                features_list.append(image_features.cpu().detach().numpy())
-
-            frame_count += 1
+                frame_count += 1
 
+        # Process remaining frames
+        if current_batch:
+            batch_features, batch_captions = self.process_batch(current_batch)
+            for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                batch_frame_number = frame_count - (len(current_batch) - i - 1) * frame_interval
+                self.frame_data.append({
+                    'frame_number': batch_frame_number,
+                    'timestamp': batch_frame_number / fps,
+                    'caption': caption
+                })
+                features_list.append(features)
+
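The expression frame_count - (self.batch_size - i - 1) * frame_interval recovers each sampled frame's original index: the batch fills while frame_count still equals the last sampled frame, so counting back in steps of frame_interval reproduces the batch. A quick check with the defaults:

frame_interval, batch_size = 30, 8
frame_count = 210  # eighth sampled frame: 0, 30, ..., 210
recovered = [frame_count - (batch_size - i - 1) * frame_interval
             for i in range(batch_size)]
assert recovered == [0, 30, 60, 90, 120, 150, 180, 210]

The leftover-batch path counts back from the post-loop frame_count (one past the last frame read), so its reported indices can overshoot the true sampled frames by up to frame_interval; the progress bar is likewise only advanced for full batches.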
         cap.release()
 
         if not features_list:
             raise ValueError("No frames were processed from the video")
-
+
+        # Create FAISS index
         features_array = np.vstack(features_list)
         self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
         self.frame_index.add(features_array)
 
         self.logger.info(f"Processed {len(self.frame_data)} frames from video")
 
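The index stays IndexFlatL2, exact L2 search over raw CLIP embeddings. CLIP features are more often compared by cosine similarity; a variant that does that (an alternative, not what this commit does) normalizes the vectors and searches by inner product:

import faiss
import numpy as np

features = np.random.rand(100, 512).astype("float32")  # stand-in CLIP features
faiss.normalize_L2(features)  # in-place row normalization
index = faiss.IndexFlatIP(features.shape[1])  # inner product == cosine on unit vectors
index.add(features)
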
+
     def query_video(self, query_text: str, k: int = 5) -> List[Dict]:
         """Query the video using natural language and return relevant frames."""
         self.logger.info(f"Processing query: {query_text}")
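For reference, a hypothetical end-to-end use of the class after this change; the file name and query string are placeholders, and the printed fields assume query_video returns the stored frame_data entries:

tool = VideoRAGTool()
tool.process_video("sample.mp4", frame_interval=30)

for hit in tool.query_video("a person riding a bicycle", k=3):
    print(hit["frame_number"], round(hit["timestamp"], 2), hit["caption"])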