Update app.py
app.py CHANGED
@@ -11,86 +11,144 @@ import gradio as gr
 import tempfile
 import os
 import shutil
+from tqdm import tqdm
+import torch.nn as nn
+import math
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
 
 class VideoRAGTool:
     def __init__(self, clip_model_name: str = "openai/clip-vit-base-patch32",
                  blip_model_name: str = "Salesforce/blip-image-captioning-base"):
-        """
-        Initialize the Video RAG Tool with CLIP and BLIP models for frame analysis and captioning.
-        """
+        """Initialize with performance optimizations."""
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-        # Initialize
+        # Initialize models with optimization flags
         self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)
         self.clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
-
-        # Initialize BLIP for image captioning
         self.blip_processor = BlipProcessor.from_pretrained(blip_model_name)
         self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(self.device)
 
+        # Enable eval mode for inference
+        self.clip_model.eval()
+        self.blip_model.eval()
+
+        # Batch processing settings
+        self.batch_size = 8  # Adjust based on your GPU memory
+
         self.frame_index = None
         self.frame_data = []
         self.logger = self._setup_logger()
 
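The constructor pins self.batch_size to 8 and leaves tuning to a comment. A minimal sketch of deriving the batch size from free GPU memory instead — pick_batch_size and its frames-per-free-gigabyte factor are hypothetical, for illustration only:

import torch

def pick_batch_size(default: int = 8, frames_per_free_gb: int = 4) -> int:
    # Hypothetical heuristic: grow the batch with free GPU memory.
    if not torch.cuda.is_available():
        return default
    free_bytes, _total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes
    return max(default, int(free_bytes / 1024**3) * frames_per_free_gb)
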
-    def _setup_logger(self):
-        logger = logging.getLogger('VideoRAGTool')
-        logger.setLevel(logging.INFO)
-        handler = logging.StreamHandler()
-        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-        handler.setFormatter(formatter)
-        logger.addHandler(handler)
-        return logger
-
+    @torch.no_grad()  # Disable gradient computation for inference
     def generate_caption(self, image: Image.Image) -> str:
-        """
+        """Optimized caption generation."""
         inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
-        out = self.blip_model.generate(**inputs)
-
-
+        out = self.blip_model.generate(**inputs, max_length=30, num_beams=2)
+        return self.blip_processor.decode(out[0], skip_special_tokens=True)
+
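Applied as a decorator, @torch.no_grad() turns off autograd for the whole call, trimming memory and latency for pure inference. It is equivalent to the context-manager form, as this small sketch shows:

import torch

@torch.no_grad()
def double(x: torch.Tensor) -> torch.Tensor:
    return x * 2  # gradient tracking is off for the whole body

def double_explicit(x: torch.Tensor) -> torch.Tensor:
    # Equivalent context-manager form
    with torch.no_grad():
        return x * 2
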
+    def get_video_info(self, video_path: str) -> Tuple[int, float]:
+        """Get video frame count and FPS."""
+        cap = cv2.VideoCapture(video_path)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        cap.release()
+        return total_frames, fps
+
+    def preprocess_frame(self, frame: np.ndarray, target_size: Tuple[int, int] = (224, 224)) -> Image.Image:
+        """Preprocess frame with resizing for efficiency."""
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        image = Image.fromarray(frame_rgb)
+        return image.resize(target_size, Image.LANCZOS)
+
+    @torch.no_grad()
+    def process_batch(self, frames: List[Image.Image]) -> Tuple[np.ndarray, List[str]]:
+        """Process a batch of frames efficiently."""
+        # CLIP processing
+        clip_inputs = self.clip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
+        image_features = self.clip_model.get_image_features(**clip_inputs)
+
+        # BLIP processing
+        captions = []
+        blip_inputs = self.blip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
+        out = self.blip_model.generate(**blip_inputs, max_length=30, num_beams=2)
+
+        for o in out:
+            caption = self.blip_processor.decode(o, skip_special_tokens=True)
+            captions.append(caption)
+
+        return image_features.cpu().numpy(), captions
 
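process_batch is where the speedup comes from: one CLIP forward pass and one BLIP generate call per batch of frames instead of per frame. A self-contained sketch of the same batched CLIP call on stand-in frames, using the default checkpoint from __init__ (projection width 512 for ViT-B/32):

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

frames = [Image.new("RGB", (224, 224)) for _ in range(8)]  # stand-in batch
with torch.no_grad():
    inputs = processor(images=frames, return_tensors="pt")
    features = model.get_image_features(**inputs)  # shape (8, 512)
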
     def process_video(self, video_path: str, frame_interval: int = 30) -> None:
-        """
+        """Optimized video processing with batching and progress tracking."""
         self.logger.info(f"Processing video: {video_path}")
+
+        total_frames, fps = self.get_video_info(video_path)
         cap = cv2.VideoCapture(video_path)
-
+
+        # Calculate total batches for progress bar
+        frames_to_process = total_frames // frame_interval
+        total_batches = math.ceil(frames_to_process / self.batch_size)
+
+        current_batch = []
         features_list = []
+        frame_count = 0
 
-        frame_count = 0
-        while cap.isOpened():
-            ret, frame = cap.read()
-            if not ret:
-                break
-            if frame_count % frame_interval == 0:
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                image = Image.fromarray(frame_rgb)
-
-                # Generate caption for the frame
-                caption = self.generate_caption(image)
+        with tqdm(total=frames_to_process, desc="Processing frames") as pbar:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
 
-                # Extract features using CLIP
-                inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
-                image_features = self.clip_model.get_image_features(**inputs)
+                if frame_count % frame_interval == 0:
+                    # Preprocess frame
+                    processed_frame = self.preprocess_frame(frame)
+                    current_batch.append(processed_frame)
+
+                    # Process batch when it reaches batch_size
+                    if len(current_batch) == self.batch_size:
+                        batch_features, batch_captions = self.process_batch(current_batch)
+
+                        # Store results
+                        for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                            batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
+                            self.frame_data.append({
+                                'frame_number': batch_frame_number,
+                                'timestamp': batch_frame_number / fps,
+                                'caption': caption
+                            })
+                            features_list.append(features)
+
+                        current_batch = []
+                        pbar.update(self.batch_size)
 
-                self.frame_data.append({
-                    'frame_number': frame_count,
-                    'timestamp': frame_count / cap.get(cv2.CAP_PROP_FPS),
-                    'caption': caption
-                })
-                features_list.append(image_features.cpu().detach().numpy())
-
-            frame_count += 1
+                frame_count += 1
 
+        # Process remaining frames
+        if current_batch:
+            batch_features, batch_captions = self.process_batch(current_batch)
+            for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
+                batch_frame_number = frame_count - (len(current_batch) - i - 1) * frame_interval
+                self.frame_data.append({
+                    'frame_number': batch_frame_number,
+                    'timestamp': batch_frame_number / fps,
+                    'caption': caption
+                })
+                features_list.append(features)
+
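The expression frame_count - (self.batch_size - i - 1) * frame_interval recovers each sampled frame's original index: the batch fills while frame_count still equals the last sampled frame, so counting back in steps of frame_interval reproduces the batch. A quick check with the defaults:

frame_interval, batch_size = 30, 8
frame_count = 210  # eighth sampled frame: 0, 30, ..., 210
recovered = [frame_count - (batch_size - i - 1) * frame_interval
             for i in range(batch_size)]
assert recovered == [0, 30, 60, 90, 120, 150, 180, 210]

The leftover-batch path counts back from the post-loop frame_count (one past the last frame read), so its reported indices can overshoot the true sampled frames by up to frame_interval; the progress bar is likewise only advanced for full batches.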
         cap.release()
 
         if not features_list:
             raise ValueError("No frames were processed from the video")
-
+
+        # Create FAISS index
         features_array = np.vstack(features_list)
         self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
         self.frame_index.add(features_array)
 
         self.logger.info(f"Processed {len(self.frame_data)} frames from video")
 
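The index stays IndexFlatL2, exact L2 search over raw CLIP embeddings. CLIP features are more often compared by cosine similarity; a variant that does that (an alternative, not what this commit does) normalizes the vectors and searches by inner product:

import faiss
import numpy as np

features = np.random.rand(100, 512).astype("float32")  # stand-in CLIP features
faiss.normalize_L2(features)  # in-place row normalization
index = faiss.IndexFlatIP(features.shape[1])  # inner product == cosine on unit vectors
index.add(features)
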
+
     def query_video(self, query_text: str, k: int = 5) -> List[Dict]:
         """Query the video using natural language and return relevant frames."""
         self.logger.info(f"Processing query: {query_text}")
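For reference, a hypothetical end-to-end use of the class after this change; the file name and query string are placeholders, and the printed fields assume query_video returns the stored frame_data entries:

tool = VideoRAGTool()
tool.process_video("sample.mp4", frame_interval=30)

for hit in tool.query_video("a person riding a bicycle", k=3):
    print(hit["frame_number"], round(hit["timestamp"], 2), hit["caption"])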