Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import cv2
 import numpy as np
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel, BlipProcessor, BlipForConditionalGeneration
 import torch
 from PIL import Image
 import faiss
@@ -13,13 +13,21 @@ import os
 import shutil
 
 class VideoRAGTool:
-    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
+    def __init__(self, clip_model_name: str = "openai/clip-vit-base-patch32",
+                 blip_model_name: str = "Salesforce/blip-image-captioning-base"):
         """
-        Initialize the Video RAG Tool with CLIP
+        Initialize the Video RAG Tool with CLIP and BLIP models for frame analysis and captioning.
         """
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = CLIPModel.from_pretrained(model_name).to(self.device)
-        self.processor = CLIPProcessor.from_pretrained(model_name)
+
+        # Initialize CLIP for frame retrieval
+        self.clip_model = CLIPModel.from_pretrained(clip_model_name).to(self.device)
+        self.clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
+
+        # Initialize BLIP for image captioning
+        self.blip_processor = BlipProcessor.from_pretrained(blip_model_name)
+        self.blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(self.device)
+
         self.frame_index = None
         self.frame_data = []
         self.logger = self._setup_logger()
@@ -33,6 +41,13 @@ class VideoRAGTool:
         logger.addHandler(handler)
         return logger
 
+    def generate_caption(self, image: Image.Image) -> str:
+        """Generate a description for the given image using BLIP."""
+        inputs = self.blip_processor(image, return_tensors="pt").to(self.device)
+        out = self.blip_model.generate(**inputs)
+        caption = self.blip_processor.decode(out[0], skip_special_tokens=True)
+        return caption
+
     def process_video(self, video_path: str, frame_interval: int = 30) -> None:
         """Process video file and extract features from frames."""
         self.logger.info(f"Processing video: {video_path}")
@@ -49,12 +64,17 @@
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             image = Image.fromarray(frame_rgb)
 
-            inputs = self.processor(images=image, return_tensors="pt").to(self.device)
-            image_features = self.model.get_image_features(**inputs)
+            # Generate caption for the frame
+            caption = self.generate_caption(image)
+
+            # Process frame with CLIP
+            inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
+            image_features = self.clip_model.get_image_features(**inputs)
 
             self.frame_data.append({
                 'frame_number': frame_count,
-                'timestamp': frame_count / cap.get(cv2.CAP_PROP_FPS)
+                'timestamp': frame_count / cap.get(cv2.CAP_PROP_FPS),
+                'caption': caption
             })
             features_list.append(image_features.cpu().detach().numpy())
 
@@ -75,8 +95,8 @@
         """Query the video using natural language and return relevant frames."""
         self.logger.info(f"Processing query: {query_text}")
 
-        inputs = self.processor(text=[query_text], return_tensors="pt").to(self.device)
-        text_features = self.model.get_text_features(**inputs)
+        inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
+        text_features = self.clip_model.get_text_features(**inputs)
 
         distances, indices = self.frame_index.search(
             text_features.cpu().detach().numpy(),
@@ -109,10 +129,7 @@ class VideoRAGApp:
         if video_file is None:
             return "Please upload a video first."
 
-        # video_file is now a file path provided by Gradio
         video_path = video_file.name
-
-        # Create a copy in our temp directory
         temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
         shutil.copy2(video_path, temp_video_path)
 
@@ -135,7 +152,7 @@
             results = self.rag_tool.query_video(query_text, k=4)
 
             frames = []
-
+            descriptions = []
 
             cap = cv2.VideoCapture(self.current_video_path)
 
@@ -148,13 +165,19 @@
                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 frames.append(Image.fromarray(frame_rgb))
 
-
-
-
+                description = f"Timestamp: {result['timestamp']:.2f}s\n"
+                description += f"Scene Description: {result['caption']}\n"
+                description += f"Relevance Score: {result['relevance_score']:.2f}"
+                descriptions.append(description)
 
             cap.release()
 
-
+            # Combine all descriptions with frame numbers
+            combined_description = "\n\nFrame Analysis:\n\n"
+            for i, desc in enumerate(descriptions, 1):
+                combined_description += f"Frame {i}:\n{desc}\n\n"
+
+            return frames, combined_description
 
         except Exception as e:
             return None, f"Error querying video: {str(e)}"
@@ -194,9 +217,10 @@
                 height="auto"
             )
 
-
-                label="
-                interactive=False
+            descriptions = gr.Textbox(
+                label="Scene Descriptions",
+                interactive=False,
+                lines=10
             )
 
             process_button.click(
@@ -208,7 +232,7 @@
             query_button.click(
                 fn=self.query_video,
                 inputs=[query_input],
-                outputs=[gallery,
+                outputs=[gallery, descriptions]
             )
 
         return interface
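
For reference, a minimal usage sketch of the updated class. This is hypothetical and not part of the commit: it assumes the unchanged parts of app.py build the FAISS index at the end of process_video (as the frame_index.search call above implies) and compute each result's relevance_score, and that query_video returns dicts with the timestamp, caption, and relevance_score keys used by VideoRAGApp.

# Hypothetical usage sketch, not part of the commit; assumes the unchanged
# sections of app.py populate self.frame_index from the collected CLIP
# features and attach a relevance_score to each query result.
from app import VideoRAGTool

tool = VideoRAGTool()  # loads CLIP (retrieval) and BLIP (captioning)

# Sample one frame every 30 frames; each sampled frame is captioned
# with BLIP and embedded with CLIP
tool.process_video("sample.mp4", frame_interval=30)

# Retrieve the 4 frames whose CLIP embeddings best match the text query
results = tool.query_video("a person riding a bicycle", k=4)
for r in results:
    print(f"{r['timestamp']:.2f}s | {r['caption']} | score {r['relevance_score']:.2f}")

Note that BLIP captioning now runs once per sampled frame, so smaller frame_interval values make process_video noticeably slower in exchange for denser coverage.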