Update app.py
app.py CHANGED

Old version (removed lines are prefixed with "-"; several removed lines are cut off in this view and shown with "..."):

@@ -1,6 +1,11 @@
import cv2
import numpy as np
-from transformers import ...
import torch
from PIL import Image
import faiss
@@ -11,233 +16,247 @@ import tempfile
import os
import shutil
from tqdm import tqdm
-import math

-class ...
-    def __init__(self ...
-                 blip_model_name: str = "Salesforce/blip-image-captioning-base"):
-        """Initialize with performance optimizations."""
-        # Setup logger first to avoid the attribute error
        self.logger = self.setup_logger()
-
-        self.logger.info("Initializing VideoRAGTool...")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.logger.info(f"Using device: {self.device}")
-
-        # Initialize ...
-        self. ...
        self.frame_index = None
        self.frame_data = []

    def setup_logger(self) -> logging.Logger:
-        ...
-        logger = logging.getLogger('VideoRAGTool')
-
-        # Clear any existing handlers
        if logger.handlers:
            logger.handlers.clear()
-
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
-        formatter = logging.Formatter('%(asctime)s - %( ...
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    @torch.no_grad()
-    def ...
-        """ ...
        try:
-            ...
-            caption = self. ...
        except Exception as e:
-            self.logger.error(f"Error ...
-            return " ...

-    def ...
-        """ ...
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()
-        return ...
-
-    def preprocess_frame(self, frame: np.ndarray, target_size: Tuple[int, int] = (224, 224)) -> Image.Image:
-        """Preprocess frame with resizing for efficiency."""
-        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-        image = Image.fromarray(frame_rgb)
-        return image.resize(target_size, Image.LANCZOS)

    @torch.no_grad()
-    def ...
-        """Process ...
-        try:
-            # CLIP processing
-            clip_inputs = self.clip_processor(images=frames, return_tensors="pt", padding=True).to(self.device)
-            image_features = self.clip_model.get_image_features(**clip_inputs)
-
-            # BLIP processing
-            captions = []
-            for frame in frames:
-                caption = self.generate_caption(frame)
-                captions.append(caption)
-
-            return image_features.cpu().numpy(), captions
-        except Exception as e:
-            self.logger.error(f"Error processing batch: {str(e)}")
-            raise
-
-    def process_video(self, video_path: str, frame_interval: int = 30) -> None:
-        """Optimized video processing with batching and progress tracking."""
        self.logger.info(f"Processing video: {video_path}")

        try:
-            ...
-            # Calculate total batches for progress bar
-            frames_to_process = total_frames // frame_interval
-            total_batches = math.ceil(frames_to_process / self.batch_size)
-
-            current_batch = []
-            features_list = []
-            frame_count = 0

-            ...

-                    processed_frame = self.preprocess_frame(frame)
-                    current_batch.append(processed_frame)
-
-                    # Process batch when it reaches batch_size
-                    if len(current_batch) == self.batch_size:
-                        batch_features, batch_captions = self.process_batch(current_batch)
-
-                        # Store results
-                        for i, (features, caption) in enumerate(zip(batch_features, batch_captions)):
-                            batch_frame_number = frame_count - (self.batch_size - i - 1) * frame_interval
-                            self.frame_data.append({
-                                'frame_number': batch_frame_number,
-                                'timestamp': batch_frame_number / fps,
-                                'caption': caption
-                            })
-                            features_list.append(features)
-
-                        current_batch = []
-                        pbar.update(self.batch_size)

-            ...
-            if not features_list:
-                raise ValueError("No frames were processed from the video")
-
            # Create FAISS index
-            ...

        except Exception as e:
            self.logger.error(f"Error processing video: {str(e)}")
            raise

-    ...
        try:
-            ...

            distances, indices = self.frame_index.search(
-                text_features.cpu(). ...
                k
            )

            results = []
-            for ...
                frame_info = self.frame_data[idx].copy()
                frame_info['relevance_score'] = float(1 / (1 + distance))
-                results.append(frame_info)

            return results
        except Exception as e:
            self.logger.error(f"Error querying video: {str(e)}")
            raise

-class ...
    def __init__(self):
-        self. ...
        self.current_video_path = None
        self.processed = False
        self.temp_dir = tempfile.mkdtemp()

    def __del__(self):
-        """Cleanup temporary files on deletion"""
        if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_video(self, video_file):
-        """Process ...
        try:
            if video_file is None:
-                return "Please upload a video first."

            video_path = video_file.name
            temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
            shutil.copy2(video_path, temp_video_path)

            self.current_video_path = temp_video_path
-
-            self.rag_tool.process_video(self.current_video_path)
            self.processed = True
-

        except Exception as e:
            self.processed = False
-            return f"Error processing video: {str(e)}"

    def query_video(self, query_text):
-        """Query ...
        if not self.processed:
            return None, "Please process a video first."

        try:
-            results = self. ...
-
            frames = []
            descriptions = []

@@ -254,13 +273,14 @@ class VideoRAGApp:

                description = f"Timestamp: {result['timestamp']:.2f}s\n"
                description += f"Scene Description: {result['caption']}\n"
                description += f"Relevance Score: {result['relevance_score']:.2f}"
                descriptions.append(description)

            cap.release()

-
-            combined_description = "\n\nFrame Analysis:\n\n"
            for i, desc in enumerate(descriptions, 1):
                combined_description += f"Frame {i}:\n{desc}\n\n"

@@ -270,42 +290,43 @@ class VideoRAGApp:
            return None, f"Error querying video: {str(e)}"

    def create_interface(self):
-        """Create ...
-        with gr.Blocks(title="Video ...
-            gr.Markdown("# Video ...
-            gr.Markdown("Upload a video and ask questions about its content!")

            with gr.Row():
                video_input = gr.File(
-                    label="Upload Video",
                    file_types=["video"],
                )
                process_button = gr.Button("Process Video")

-            ...

            with gr.Row():
                query_input = gr.Textbox(
-                    label="Ask about the video",
                    placeholder="What's happening in the video?"
                )
                query_button = gr.Button("Search")

-            ...
-            )

            descriptions = gr.Textbox(
-                label="Scene ...
                interactive=False,
                lines=10
            )

@@ -313,7 +334,7 @@ class VideoRAGApp:
            process_button.click(
                fn=self.process_video,
                inputs=[video_input],
-                outputs=[status_output]
            )

            query_button.click(

@@ -325,7 +346,7 @@ class VideoRAGApp:
        return interface

# Initialize and create the interface
-app = ...
interface = app.create_interface()

# Launch the app

New version (added lines are prefixed with "+"; unchanged lines between hunks are marked "..."):

import cv2
import numpy as np
+from transformers import (
+    CLIPProcessor, CLIPModel,
+    BlipProcessor, BlipForConditionalGeneration,
+    Blip2Processor, Blip2ForConditionalGeneration,
+    AutoProcessor, AutoModelForObjectDetection
+)
import torch
from PIL import Image
import faiss
...
import os
import shutil
from tqdm import tqdm

+class EnhancedVideoAnalyzer:
+    def __init__(self):
        self.logger = self.setup_logger()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.logger.info(f"Using device: {self.device}")
+
+        # Initialize CLIP for general scene understanding
+        self.logger.info("Loading CLIP model...")
+        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
+        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+
+        # Initialize BLIP-2 for detailed scene description
+        self.logger.info("Loading BLIP-2 model...")
+        self.blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        self.blip2_model = Blip2ForConditionalGeneration.from_pretrained(
+            "Salesforce/blip2-opt-2.7b",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        ).to(self.device)
+
+        # Initialize Object Detection model
+        self.logger.info("Loading object detection model...")
+        self.obj_processor = AutoProcessor.from_pretrained("microsoft/table-transformer-detection")
+        self.obj_model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection").to(self.device)
+
        self.frame_index = None
        self.frame_data = []
+        self.target_size = (384, 384)  # Increased size for better detail recognition
+        self.batch_size = 4
+
+        # Set all models to evaluation mode
+        self.clip_model.eval()
+        self.blip2_model.eval()
+        self.obj_model.eval()

    def setup_logger(self) -> logging.Logger:
+        logger = logging.getLogger('EnhancedVideoAnalyzer')
        if logger.handlers:
            logger.handlers.clear()
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
+        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    @torch.no_grad()
+    def analyze_frame(self, image: Image.Image) -> Dict:
+        """Comprehensive frame analysis"""
        try:
+            # 1. Generate detailed caption using BLIP-2
+            inputs = self.blip2_processor(image, return_tensors="pt").to(self.device, torch.float16)
+            caption = self.blip2_model.generate(**inputs, max_new_tokens=50)
+            caption_text = self.blip2_processor.decode(caption[0], skip_special_tokens=True)
+
+            # 2. Detect objects
+            obj_inputs = self.obj_processor(images=image, return_tensors="pt").to(self.device)
+            obj_outputs = self.obj_model(**obj_inputs)
+
+            # Process object detection results
+            target_sizes = torch.tensor([image.size[::-1]])
+            results = self.obj_processor.post_process_object_detection(
+                obj_outputs, threshold=0.5, target_sizes=target_sizes
+            )[0]
+
+            detected_objects = []
+            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+                detected_objects.append({
+                    "label": self.obj_processor.model.config.id2label[label.item()],
+                    "confidence": score.item()
+                })
+
+            return {
+                "caption": caption_text,
+                "objects": detected_objects
+            }
+
        except Exception as e:
+            self.logger.error(f"Error in frame analysis: {str(e)}")
+            return {"caption": "Error analyzing frame", "objects": []}

+    def extract_keyframes(self, video_path: str, max_frames: int = 15) -> List[Tuple[int, np.ndarray]]:
+        """Extract key frames using scene detection"""
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
+
+        # Calculate frame interval to get approximately max_frames
+        frame_interval = max(1, total_frames // max_frames)
+
+        frames = []
+        frame_positions = []
+        prev_gray = None
+
+        with tqdm(total=total_frames, desc="Extracting frames") as pbar:
+            while cap.isOpened() and len(frames) < max_frames:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+
+                # Convert to grayscale for scene detection
+                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+
+                if prev_gray is not None:
+                    # Calculate frame difference
+                    diff = cv2.absdiff(gray, prev_gray)
+                    mean_diff = np.mean(diff)
+
+                    # If significant change or first/last frame
+                    if mean_diff > 30 or len(frames) == 0:
+                        frames.append(frame)
+                        frame_positions.append(cap.get(cv2.CAP_PROP_POS_FRAMES))
+
+                prev_gray = gray
+                pbar.update(1)
+
        cap.release()
+        return list(zip(frame_positions, frames))

    @torch.no_grad()
+    def process_video(self, video_path: str) -> None:
+        """Process video with comprehensive analysis"""
        self.logger.info(f"Processing video: {video_path}")
+        self.frame_data = []
+        features_list = []

        try:
+            # Extract key frames
+            keyframes = self.extract_keyframes(video_path)
+            self.logger.info(f"Extracted {len(keyframes)} key frames")

+            # Process frames with progress bar
+            with tqdm(total=len(keyframes), desc="Analyzing frames") as pbar:
+                for frame_pos, frame in keyframes:
+                    # Convert frame to PIL Image
+                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                    image = Image.fromarray(frame_rgb).resize(self.target_size, Image.LANCZOS)

+                    # Analyze frame
+                    analysis = self.analyze_frame(image)

+                    # Get CLIP features
+                    clip_inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
+                    image_features = self.clip_model.get_image_features(**clip_inputs)
+
+                    # Store results
+                    self.frame_data.append({
+                        'frame_number': int(frame_pos),
+                        'timestamp': frame_pos / 30.0,  # Approximate timestamp
+                        'caption': analysis['caption'],
+                        'objects': analysis['objects']
+                    })
+
+                    features_list.append(image_features.cpu().numpy())
+                    pbar.update(1)
+
            # Create FAISS index
+            if features_list:
+                features_array = np.vstack(features_list)
+                self.frame_index = faiss.IndexFlatL2(features_array.shape[1])
+                self.frame_index.add(features_array)
+
+            self.logger.info("Video processing completed successfully")

        except Exception as e:
            self.logger.error(f"Error processing video: {str(e)}")
            raise

+    @torch.no_grad()
+    def query_video(self, query_text: str, k: int = 4) -> List[Dict]:
+        """Enhanced query processing"""
        try:
+            # Process query with CLIP
+            text_inputs = self.clip_processor(text=[query_text], return_tensors="pt").to(self.device)
+            text_features = self.clip_model.get_text_features(**text_inputs)

+            # Search for relevant frames
            distances, indices = self.frame_index.search(
+                text_features.cpu().numpy(),
                k
            )

+            # Prepare results with enhanced information
            results = []
+            for distance, idx in zip(distances[0], indices[0]):
                frame_info = self.frame_data[idx].copy()
+
+                # Add relevance score
                frame_info['relevance_score'] = float(1 / (1 + distance))

+                # Add object summary
+                obj_summary = ", ".join(obj["label"] for obj in frame_info['objects'][:3])
+                if obj_summary:
+                    frame_info['object_summary'] = f"Objects detected: {obj_summary}"
+
+                results.append(frame_info)
+
            return results
+
        except Exception as e:
            self.logger.error(f"Error querying video: {str(e)}")
            raise

+class VideoQAApp:
    def __init__(self):
+        self.analyzer = EnhancedVideoAnalyzer()
        self.current_video_path = None
        self.processed = False
        self.temp_dir = tempfile.mkdtemp()

    def __del__(self):
        if hasattr(self, 'temp_dir') and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def process_video(self, video_file):
+        """Process video with progress updates"""
        try:
            if video_file is None:
+                return "Please upload a video first.", gr.Progress(0)

            video_path = video_file.name
            temp_video_path = os.path.join(self.temp_dir, "current_video.mp4")
            shutil.copy2(video_path, temp_video_path)

            self.current_video_path = temp_video_path
+            self.analyzer.process_video(self.current_video_path)
            self.processed = True
+
+            return "Video processed successfully! You can now ask questions about the video.", gr.Progress(100)

        except Exception as e:
            self.processed = False
+            return f"Error processing video: {str(e)}", gr.Progress(0)

    def query_video(self, query_text):
+        """Query video with comprehensive results"""
        if not self.processed:
            return None, "Please process a video first."

        try:
+            results = self.analyzer.query_video(query_text)
            frames = []
            descriptions = []

...

                description = f"Timestamp: {result['timestamp']:.2f}s\n"
                description += f"Scene Description: {result['caption']}\n"
+                if 'object_summary' in result:
+                    description += f"{result['object_summary']}\n"
                description += f"Relevance Score: {result['relevance_score']:.2f}"
                descriptions.append(description)

            cap.release()

+            combined_description = "\n\nScene Analysis:\n\n"
            for i, desc in enumerate(descriptions, 1):
                combined_description += f"Frame {i}:\n{desc}\n\n"

...

            return None, f"Error querying video: {str(e)}"

    def create_interface(self):
+        """Create Gradio interface"""
+        with gr.Blocks(title="Video Question Answering") as interface:
+            gr.Markdown("# Advanced Video Question Answering")
+            gr.Markdown("Upload a video and ask questions about any aspect of its content!")

            with gr.Row():
                video_input = gr.File(
+                    label="Upload Video (Recommended: 30 seconds to 5 minutes)",
                    file_types=["video"],
                )
                process_button = gr.Button("Process Video")

+            with gr.Row():
+                status_output = gr.Textbox(
+                    label="Status",
+                    interactive=False
+                )
+                progress = gr.Progress()

            with gr.Row():
                query_input = gr.Textbox(
+                    label="Ask anything about the video",
                    placeholder="What's happening in the video?"
                )
                query_button = gr.Button("Search")

+            gallery = gr.Gallery(
+                label="Retrieved Frames",
+                show_label=True,
+                elem_id="gallery",
+                columns=[2],
+                rows=[2],
+                height="auto"
+            )

            descriptions = gr.Textbox(
+                label="Scene Analysis",
                interactive=False,
                lines=10
            )

...

            process_button.click(
                fn=self.process_video,
                inputs=[video_input],
+                outputs=[status_output, progress]
            )

            query_button.click(

...

        return interface

# Initialize and create the interface
+app = VideoQAApp()
interface = app.create_interface()

# Launch the app
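
For reference, a minimal sketch of how the EnhancedVideoAnalyzer added in this commit could be driven outside the Gradio UI. The file name sample.mp4 and the query string are hypothetical and not part of the commit; only the methods and result fields defined above are assumed.

    # Hypothetical standalone usage (assumes this runs inside, or imports from, app.py)
    analyzer = EnhancedVideoAnalyzer()
    analyzer.process_video("sample.mp4")  # extracts keyframes, captions them, builds the FAISS index
    for hit in analyzer.query_video("a person riding a bicycle", k=4):
        # each hit is a frame_data entry plus 'relevance_score' (and 'object_summary' when objects were found)
        print(f"{hit['timestamp']:.2f}s  score={hit['relevance_score']:.2f}  {hit['caption']}")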