Commit 1ebce69 · 1 Parent(s): b1f6733 · Latest version

Files changed:
- __notebook_source__.ipynb (+156 −57)
- faiss_index.bin (+2 −2)
__notebook_source__.ipynb
CHANGED
@@ -4,7 +4,7 @@
-get_ipython().getoutput("pip install torch torchvision transformers pillow numpy faiss-cpu opencv-python matplotlib
@@ -43,7 +43,7 @@ CONFIG = {
     'models_dir': './models',
 
     # Model settings
-    'llava_model': '
     'image_size': (224, 224),
     'batch_size': 8,
@@ -56,7 +56,7 @@ CONFIG = {
     'top_k': 3,
 
     # LLaVA settings
-    '
     'max_length': 77
 }
@@ -107,32 +107,56 @@ print("✓ Faster R-CNN loaded successfully!")
-# Load LLaVA
-print("Loading LLaVA model (
     llava_model = LlavaForConditionalGeneration.from_pretrained(
         CONFIG['llava_model'],
     )
 else:
     llava_model = LlavaForConditionalGeneration.from_pretrained(
         CONFIG['llava_model'],
-        torch_dtype=torch.
     )
-print("✓ LLaVA
@@ -163,54 +187,108 @@ def extract_rcnn_features(image_path):
         return feat
     except Exception as e:
-        print(f"Error processing {image_path}: {e}")
-        return np.zeros(1024)
 
 def extract_llava_features(image_path):
-    """Extract semantic features using LLaVA
     try:
         # Load image
         img = Image.open(image_path).convert('RGB')
 
-        #
-        prompt = "USER: <image>\
 
         # Process inputs
         inputs = llava_processor(text=prompt, images=img, return_tensors="pt")
-        inputs = {k: v.to(
 
-        # Extract visual features from
         with torch.no_grad():
-        )
-        #
     except Exception as e:
-        print(f"Error processing {image_path}: {e}")
-        return np.zeros(1024)
 
 def extract_combined_features(image_path):
     """Extract and combine features from both RCNN and LLaVA"""
     rcnn_feat = extract_rcnn_features(image_path)
     llava_feat = extract_llava_features(image_path)
 
-    #
-    #
@@ -398,19 +476,40 @@ visualize_results(query_image, results)
 # Upload and query with your own image
 
 # Search and visualize
-print(f"\
 custom_results = search_similar_images(custom_query_image, top_k=CONFIG['top_k'])
 visualize_results(custom_query_image, custom_results)
@@ -4,7 +4,7 @@
+get_ipython().getoutput("pip install torch torchvision transformers pillow numpy faiss-cpu opencv-python matplotlib kaggle tqdm scikit-learn seaborn -q")
@@ -43,7 +43,7 @@ CONFIG = {
     'models_dir': './models',
 
     # Model settings
+    'llava_model': 'xtuner/llava-phi-3-mini-hf',  # Lightweight LLaVA (~4GB vs 14GB)
     'image_size': (224, 224),
     'batch_size': 8,
@@ -56,7 +56,7 @@ CONFIG = {
     'top_k': 3,
 
     # LLaVA settings
+    'use_fp16': torch.cuda.is_available(),  # Use FP16 on GPU for memory efficiency
     'max_length': 77
 }
@@ -107,32 +107,56 @@ print("✓ Faster R-CNN loaded successfully!")
+# Load LLaVA Phi-3-Mini model (lightweight ~4GB)
+print("Loading LLaVA Phi-3-Mini model (lightweight version)...")
+print(f"Model: {CONFIG['llava_model']}")
+
+# Load processor first
+from transformers import LlavaProcessor
+
+# Use LlavaProcessor explicitly instead of AutoProcessor
+llava_processor = LlavaProcessor.from_pretrained(CONFIG['llava_model'])
+
+# Fix patch_size issue - it's in the image_processor config
+if hasattr(llava_processor, 'image_processor'):
+    if not hasattr(llava_processor.image_processor, 'patch_size') or llava_processor.image_processor.patch_size is None:
+        llava_processor.image_processor.patch_size = 14  # Standard patch size for vision transformers
+        print(f"Set image_processor.patch_size to: {llava_processor.image_processor.patch_size}")
+
+# Also set patch_size on the processor itself if it doesn't have it
+if not hasattr(llava_processor, 'patch_size') or llava_processor.patch_size is None:
+    llava_processor.patch_size = llava_processor.image_processor.patch_size
+    print(f"Set processor.patch_size to: {llava_processor.patch_size}")
+
+# Verify processor configuration
+if hasattr(llava_processor, 'image_processor') and hasattr(llava_processor.image_processor, 'size'):
+    print(f"Image processor configured: {llava_processor.image_processor.size}")
+else:
+    print("Warning: Image processor configuration may need adjustment")
+
+# Load with memory-efficient settings
+if CONFIG['use_fp16']:
+    print("Using FP16 for GPU efficiency...")
     llava_model = LlavaForConditionalGeneration.from_pretrained(
         CONFIG['llava_model'],
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        device_map="auto"
     )
 else:
+    print("Using FP32 for CPU...")
     llava_model = LlavaForConditionalGeneration.from_pretrained(
         CONFIG['llava_model'],
+        torch_dtype=torch.float32,
+        low_cpu_mem_usage=True
     )
+    llava_model = llava_model.to(device)
 
+llava_model.eval()
 
+print("✓ LLaVA Phi-3-Mini loaded successfully!")
+print(f"✓ Model size: ~4GB (much lighter than standard LLaVA 7B ~14GB)")
+print(f"✓ Memory efficient and faster inference!")
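After loading, a quick smoke test can catch processor or vision-tower problems before the whole dataset is indexed. The snippet below is a minimal sketch that is not part of the commit: the 336x336 blank test image and the printed shape check are assumptions, and the code only reuses objects the notebook already defines (llava_processor, llava_model, device).

# Sanity-check sketch (not in the commit): run the processor and vision tower once
# on a blank test image and report the visual feature shape.
from PIL import Image

test_img = Image.new('RGB', (336, 336), color='white')
test_inputs = llava_processor(text="USER: <image>\nASSISTANT:", images=test_img, return_tensors="pt")
test_inputs = {k: v.to(device) for k, v in test_inputs.items()}

with torch.no_grad():
    tower = getattr(llava_model, 'vision_tower', None)
    if tower is not None:
        out = tower(test_inputs['pixel_values'])
        feats = out.last_hidden_state if hasattr(out, 'last_hidden_state') else out[0]
        print("Vision feature shape:", tuple(feats.shape))  # (1, num_patches, hidden_dim)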
@@ -163,54 +187,108 @@ def extract_rcnn_features(image_path):
         return feat
     except Exception as e:
+        print(f"Error in RCNN processing {image_path}: {e}")
+        return np.zeros(1024, dtype=np.float32)
 
 def extract_llava_features(image_path):
+    """Extract semantic features using LLaVA vision encoder (FAST - no text generation)"""
     try:
         # Load image
         img = Image.open(image_path).convert('RGB')
 
+        # Process image only (minimal prompt for processor)
+        prompt = "USER: <image>\nASSISTANT:"
 
         # Process inputs
         inputs = llava_processor(text=prompt, images=img, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
 
+        # Extract visual features directly from vision tower
         with torch.no_grad():
+            # Try to get vision tower
+            if hasattr(llava_model, 'get_vision_tower'):
+                vision_tower = llava_model.get_vision_tower()
+            elif hasattr(llava_model, 'vision_tower'):
+                vision_tower = llava_model.vision_tower
+            else:
+                vision_tower = None
+
+            # Use vision tower directly if available (fastest)
+            if vision_tower is not None and 'pixel_values' in inputs:
+                image_outputs = vision_tower(inputs['pixel_values'])
+
+                # Handle different output types
+                if hasattr(image_outputs, 'pooler_output'):
+                    # Use pooled output if available (pre-computed pooling)
+                    visual_features = image_outputs.pooler_output.squeeze()
+                elif hasattr(image_outputs, 'last_hidden_state'):
+                    # Pool the last hidden state
+                    visual_features = image_outputs.last_hidden_state.mean(dim=1).squeeze()
+                elif isinstance(image_outputs, tuple):
+                    # Handle tuple output
+                    hidden_state = image_outputs[0]
+                    visual_features = hidden_state.mean(dim=1).squeeze()
+                else:
+                    # Fallback: assume it's a tensor
+                    if image_outputs.dim() > 2:
+                        visual_features = image_outputs.mean(dim=1).squeeze()
+                    else:
+                        visual_features = image_outputs.squeeze()
+            else:
+                # Fallback: use model forward pass
+                outputs = llava_model(
+                    input_ids=inputs['input_ids'],
+                    attention_mask=inputs.get('attention_mask'),
+                    pixel_values=inputs.get('pixel_values'),
+                    output_hidden_states=True
+                )
+                visual_features = outputs.hidden_states[-1].mean(dim=1).squeeze()
+
+        # Convert to numpy
+        if isinstance(visual_features, torch.Tensor):
+            visual_features = visual_features.cpu().numpy()
+
+        # Ensure it's a 1D array
+        if visual_features.ndim == 0:
+            visual_features = np.array([visual_features], dtype=np.float32)
+        elif visual_features.ndim > 1:
+            visual_features = visual_features.flatten()
+
+        # Ensure float32 dtype
+        visual_features = visual_features.astype(np.float32)
+
+        # Resize to exactly 1024 dimensions
+        current_size = visual_features.shape[0]
+        if current_size < 1024:
+            padding = np.zeros(1024 - current_size, dtype=np.float32)
+            visual_features = np.concatenate([visual_features, padding])
+        elif current_size > 1024:
+            visual_features = visual_features[:1024]
+
+        return visual_features
+
     except Exception as e:
+        print(f"Error in LLaVA processing {image_path}: {e}")
+        return np.zeros(1024, dtype=np.float32)
 
 def extract_combined_features(image_path):
     """Extract and combine features from both RCNN and LLaVA"""
     rcnn_feat = extract_rcnn_features(image_path)
     llava_feat = extract_llava_features(image_path)
 
+    # Ensure both are numpy arrays with correct dtype
+    rcnn_feat = np.array(rcnn_feat, dtype=np.float32)
+    llava_feat = np.array(llava_feat, dtype=np.float32)
 
+    # Ensure correct shapes
+    if rcnn_feat.shape[0] != 1024:
+        rcnn_feat = np.resize(rcnn_feat, 1024).astype(np.float32)
+    if llava_feat.shape[0] != 1024:
+        llava_feat = np.resize(llava_feat, 1024).astype(np.float32)
 
+    # Concatenate features
+    combined = np.concatenate([rcnn_feat, llava_feat])
+    return combined
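The commit also replaces faiss_index.bin, but the indexing code itself falls outside these hunks. As a rough sketch of how the 2048-dimensional combined vectors (1024 from the R-CNN plus 1024 from LLaVA) could feed that index, the snippet below uses a flat inner-product index over L2-normalised vectors; the index type and the build_index helper name are assumptions, not something the diff shows.

# Sketch (assumed, not shown in this diff): building the FAISS index from combined features.
import faiss
import numpy as np

def build_index(image_paths):
    feats = np.stack([extract_combined_features(p) for p in image_paths]).astype(np.float32)
    faiss.normalize_L2(feats)                    # in-place normalisation: inner product == cosine
    index = faiss.IndexFlatIP(feats.shape[1])    # 2048 dimensions
    index.add(feats)
    faiss.write_index(index, 'faiss_index.bin')
    return index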
|
| 477 |
|
| 478 |
# Upload and query with your own image
|
| 479 |
+
|
| 480 |
+
# Method 1: Use a random image from the dataset
|
| 481 |
+
# custom_query_image = random.choice(valid_image_paths)
|
| 482 |
+
# print(f"Using sample image: {custom_query_image}")
|
| 483 |
+
# print("\nTo use your own image on Kaggle:")
|
| 484 |
+
# print("1. Click 'Add Data' in the right sidebar")
|
| 485 |
+
# print("2. Upload your image or add a dataset")
|
| 486 |
+
# print("3. Update the path below to: '/kaggle/input/YOUR_DATASET/your_image.jpg'")
|
| 487 |
+
# print("\nAlternatively, uncomment and modify one of the options below:\n")
|
| 488 |
+
|
| 489 |
+
# Method 2: Specify a path to your uploaded image (Kaggle)
|
| 490 |
+
custom_query_image = '/kaggle/input/query-image-1/images.jpg'
|
| 491 |
+
|
| 492 |
+
# Method 3: Use Kaggle's file upload (interactive)
|
| 493 |
+
# Uncomment the code below to enable:
|
| 494 |
+
# from IPython.display import FileUpload
|
| 495 |
+
# import shutil
|
| 496 |
+
# print("Upload your image:")
|
| 497 |
+
# # Note: You'll need to manually upload via Kaggle's interface
|
| 498 |
+
# # Then specify the path like: custom_query_image = '/kaggle/working/uploaded_image.jpg'
|
| 499 |
+
|
| 500 |
+
# Method 4: Google Colab upload (if running on Colab instead)
|
| 501 |
+
# try:
|
| 502 |
+
# from google.colab import files
|
| 503 |
+
# uploaded = files.upload()
|
| 504 |
+
# custom_query_image = list(uploaded.keys())[0]
|
| 505 |
+
# print(f"Uploaded: {custom_query_image}")
|
| 506 |
+
# except:
|
| 507 |
+
# pass
|
| 508 |
|
| 509 |
# Search and visualize
|
| 510 |
+
print(f"\n{'='*60}")
|
| 511 |
+
print(f"Searching for images similar to: {custom_query_image}")
|
| 512 |
+
print(f"{'='*60}\n")
|
| 513 |
custom_results = search_similar_images(custom_query_image, top_k=CONFIG['top_k'])
|
| 514 |
visualize_results(custom_query_image, custom_results)
|
| 515 |
|
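Since the Method 2 path is specific to one Kaggle dataset, a small guard placed just before the search call above would keep the cell runnable elsewhere by falling back to Method 1. This is a sketch, not part of the commit; it assumes valid_image_paths (referenced in the Method 1 comment) is defined earlier in the notebook.

# Optional guard (sketch): fall back to a random dataset image if the Kaggle path is absent.
import os
import random

if not os.path.exists(custom_query_image):
    custom_query_image = random.choice(valid_image_paths)
    print(f"Query image not found; using sample instead: {custom_query_image}")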
faiss_index.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:016ba66879e3ece45f30ef6e9febf9e8734a7587b5f40755f3bbb04579f213b3
+size 250732589
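The two lines above are a Git LFS pointer, so the repository stores only the hash and byte size (about 250 MB) while the index binary itself lives in LFS storage. Once the real file is pulled, a quick check like the sketch below (assuming the faiss package is installed) confirms the dimension and the number of stored vectors.

# Inspect the committed index (sketch):
import faiss

index = faiss.read_index('faiss_index.bin')
print(index.d, index.ntotal)  # feature dimension and number of indexed images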