IT4CHI2311 committed on
Commit
1ebce69
·
1 Parent(s): b1f6733

Latest version

Browse files
Files changed (2) hide show
  1. __notebook_source__.ipynb +156 -57
  2. faiss_index.bin +2 -2
__notebook_source__.ipynb CHANGED
@@ -4,7 +4,7 @@
4
 
5
 
6
 
7
- get_ipython().getoutput("pip install torch torchvision transformers pillow numpy faiss-cpu opencv-python matplotlib accelerate bitsandbytes kaggle tqdm scikit-learn seaborn -q")
8
 
9
 
10
 
@@ -43,7 +43,7 @@ CONFIG = {
43
  'models_dir': './models',
44
 
45
  # Model settings
46
- 'llava_model': 'llava-hf/llava-1.5-7b-hf',
47
  'image_size': (224, 224),
48
  'batch_size': 8,
49
 
@@ -56,7 +56,7 @@ CONFIG = {
56
  'top_k': 3,
57
 
58
  # LLaVA settings
59
- 'use_4bit': True, # For memory efficiency on Kaggle
60
  'max_length': 77
61
  }
62
 
@@ -107,32 +107,56 @@ print("✓ Faster R-CNN loaded successfully!")
107
 
108
 
109
 
110
- # Load LLaVA model with 4-bit quantization for memory efficiency
111
- print("Loading LLaVA model (this may take a few minutes)...")
 
112
 
113
- from transformers import BitsAndBytesConfig
 
114
 
115
- if CONFIG['use_4bit']:
116
- quantization_config = BitsAndBytesConfig(
117
- load_in_4bit=True,
118
- bnb_4bit_compute_dtype=torch.float16
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  llava_model = LlavaForConditionalGeneration.from_pretrained(
121
  CONFIG['llava_model'],
122
- quantization_config=quantization_config,
123
- device_map="auto",
124
- low_cpu_mem_usage=True
125
  )
126
  else:
 
127
  llava_model = LlavaForConditionalGeneration.from_pretrained(
128
  CONFIG['llava_model'],
129
- torch_dtype=torch.float16,
130
- device_map="auto"
131
  )
 
132
 
133
- llava_processor = AutoProcessor.from_pretrained(CONFIG['llava_model'])
134
 
135
- print("✓ LLaVA model loaded successfully!")
 
 
136
 
137
 
138
 
@@ -163,54 +187,108 @@ def extract_rcnn_features(image_path):
163
 
164
  return feat
165
  except Exception as e:
166
- print(f"Error processing {image_path}: {e}")
167
- return np.zeros(1024)
168
 
169
  def extract_llava_features(image_path):
170
- """Extract semantic features using LLaVA model"""
171
  try:
172
  # Load image
173
  img = Image.open(image_path).convert('RGB')
174
 
175
- # Prepare prompt for feature extraction
176
- prompt = "USER: <image>\nDescribe this image briefly. ASSISTANT:"
177
 
178
  # Process inputs
179
  inputs = llava_processor(text=prompt, images=img, return_tensors="pt")
180
- inputs = {k: v.to(llava_model.device) for k, v in inputs.items()}
181
 
182
- # Extract visual features from the vision tower
183
  with torch.no_grad():
184
- vision_outputs = llava_model.vision_tower(
185
- inputs['pixel_values'],
186
- output_hidden_states=True
187
- )
188
- # Get the last hidden state and pool it
189
- visual_features = vision_outputs.hidden_states[-1]
190
- # Mean pooling across spatial dimensions
191
- visual_features = visual_features.mean(dim=1).squeeze()
192
- # Take first 1024 dimensions for consistency
193
- visual_features = visual_features[:1024].cpu().numpy()
194
-
195
- return visual_features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  except Exception as e:
197
- print(f"Error processing {image_path}: {e}")
198
- return np.zeros(1024)
199
 
200
  def extract_combined_features(image_path):
201
  """Extract and combine features from both RCNN and LLaVA"""
202
  rcnn_feat = extract_rcnn_features(image_path)
203
  llava_feat = extract_llava_features(image_path)
204
 
205
- # Concatenate features
206
- combined_feat = np.concatenate([rcnn_feat, llava_feat])
 
207
 
208
- # L2 normalize
209
- combined_feat = combined_feat / (np.linalg.norm(combined_feat) + 1e-6)
 
 
 
210
 
211
- return combined_feat
212
-
213
- print("✓ Feature extraction functions defined!")
214
 
215
 
216
 
@@ -398,19 +476,40 @@ visualize_results(query_image, results)
398
 
399
 
400
  # Upload and query with your own image
401
- # Option 1: Use file upload widget (works on Kaggle/Colab)
402
- try:
403
- from google.colab import files
404
- uploaded = files.upload()
405
- custom_query_image = list(uploaded.keys())[0]
406
- except:
407
- # Option 2: Specify path to your image
408
- custom_query_image = "/kaggle/input/query-image" # Replace with your image path
409
- print("Note: File upload not available. Using sample image instead.")
410
- print("To use your own image, replace the path above.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
 
412
  # Search and visualize
413
- print(f"\nSearching for images similar to: {custom_query_image}")
 
 
414
  custom_results = search_similar_images(custom_query_image, top_k=CONFIG['top_k'])
415
  visualize_results(custom_query_image, custom_results)
416
 
 
4
 
5
 
6
 
7
+ get_ipython().getoutput("pip install torch torchvision transformers pillow numpy faiss-cpu opencv-python matplotlib kaggle tqdm scikit-learn seaborn -q")
8
 
9
 
10
 
 
43
  'models_dir': './models',
44
 
45
  # Model settings
46
+ 'llava_model': 'xtuner/llava-phi-3-mini-hf', # Lightweight LLaVA (~4GB vs 14GB)
47
  'image_size': (224, 224),
48
  'batch_size': 8,
49
 
 
56
  'top_k': 3,
57
 
58
  # LLaVA settings
59
+ 'use_fp16': torch.cuda.is_available(), # Use FP16 on GPU for memory efficiency
60
  'max_length': 77
61
  }
62
 
 
107
 
108
 
109
 
110
+ # Load LLaVA Phi-3-Mini model (lightweight ~4GB)
111
+ print("Loading LLaVA Phi-3-Mini model (lightweight version)...")
112
+ print(f"Model: {CONFIG['llava_model']}")
113
 
114
+ # Load processor first
115
+ from transformers import LlavaProcessor
116
 
117
+ # Use LlavaProcessor explicitly instead of AutoProcessor
118
+ llava_processor = LlavaProcessor.from_pretrained(CONFIG['llava_model'])
119
+
120
+ # Fix patch_size issue - it's in the image_processor config
121
+ if hasattr(llava_processor, 'image_processor'):
122
+ if not hasattr(llava_processor.image_processor, 'patch_size') or llava_processor.image_processor.patch_size is None:
123
+ llava_processor.image_processor.patch_size = 14 # Standard patch size for vision transformers
124
+ print(f"Set image_processor.patch_size to: {llava_processor.image_processor.patch_size}")
125
+
126
+ # Also set patch_size on the processor itself if it doesn't have it
127
+ if not hasattr(llava_processor, 'patch_size') or llava_processor.patch_size is None:
128
+ llava_processor.patch_size = llava_processor.image_processor.patch_size
129
+ print(f"Set processor.patch_size to: {llava_processor.patch_size}")
130
+
131
+ # Verify processor configuration
132
+ if hasattr(llava_processor, 'image_processor') and hasattr(llava_processor.image_processor, 'size'):
133
+ print(f"Image processor configured: {llava_processor.image_processor.size}")
134
+ else:
135
+ print("Warning: Image processor configuration may need adjustment")
136
+
137
+ # Load with memory-efficient settings
138
+ if CONFIG['use_fp16']:
139
+ print("Using FP16 for GPU efficiency...")
140
  llava_model = LlavaForConditionalGeneration.from_pretrained(
141
  CONFIG['llava_model'],
142
+ torch_dtype=torch.float16,
143
+ low_cpu_mem_usage=True,
144
+ device_map="auto"
145
  )
146
  else:
147
+ print("Using FP32 for CPU...")
148
  llava_model = LlavaForConditionalGeneration.from_pretrained(
149
  CONFIG['llava_model'],
150
+ torch_dtype=torch.float32,
151
+ low_cpu_mem_usage=True
152
  )
153
+ llava_model = llava_model.to(device)
154
 
155
+ llava_model.eval()
156
 
157
+ print("✓ LLaVA Phi-3-Mini loaded successfully!")
158
+ print(f"✓ Model size: ~4GB (much lighter than standard LLaVA 7B ~14GB)")
159
+ print(f"✓ Memory efficient and faster inference!")
160
 
161
 
162
 
 
187
 
188
  return feat
189
  except Exception as e:
190
+ print(f"Error in RCNN processing {image_path}: {e}")
191
+ return np.zeros(1024, dtype=np.float32)
192
 
193
def extract_llava_features(image_path):
    """Extract a fixed-size semantic feature vector using the LLaVA vision encoder.

    The image is run through the model's vision tower when one is exposed
    (no text generation). If no vision tower or no ``pixel_values`` is
    available, a full forward pass is used instead and the last hidden state
    is mean-pooled.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D ``float32`` NumPy array with exactly 1024 elements. Shorter
        encoder outputs are zero-padded and longer ones are truncated.
        On any failure the error is printed and an all-zero vector of the
        same shape and dtype is returned (best-effort behaviour, so one bad
        image does not abort a whole indexing run).
    """
    feature_dim = 1024
    try:
        # Load image
        img = Image.open(image_path).convert('RGB')

        # Minimal prompt: the processor is called with text as well as the image.
        prompt = "USER: <image>\nASSISTANT:"

        # Process inputs
        inputs = llava_processor(text=prompt, images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            # Locate the vision tower, preferring the accessor when present.
            if hasattr(llava_model, 'get_vision_tower'):
                vision_tower = llava_model.get_vision_tower()
            else:
                vision_tower = getattr(llava_model, 'vision_tower', None)

            if vision_tower is not None and 'pixel_values' in inputs:
                # Fast path: run only the vision encoder.
                image_outputs = vision_tower(inputs['pixel_values'])
                visual_features = _pool_vision_output(image_outputs)
            else:
                # Fallback: use model forward pass
                outputs = llava_model(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs.get('attention_mask'),
                    pixel_values=inputs.get('pixel_values'),
                    output_hidden_states=True
                )
                visual_features = outputs.hidden_states[-1].mean(dim=1).squeeze()

        # Convert to numpy. Going through float() first keeps the conversion
        # valid for dtypes NumPy cannot represent directly (e.g. bfloat16).
        if isinstance(visual_features, torch.Tensor):
            visual_features = visual_features.detach().float().cpu().numpy()

        # Flatten to 1-D float32 (this also covers the 0-d scalar case).
        flat = np.asarray(visual_features, dtype=np.float32).reshape(-1)

        # Zero-pad or truncate to exactly ``feature_dim`` values.
        fixed = np.zeros(feature_dim, dtype=np.float32)
        count = min(flat.shape[0], feature_dim)
        fixed[:count] = flat[:count]
        return fixed

    except Exception as e:
        print(f"Error in LLaVA processing {image_path}: {e}")
        return np.zeros(feature_dim, dtype=np.float32)


def _pool_vision_output(image_outputs):
    """Reduce a vision-tower output to one pooled feature tensor.

    Output objects may define ``pooler_output`` / ``last_hidden_state`` but
    leave them set to ``None``, so each attribute is checked for a real value
    before use instead of relying on ``hasattr`` alone.
    """
    pooled = getattr(image_outputs, 'pooler_output', None)
    if pooled is not None:
        # Use pooled output if available (pre-computed pooling)
        return pooled.squeeze()

    last_hidden = getattr(image_outputs, 'last_hidden_state', None)
    if last_hidden is not None:
        # Mean-pool the last hidden state over dim 1
        return last_hidden.mean(dim=1).squeeze()

    if isinstance(image_outputs, tuple):
        # Tuple output: the first element is treated as the hidden state
        return image_outputs[0].mean(dim=1).squeeze()

    # Fallback: assume it's a tensor
    if image_outputs.dim() > 2:
        return image_outputs.mean(dim=1).squeeze()
    return image_outputs.squeeze()
273
 
274
def extract_combined_features(image_path):
    """Extract and combine features from both RCNN and LLaVA.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D ``float32`` NumPy array of length 2048: the 1024 RCNN features
        followed by the 1024 LLaVA features. No normalization is applied here.
    """
    rcnn_feat = _as_feature_vector(extract_rcnn_features(image_path), 1024)
    llava_feat = _as_feature_vector(extract_llava_features(image_path), 1024)

    # Concatenate features
    return np.concatenate([rcnn_feat, llava_feat])


def _as_feature_vector(feat, length):
    """Return ``feat`` as a flat float32 vector of exactly ``length`` values.

    Short inputs are zero-padded and long inputs truncated. Zero-padding is
    used rather than ``np.resize`` because ``np.resize`` fills the extra
    slots with repeated copies of the data, which would fabricate feature
    values. Flattening first also makes 0-d and multi-dimensional inputs safe.
    """
    flat = np.asarray(feat, dtype=np.float32).reshape(-1)
    if flat.shape[0] == length:
        return flat
    fixed = np.zeros(length, dtype=np.float32)
    count = min(flat.shape[0], length)
    fixed[:count] = flat[:count]
    return fixed
292
 
293
 
294
 
 
476
 
477
 
478
  # Upload and query with your own image
479
+
480
+ # Method 1: Use a random image from the dataset
481
+ # custom_query_image = random.choice(valid_image_paths)
482
+ # print(f"Using sample image: {custom_query_image}")
483
+ # print("\nTo use your own image on Kaggle:")
484
+ # print("1. Click 'Add Data' in the right sidebar")
485
+ # print("2. Upload your image or add a dataset")
486
+ # print("3. Update the path below to: '/kaggle/input/YOUR_DATASET/your_image.jpg'")
487
+ # print("\nAlternatively, uncomment and modify one of the options below:\n")
488
+
489
+ # Method 2: Specify a path to your uploaded image (Kaggle)
490
+ custom_query_image = '/kaggle/input/query-image-1/images.jpg'
491
+
492
+ # Method 3: Use Kaggle's file upload (interactive)
493
+ # Uncomment the code below to enable:
494
+ # from ipywidgets import FileUpload
495
+ # import shutil
496
+ # print("Upload your image:")
497
+ # # Note: You'll need to manually upload via Kaggle's interface
498
+ # # Then specify the path like: custom_query_image = '/kaggle/working/uploaded_image.jpg'
499
+
500
+ # Method 4: Google Colab upload (if running on Colab instead)
501
+ # try:
502
+ # from google.colab import files
503
+ # uploaded = files.upload()
504
+ # custom_query_image = list(uploaded.keys())[0]
505
+ # print(f"Uploaded: {custom_query_image}")
506
+ # except:
507
+ # pass
508
 
509
  # Search and visualize
510
+ print(f"\n{'='*60}")
511
+ print(f"Searching for images similar to: {custom_query_image}")
512
+ print(f"{'='*60}\n")
513
  custom_results = search_similar_images(custom_query_image, top_k=CONFIG['top_k'])
514
  visualize_results(custom_query_image, custom_results)
515
 
faiss_index.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ffb1d842839313489932bd5f7a981c16e637f8d9920ee2a2086ba1c61517d0ec
3
- size 156707885
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:016ba66879e3ece45f30ef6e9febf9e8734a7587b5f40755f3bbb04579f213b3
3
+ size 250732589