Spaces:

IT4CHI2311
/

CBIR-System

Sleeping

App Files Community

IT4CHI2311 committed on Jan 17

Commit

09421ab

1 Parent(s): e3df3aa

Made changes

Browse files

Files changed (1) hide show

app.py +44 -13

app.py CHANGED Viewed

@@ -106,30 +106,61 @@ def extract_features(image, rcnn_backbone, llava_model, llava_processor):
         rcnn_features['pool'], (1, 1)
     ).flatten().cpu().numpy()
-    # LLaVA Phi-3-Mini features (lightweight vision-language model)
     if USE_LLAVA and llava_model is not None:
-        prompt = "USER: <image>\nDescribe this image in detail.\nASSISTANT:"
         inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
         inputs = {k: v.to(device) for k, v in inputs.items()}
-        outputs = llava_model.generate(
-            **inputs,
-            max_new_tokens=77,
-            output_hidden_states=True,
-            return_dict_in_generate=True
-        )
-        # Extract LLaVA features from last hidden state
-        hidden_states = outputs.hidden_states[0][-1]
-        llava_feat_vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()
         # Resize to 1024 dimensions
         if llava_feat_vector.shape[0] != 1024:
             if llava_feat_vector.shape[0] < 1024:
-                # Pad if smaller
                 llava_feat_vector = np.pad(llava_feat_vector, (0, 1024 - llava_feat_vector.shape[0]))
             else:
-                # Truncate if larger
                 llava_feat_vector = llava_feat_vector[:1024]
     else:
         # Use zeros when LLaVA is disabled (maintains compatibility)

         rcnn_features['pool'], (1, 1)
     ).flatten().cpu().numpy()
+    # LLaVA Phi-3-Mini features (FAST - direct vision encoder, no text generation)
     if USE_LLAVA and llava_model is not None:
+        # CRITICAL: Ensure patch_size is set before processing
+        if hasattr(llava_processor, 'image_processor'):
+            llava_processor.image_processor.patch_size = 14
+        llava_processor.patch_size = 14
+        prompt = "USER: <image>\nASSISTANT:"
         inputs = llava_processor(text=prompt, images=image, return_tensors="pt")
         inputs = {k: v.to(device) for k, v in inputs.items()}
+        # Extract visual features directly (10-20x faster than generate())
+        # Get vision tower
+        if hasattr(llava_model, 'get_vision_tower'):
+            vision_tower = llava_model.get_vision_tower()
+        elif hasattr(llava_model, 'vision_tower'):
+            vision_tower = llava_model.vision_tower
+        else:
+            vision_tower = None
+        # Use vision tower directly if available
+        if vision_tower is not None and 'pixel_values' in inputs:
+            image_outputs = vision_tower(inputs['pixel_values'])
+            # Handle different output types
+            if hasattr(image_outputs, 'pooler_output'):
+                llava_feat_vector = image_outputs.pooler_output.squeeze().cpu().numpy()
+            elif hasattr(image_outputs, 'last_hidden_state'):
+                llava_feat_vector = image_outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
+            elif isinstance(image_outputs, tuple):
+                llava_feat_vector = image_outputs[0].mean(dim=1).squeeze().cpu().numpy()
+            else:
+                if image_outputs.dim() > 2:
+                    llava_feat_vector = image_outputs.mean(dim=1).squeeze().cpu().numpy()
+                else:
+                    llava_feat_vector = image_outputs.squeeze().cpu().numpy()
+        else:
+            # Fallback: use model forward pass (still much faster than generate)
+            outputs = llava_model(
+                input_ids=inputs['input_ids'],
+                attention_mask=inputs.get('attention_mask'),
+                pixel_values=inputs.get('pixel_values'),
+                output_hidden_states=True
+            )
+            llava_feat_vector = outputs.hidden_states[-1].mean(dim=1).squeeze().cpu().numpy()
+        # Ensure proper shape
+        if llava_feat_vector.ndim > 1:
+            llava_feat_vector = llava_feat_vector.flatten()
         # Resize to 1024 dimensions
         if llava_feat_vector.shape[0] != 1024:
             if llava_feat_vector.shape[0] < 1024:
                 llava_feat_vector = np.pad(llava_feat_vector, (0, 1024 - llava_feat_vector.shape[0]))
             else:
                 llava_feat_vector = llava_feat_vector[:1024]
     else:
         # Use zeros when LLaVA is disabled (maintains compatibility)