Spaces:

bgaspra
/

Rec_Sys_Flo2

Sleeping

App Files Community

bgaspra committed on Nov 6, 2024

Commit

107b2a4

verified ·

1 Parent(s): 8bbea5e

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -5

app.py CHANGED Viewed

@@ -31,16 +31,19 @@ text_embedding_cache = {}
 def get_image_embedding(image):
     try:
         inputs = processor(
             images=image,
-            text=[""], # Florence-2 requires both image and text inputs
             return_tensors="pt"
         ).to(device, torch_dtype)
         with torch.no_grad():
             outputs = model(**inputs)
-            # Get the image embeddings from the last hidden states
-            image_embeddings = outputs.last_hidden_state[:, 0, :]  # Take CLS token
         return image_embeddings.cpu().numpy()
     except Exception as e:
         print(f"Error in get_image_embedding: {str(e)}")
@@ -51,15 +54,25 @@ def get_text_embedding(text):
         if text in text_embedding_cache:
             return text_embedding_cache[text]
         inputs = processor(
             text=text,
-            images=None,
             return_tensors="pt"
         ).to(device, torch_dtype)
         with torch.no_grad():
             outputs = model(**inputs)
-            text_embeddings = outputs.last_hidden_state[:, 0, :]  # Take CLS token
         embedding = text_embeddings.cpu().numpy()
         text_embedding_cache[text] = embedding

 def get_image_embedding(image):
     try:
+        # Process image and add dummy text input
         inputs = processor(
             images=image,
+            text="Describe this image",  # Adding a default text prompt
+            padding=True,
             return_tensors="pt"
         ).to(device, torch_dtype)
         with torch.no_grad():
+            # Get model outputs
             outputs = model(**inputs)
+            # Extract image features from the cross-attention layers
+            image_embeddings = outputs.last_hidden_state.mean(dim=1)
         return image_embeddings.cpu().numpy()
     except Exception as e:
         print(f"Error in get_image_embedding: {str(e)}")
         if text in text_embedding_cache:
             return text_embedding_cache[text]
+        # Process text with proper input formatting
         inputs = processor(
             text=text,
+            padding=True,
             return_tensors="pt"
         ).to(device, torch_dtype)
+        # Add required decoder input ids
+        inputs['decoder_input_ids'] = model.generate(
+            **inputs,
+            max_length=1,
+            return_dict_in_generate=True,
+            output_hidden_states=True,
+            early_stopping=True
+        ).sequences
         with torch.no_grad():
             outputs = model(**inputs)
+            text_embeddings = outputs.last_hidden_state.mean(dim=1)
         embedding = text_embeddings.cpu().numpy()
         text_embedding_cache[text] = embedding