Spaces:

pratik-250620
/

MultiModal-Coherence-AI

Running

pratik-250620 committed 28 days ago

Commit

c98d24c

verified ·

1 Parent(s): fe57225

Upload folder using huggingface_hub

Files changed (5) hide show

artifacts/coherence_stats.json ADDED Viewed

+{
+  "msci": {
+    "mean": 0.01478813559322034,
+    "std": 0.023561129183220276,
+    "min": -0.0489,
+    "max": 0.0696,
+    "count": 59
+  },
+  "st_i": {
+    "mean": 0.014918644067796609,
+    "std": 0.03350348565006066,
+    "min": -0.057,
+    "max": 0.1151,
+    "count": 59
+  },
+  "st_a": {
+    "mean": 0.019023728813559324,
+    "std": 0.03577630282356001,
+    "min": -0.0774,
+    "max": 0.0958,
+    "count": 59
+  },
+  "si_a": {
+    "mean": -0.0048796610169491526,
+    "std": 0.05054298314947193,
+    "min": -0.1267,
+    "max": 0.0884,
+    "count": 59
+  }
+}

src/embeddings/audio_embedder.py CHANGED Viewed

@@ -25,6 +25,22 @@ class AudioEmbedder:
         self.model.to(self.device)
         self.model.eval()
     @torch.no_grad()
     def embed(self, audio_path: str) -> np.ndarray:
         waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
@@ -36,7 +52,7 @@ class AudioEmbedder:
         ).to(self.device)
         outputs = self.model.get_audio_features(**inputs)
-        emb = outputs[0]
         return emb.cpu().numpy().astype("float32")
     @torch.no_grad()
@@ -47,5 +63,6 @@ class AudioEmbedder:
             return_tensors="pt",
             padding=True,
         ).to(self.device)
-        feats = self.model.get_text_features(**inputs)[0]
         return feats.cpu().numpy().astype("float32")

         self.model.to(self.device)
         self.model.eval()
+    def _squeeze_features(self, feats: torch.Tensor, projection: str) -> torch.Tensor:
+        """Ensure features are 1-D projected embeddings (512-d).
+        Some transformers versions return raw hidden states (batch, seq, hidden)
+        instead of projected features (batch, proj_dim). Detect and fix.
+        """
+        if feats.dim() == 3:
+            pooled = feats[:, 0, :]  # CLS token
+            proj = getattr(self.model, projection, None)
+            if proj is not None:
+                pooled = proj(pooled)
+            feats = pooled
+        if feats.dim() == 2:
+            feats = feats[0]
+        return feats
     @torch.no_grad()
     def embed(self, audio_path: str) -> np.ndarray:
         waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
         ).to(self.device)
         outputs = self.model.get_audio_features(**inputs)
+        emb = self._squeeze_features(outputs, "audio_projection")
         return emb.cpu().numpy().astype("float32")
     @torch.no_grad()
             return_tensors="pt",
             padding=True,
         ).to(self.device)
+        feats = self.model.get_text_features(**inputs)
+        feats = self._squeeze_features(feats, "text_projection")
         return feats.cpu().numpy().astype("float32")

src/embeddings/image_embedder.py CHANGED Viewed

@@ -25,5 +25,14 @@ class ImageEmbedder:
     def embed(self, image_path: str) -> np.ndarray:
         image = Image.open(image_path).convert("RGB")
         inputs = self.processor(images=image, return_tensors="pt").to(self.device)
-        feats = self.model.get_image_features(**inputs)[0]
         return feats.cpu().numpy().astype("float32")

     def embed(self, image_path: str) -> np.ndarray:
         image = Image.open(image_path).convert("RGB")
         inputs = self.processor(images=image, return_tensors="pt").to(self.device)
+        feats = self.model.get_image_features(**inputs)
+        # Handle different transformers versions (some return 3-D hidden states)
+        if feats.dim() == 3:
+            pooled = feats[:, 0, :]
+            proj = getattr(self.model, "visual_projection", None)
+            if proj is not None:
+                pooled = proj(pooled)
+            feats = pooled
+        if feats.dim() == 2:
+            feats = feats[0]
         return feats.cpu().numpy().astype("float32")

src/embeddings/similarity.py CHANGED Viewed

@@ -10,6 +10,8 @@ def l2_normalize(vec: np.ndarray, eps: float = 1e-12) -> np.ndarray:
 def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
     a_n = l2_normalize(a)
     b_n = l2_normalize(b)
     return float(np.clip(np.dot(a_n, b_n), -1.0, 1.0))

 def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    a = a.squeeze()
+    b = b.squeeze()
     a_n = l2_normalize(a)
     b_n = l2_normalize(b)
     return float(np.clip(np.dot(a_n, b_n), -1.0, 1.0))

src/embeddings/text_embedder.py CHANGED Viewed

@@ -28,5 +28,14 @@ class TextEmbedder:
             padding=True,
             truncation=True,
         ).to(self.device)
-        feats = self.model.get_text_features(**inputs)[0]
         return feats.cpu().numpy().astype("float32")

             padding=True,
             truncation=True,
         ).to(self.device)
+        feats = self.model.get_text_features(**inputs)
+        # Handle different transformers versions (some return 3-D hidden states)
+        if feats.dim() == 3:
+            pooled = feats[:, 0, :]
+            proj = getattr(self.model, "text_projection", None)
+            if proj is not None:
+                pooled = proj(pooled)
+            feats = pooled
+        if feats.dim() == 2:
+            feats = feats[0]
         return feats.cpu().numpy().astype("float32")