pratik-250620 committed on
Commit
c98d24c
·
verified ·
1 Parent(s): fe57225

Upload folder using huggingface_hub

Browse files
artifacts/coherence_stats.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "msci": {
3
+ "mean": 0.01478813559322034,
4
+ "std": 0.023561129183220276,
5
+ "min": -0.0489,
6
+ "max": 0.0696,
7
+ "count": 59
8
+ },
9
+ "st_i": {
10
+ "mean": 0.014918644067796609,
11
+ "std": 0.03350348565006066,
12
+ "min": -0.057,
13
+ "max": 0.1151,
14
+ "count": 59
15
+ },
16
+ "st_a": {
17
+ "mean": 0.019023728813559324,
18
+ "std": 0.03577630282356001,
19
+ "min": -0.0774,
20
+ "max": 0.0958,
21
+ "count": 59
22
+ },
23
+ "si_a": {
24
+ "mean": -0.0048796610169491526,
25
+ "std": 0.05054298314947193,
26
+ "min": -0.1267,
27
+ "max": 0.0884,
28
+ "count": 59
29
+ }
30
+ }
src/embeddings/audio_embedder.py CHANGED
@@ -25,6 +25,22 @@ class AudioEmbedder:
25
  self.model.to(self.device)
26
  self.model.eval()
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  @torch.no_grad()
29
  def embed(self, audio_path: str) -> np.ndarray:
30
  waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
@@ -36,7 +52,7 @@ class AudioEmbedder:
36
  ).to(self.device)
37
 
38
  outputs = self.model.get_audio_features(**inputs)
39
- emb = outputs[0]
40
  return emb.cpu().numpy().astype("float32")
41
 
42
  @torch.no_grad()
@@ -47,5 +63,6 @@ class AudioEmbedder:
47
  return_tensors="pt",
48
  padding=True,
49
  ).to(self.device)
50
- feats = self.model.get_text_features(**inputs)[0]
 
51
  return feats.cpu().numpy().astype("float32")
 
25
  self.model.to(self.device)
26
  self.model.eval()
27
 
28
+ def _squeeze_features(self, feats: torch.Tensor, projection: str) -> torch.Tensor:
29
+ """Ensure features are 1-D projected embeddings (512-d).
30
+
31
+ Some transformers versions return raw hidden states (batch, seq, hidden)
32
+ instead of projected features (batch, proj_dim). Detect and fix.
33
+ """
34
+ if feats.dim() == 3:
35
+ pooled = feats[:, 0, :] # CLS token
36
+ proj = getattr(self.model, projection, None)
37
+ if proj is not None:
38
+ pooled = proj(pooled)
39
+ feats = pooled
40
+ if feats.dim() == 2:
41
+ feats = feats[0]
42
+ return feats
43
+
44
  @torch.no_grad()
45
  def embed(self, audio_path: str) -> np.ndarray:
46
  waveform, _ = librosa.load(audio_path, sr=self.target_sr, mono=True)
 
52
  ).to(self.device)
53
 
54
  outputs = self.model.get_audio_features(**inputs)
55
+ emb = self._squeeze_features(outputs, "audio_projection")
56
  return emb.cpu().numpy().astype("float32")
57
 
58
  @torch.no_grad()
 
63
  return_tensors="pt",
64
  padding=True,
65
  ).to(self.device)
66
+ feats = self.model.get_text_features(**inputs)
67
+ feats = self._squeeze_features(feats, "text_projection")
68
  return feats.cpu().numpy().astype("float32")
src/embeddings/image_embedder.py CHANGED
@@ -25,5 +25,14 @@ class ImageEmbedder:
25
  def embed(self, image_path: str) -> np.ndarray:
26
  image = Image.open(image_path).convert("RGB")
27
  inputs = self.processor(images=image, return_tensors="pt").to(self.device)
28
- feats = self.model.get_image_features(**inputs)[0]
 
 
 
 
 
 
 
 
 
29
  return feats.cpu().numpy().astype("float32")
 
def embed(self, image_path: str) -> np.ndarray:
    """Embed an image file into a 1-D float32 feature vector.

    Args:
        image_path: Path to an image file readable by PIL.

    Returns:
        1-D ``np.float32`` array of projected image features.
    """
    image = Image.open(image_path).convert("RGB")
    inputs = self.processor(images=image, return_tensors="pt").to(self.device)
    feats = self.model.get_image_features(**inputs)
    # Some transformers versions return raw hidden states (batch, seq, hidden)
    # instead of projected features (batch, proj_dim): pool CLS and project.
    if feats.dim() == 3:
        pooled = feats[:, 0, :]
        proj = getattr(self.model, "visual_projection", None)
        if proj is not None:
            pooled = proj(pooled)
        feats = pooled
    if feats.dim() == 2:
        # Single-item batch: drop the batch dimension.
        feats = feats[0]
    return feats.cpu().numpy().astype("float32")
src/embeddings/similarity.py CHANGED
@@ -10,6 +10,8 @@ def l2_normalize(vec: np.ndarray, eps: float = 1e-12) -> np.ndarray:
10
 
11
 
12
  def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
 
 
13
  a_n = l2_normalize(a)
14
  b_n = l2_normalize(b)
15
  return float(np.clip(np.dot(a_n, b_n), -1.0, 1.0))
 
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two vectors, clipped to [-1.0, 1.0].

    Inputs may carry leading singleton dimensions (e.g. shape (1, d) from
    a model forward pass); they are squeezed to 1-D before normalizing.

    Args:
        a: First embedding vector.
        b: Second embedding vector.

    Returns:
        Cosine similarity as a plain Python float in [-1.0, 1.0].
    """
    # Squeeze singleton dims so (1, d) and (d,) inputs compare identically.
    a = a.squeeze()
    b = b.squeeze()
    a_n = l2_normalize(a)
    b_n = l2_normalize(b)
    # Clip to guard against floating-point overshoot just past ±1.
    return float(np.clip(np.dot(a_n, b_n), -1.0, 1.0))
src/embeddings/text_embedder.py CHANGED
@@ -28,5 +28,14 @@ class TextEmbedder:
28
  padding=True,
29
  truncation=True,
30
  ).to(self.device)
31
- feats = self.model.get_text_features(**inputs)[0]
 
 
 
 
 
 
 
 
 
32
  return feats.cpu().numpy().astype("float32")
 
28
  padding=True,
29
  truncation=True,
30
  ).to(self.device)
31
+ feats = self.model.get_text_features(**inputs)
32
+ # Handle different transformers versions (some return 3-D hidden states)
33
+ if feats.dim() == 3:
34
+ pooled = feats[:, 0, :]
35
+ proj = getattr(self.model, "text_projection", None)
36
+ if proj is not None:
37
+ pooled = proj(pooled)
38
+ feats = pooled
39
+ if feats.dim() == 2:
40
+ feats = feats[0]
41
  return feats.cpu().numpy().astype("float32")