commandeaw committed on
Commit
5e31798
·
verified ·
1 Parent(s): 7cb17d8

Fix transformers 5.x API change: get_text_features now returns BaseModelOutputWithPooling

Browse files

In transformers >=5.0, CLIPModel.get_text_features and get_image_features
return a BaseModelOutputWithPooling instead of a Tensor. The text/image
embedding, after projection, is exposed as its .pooler_output attribute.

Add a backward-compatible accessor that works with both the old (tensor)
and the new (output object) return types.

Files changed (1) hide show
  1. dw_queryframes.py +9 -2
dw_queryframes.py CHANGED
@@ -117,13 +117,20 @@ class QueryFrames:
117
  )
118
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
119
  with torch.inference_mode():
120
- text_emb = self.clip_model.get_text_features(
 
 
 
121
  input_ids=inputs["input_ids"],
122
  attention_mask=inputs["attention_mask"],
123
  )
124
- image_embs = self.clip_model.get_image_features(
 
 
125
  pixel_values=inputs["pixel_values"]
126
  )
 
 
127
  text_emb = F.normalize(text_emb, dim=-1)
128
  image_embs = F.normalize(image_embs, dim=-1)
129
  sims = (text_emb @ image_embs.T).squeeze(0).float().cpu()
 
117
  )
118
  inputs = {k: v.to(self.device) for k, v in inputs.items()}
119
  with torch.inference_mode():
120
+ # transformers ≤ 4.x returns a tensor directly; ≥ 5.x returns
121
+ # a BaseModelOutputWithPooling whose .pooler_output is the
122
+ # projected embedding. Handle both.
123
+ text_out = self.clip_model.get_text_features(
124
  input_ids=inputs["input_ids"],
125
  attention_mask=inputs["attention_mask"],
126
  )
127
+ text_emb = (text_out.pooler_output
128
+ if hasattr(text_out, "pooler_output") else text_out)
129
+ image_out = self.clip_model.get_image_features(
130
  pixel_values=inputs["pixel_values"]
131
  )
132
+ image_embs = (image_out.pooler_output
133
+ if hasattr(image_out, "pooler_output") else image_out)
134
  text_emb = F.normalize(text_emb, dim=-1)
135
  image_embs = F.normalize(image_embs, dim=-1)
136
  sims = (text_emb @ image_embs.T).squeeze(0).float().cpu()