hieu3636
/

cxr-vlm-code

Model card Files Files and versions

convitom committed 9 days ago

Commit

cba2b6c

·

1 Parent(s): d313d81

d

Files changed (1) hide show

model/projection.py +12 -0

model/projection.py CHANGED Viewed

@@ -98,6 +98,18 @@ class MLPProjection(nn.Module):
         """
         B = patch_features.size(0)
         # Expand query tokens to batch size
         queries = self.query_tokens.expand(B, -1, -1)  # (B, 32, 768)

         """
         B = patch_features.size(0)
+        # Align input dtype with the projection's own parameter dtype.
+        # The frozen image encoder may run in bf16/fp16 (llm_dtype) while
+        # the projection's MLP/MHA weights stay fp32. Under bf16 autocast,
+        # nn.MultiheadAttention's in-projection sometimes bypasses autocast
+        # (cross-attention path), giving:
+        #   RuntimeError: mat1 and mat2 must have the same dtype: BFloat16 vs Float
+        # Casting patch_features to the projection's parameter dtype keeps the matmul
+        # self-consistent on any GPU/precision. No-op when dtypes already match (T4 fp16 fast path).
+        target_dtype = self.query_tokens.dtype
+        if patch_features.dtype != target_dtype:
+            patch_features = patch_features.to(target_dtype)
         # Expand query tokens to batch size
         queries = self.query_tokens.expand(B, -1, -1)  # (B, 32, 768)