rnagabh
/

gemma4-audio-encoder

@@ -98,12 +98,15 @@ with torch.no_grad():
     # Option 2: Pure audio embeddings (1024-dim) — conformer output before projection
     # Recommended for downstream audio tasks (classification, verification, etc.)
-    mel = inputs["input_features"].to(dtype=torch.bfloat16, device="cuda")
-    hidden = mel
-    hidden = audio_tower.subsample_conv_projection(hidden)
-    for layer in audio_tower.layers:
-        hidden = layer(hidden)
-    audio_embeddings = hidden  # (1, 100, 1024)
 ```
 > **Which to use?** For audio-only tasks (classification, speaker verification, deepfake detection),

     # Option 2: Pure audio embeddings (1024-dim) — conformer output before projection
     # Recommended for downstream audio tasks (classification, verification, etc.)
+    # Use a forward hook to capture the 1024-dim input to output_proj
+    # Filled in by the hook below; a dict so the nested callback can write into it.
+    pre_proj_features = {}
+    def hook_fn(module, input, output):
+        # Forward-hook callback: `input` is the tuple of positional args passed to
+        # output_proj's forward, so input[0] is the tensor entering the projection.
+        pre_proj_features["hidden"] = input[0]
+    handle = audio_tower.output_proj.register_forward_hook(hook_fn)
+    # The result is discarded; the forward pass is run so the hook on output_proj fires.
+    _ = audio_tower(inputs["input_features"].to(dtype=torch.bfloat16, device="cuda"))
+    # Detach the hook so later forward passes do not keep overwriting the capture.
+    # NOTE(review): if the call above raises, the hook stays registered — consider try/finally.
+    handle.remove()
+    audio_embeddings = pre_proj_features["hidden"]  # (1, 100, 1024)
 ```
 > **Which to use?** For audio-only tasks (classification, speaker verification, deepfake detection),