rnagabh
/

gemma4-audio-encoder

@@ -106,22 +106,26 @@ waveform = np.random.randn(64000).astype(np.float32)  # 4s @ 16kHz
 inputs = feature_extractor([waveform], sampling_rate=16000, return_tensors="pt")
 with torch.no_grad():
-    output = audio_tower(inputs["input_features"].to(dtype=torch.bfloat16, device="cuda"))
-    # Option 1: Text-projected embeddings (1536-dim) — maps into Gemma 4 text decoder space
     text_projected = output.last_hidden_state  # (1, 100, 1536)
-    # Option 2: Pure audio embeddings (1024-dim) — conformer output before projection
-    # Recommended for downstream audio tasks (classification, verification, etc.)
-    # Use a forward hook to capture the 1024-dim input to output_proj
-    pre_proj_features = {}
-    def hook_fn(module, input, output):
-        pre_proj_features["hidden"] = input[0]
-    handle = audio_tower.output_proj.register_forward_hook(hook_fn)
-    _ = audio_tower(inputs["input_features"].to(dtype=torch.bfloat16, device="cuda"))
-    handle.remove()
-    audio_embeddings = pre_proj_features["hidden"]  # (1, 100, 1024)
 ```
 > **Which to use?** For audio-only tasks (classification, speaker verification, deepfake detection),

 inputs = feature_extractor([waveform], sampling_rate=16000, return_tensors="pt")
 with torch.no_grad():
+    mel = inputs["input_features"].to(dtype=torch.bfloat16, device="cuda")
+    # === Option 1: Text-projected embeddings (1536-dim) ===
+    # Use this if feeding into an LLM or need the full model output.
+    output = audio_tower(mel)
     text_projected = output.last_hidden_state  # (1, 100, 1536)
+# === Option 2: Pure audio embeddings (1024-dim) ===
+# Captures the conformer output BEFORE the text projection layer.
+# Recommended for downstream audio tasks (classification, verification, etc.)
+# Note: this registers a hook and runs a separate forward pass.
+pre_proj_features = {}
+def hook_fn(module, input, output):
+    pre_proj_features["hidden"] = input[0]
+handle = audio_tower.output_proj.register_forward_hook(hook_fn)
+with torch.no_grad():
+    _ = audio_tower(mel)
+handle.remove()
+audio_embeddings = pre_proj_features["hidden"]  # (1, 100, 1024)
 ```
 > **Which to use?** For audio-only tasks (classification, speaker verification, deepfake detection),