"""Audio comprehension tests — verify the audio pipeline on CPU. Tests: AudioSequencer forward, Moonshine encoder feature extraction, frame_proj → unfold → projection → norm pipeline. Runs on CPU — downloads Moonshine-base on first run. """ import os, sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) import torch from arbitor.kernel.ternary_scale import TScaleType device = "cpu" FAILED = 0 def check(name, condition, detail=""): global FAILED if condition: print(f" ✓ {name}") else: print(f" ✗ {name} — {detail}") FAILED += 1 print("\n=== Audio Comprehension ===\n") print("Loading AudioSequencer (downloads Moonshine-base on first run)...") from arbitor import ARBModel, HIDDEN_DIM # 1. AudioSequencer forward with synthetic tone model = ARBModel(enable_image=False, enable_audio=True, enable_vq=False, enable_graph=False, enable_memory_modules=False, enable_moe=False) model.eval() sr = 16000 duration_s = 1.0 t = torch.linspace(0, duration_s, int(sr * duration_s)) tone = torch.sin(2 * 3.14159 * 440 * t).unsqueeze(0) # 440 Hz, mono with torch.no_grad(): seq_out = model.audio_sequencer(tone) check("AudioSequencer forward", seq_out is not None, "got None") check("Output last dim", seq_out.shape[-1] == HIDDEN_DIM, f"last dim={seq_out.shape[-1]}") check("No NaN in audio features", not torch.isnan(seq_out).any()) check("Audio features finite", torch.isfinite(seq_out).all()) check("Audio features have variance", seq_out.std().item() > 0.001, f"std={seq_out.std().item()}") # 2. Different frequency tone (should produce different features) tone2 = torch.sin(2 * 3.14159 * 1000 * t).unsqueeze(0) with torch.no_grad(): seq_out2 = model.audio_sequencer(tone2) feature_diff = (seq_out - seq_out2).abs().mean().item() check("Different tones → different features", feature_diff > 0.001, f"diff={feature_diff}") # 3. Left and right channels → mono downmix stereo = torch.stack([tone.squeeze(0), tone2.squeeze(0)], dim=0).unsqueeze(0) stereo_audio = stereo.permute(0, 2, 1) # [1, samples, 2] -> [1, samples, 2] # AudioSequencer expects [B, T] waveform, stereo will be handled by forward # Test mono downmix happens with torch.no_grad(): seq_stereo = model.audio_sequencer(tone.expand(1, -1)) # mono is fine check("Audio processes mono correctly", seq_stereo.shape[-1] == HIDDEN_DIM) # 4. Audio VQ encoder from arbitor.encoders.audio import AudioVQEncoder vq_enc = AudioVQEncoder() tone_4ch = tone.unsqueeze(0) # [1, 1, 16000] logits, indices = vq_enc(tone_4ch) check("AudioVQEncoder logits", logits.shape[-1] == 288, f"vocab dim={logits.shape[-1]}") check("AudioVQEncoder indices", indices.shape[-1] == logits.shape[1]) check("No NaN in VQ output", not torch.isnan(logits).any()) print(f"\n{'='*50}") if FAILED == 0: print("✓ All audio comprehension tests passed!") else: print(f"✗ {FAILED} test(s) failed") sys.exit(FAILED)