"""Minimal LIVE smoke test of the deployed Modal app — ONE LLM call + ONE voice call (not the 32-take pre-gen), to validate the real model APIs cheaply. python3 scripts/smoke_modal.py NOTE: the first call downloads model weights (MiniCPM-o ~19GB on A100, VoxCPM2 on A10G) into the Volume and spins GPUs — this is the real-credit step. Subsequent calls are warm. """ import sys import numpy as np import modal APP = "witnessbox" def main(): WitnessLLM = modal.Cls.from_name(APP, "WitnessLLM")() WitnessVoice = modal.Cls.from_name(APP, "WitnessVoice")() print("→ LLM (MiniCPM-o) cold start + one reply…", flush=True) reply = WitnessLLM.respond.remote( "You are Marcus Reid, a guarded CFO under cross-examination. Answer in ONE short sentence, in character.", [{"role": "user", "content": "Did you authorize the twelve-million-dollar wire?"}], ) print(" LLM reply:", repr(reply)) assert isinstance(reply, str) and reply, "LLM returned empty/non-string" print("→ Voice (VoxCPM2) cold start + one line…", flush=True) wav, sr = WitnessVoice.speak.remote( "I have nothing to hide, counselor.", "calm, composed, faintly condescending" ) wav = np.asarray(wav) print(f" voice: {wav.shape} samples @ {sr} Hz ({wav.shape[0]/sr:.1f}s)") assert wav.size > 0 and sr in (16000, 22050, 24000, 44100, 48000) print("\n✅ LIVE smoke passed — MiniCPM-o + VoxCPM2 APIs are correct on GPU.") if __name__ == "__main__": sys.exit(main())