{
  "ext_weights": {
    "cav_mae_st": {
      "path": "ext_weights/cav_mae_st.pth",
      "size": "727M",
      "description": "CAV-MAE-ST pretrained weights"
    },
    "empty_string": {
      "path": "ext_weights/empty_string.pth",
      "size": "310K",
      "description": "Empty string embedding for classifier-free guidance"
    },
    "music_speech_audioset_clap": {
      "path": "ext_weights/music_speech_audioset_epoch_15_esc_89.98.pt",
      "size": "2.2G",
      "description": "CLAP model trained on music/speech/AudioSet, ESC-50 acc=89.98%"
    },
    "synchformer": {
      "path": "ext_weights/synchformer_state_dict.pth",
      "size": "907M",
      "description": "Synchformer audio-visual synchronization model weights"
    },
    "v1-44": {
      "path": "ext_weights/v1-44.pth",
      "size": "1.2G",
      "description": "VAE checkpoint (44kHz)"
    }
  },
  "weights": {
    "controlfoley": {
      "path": "weights/controlfoley.pth",
      "size": "11G",
      "description": "ControlFoley main model checkpoint"
    }
  }
}