{ "clip_model": "openai/clip-vit-base-patch32", "clap_model": "laion/larger_clap_music_and_speech", "embed_dim": 512, "training_dataset": "OpenSound/AudioCaps", "training_method": "clap_audio_to_clip_text", "num_samples": 10000, "epochs": 30, "batch_size": 256, "lr": 0.0001 }