{
  "audio_encoder": {
    "embed_dim_audio": 1024,
    "joint_dim": 512,
    "audio_cfg": {
      "audio_length": 1024,
      "clip_samples": 480000,
      "mel_bins": 64,
      "sample_rate": 48000,
      "window_size": 1024,
      "hop_size": 480,
      "fmin": 50,
      "fmax": 14000,
      "class_num": 527,
      "model_type": "HTSAT",
      "model_name": "base"
    }
  },
  "LLM": {
    "t5_model": "google/flan-t5-xl",
    "repetition_penalty": 10.0
  },
  "num_query_tokens": 16,
  "embed_dim": 256,
  "max_txt_len": 120,
  "prompt": "Caption this data:",
  "apply_lemmatizer": false,
  "atRandom": true
}