{ "model_type": "audio-spatializer", "architecture": "CrossAttnSpatializer", "sample_rate": 24000, "n_fft": 1024, "hop_length": 512, "n_mels": 128, "audio_embed_dim": 512, "text_embed_dim": 512, "nhead": 8, "num_layers": 6, "output_channels": 4, "output_format": "foa", "training": { "framework": "pytorch-lightning", "final_epoch": 14, "final_step": 342, "checkpoint": "epoch=14-step=342.ckpt" }, "spatial_parameters": { "direction": ["front", "front-left", "left", "back-left", "back", "back-right", "right", "front-right"], "elevation": ["down", "level", "up"], "distance": ["near", "mid", "far"], "room_size": ["small", "medium", "large"], "reverb": ["dry", "medium", "wet"] } }