| | --- |
| | license: other |
| | language: |
| | - en |
| | base_model: |
| | - facebook/sam-audio-base |
| | pipeline_tag: audio-to-audio |
| | library_name: mlx-audio |
| | tags: |
| | - audio-to-audio |
| | - speech |
| | - speech generation |
| | - voice isolation |
| | - mlx |
| | --- |
| | # mlx-community/sam-audio-base |
| | This model was converted to MLX format from [`facebook/sam-audio-base`](https://huggingface.co/facebook/sam-audio-base) using mlx-audio version **0.3.2**. |
| | Refer to the [original model card](https://huggingface.co/facebook/sam-audio-base) for more details on the model. |
| |
|
| | ## Use with mlx |
| | ```bash |
| | pip install -U mlx-audio |
| | ``` |
| |
|
| | ## Voice Isolation |
| | ```python |
| | from mlx_audio.sts import SAMAudio, SAMAudioProcessor, save_audio |
| | import mlx.core as mx |
| | |
| | # Load model and processor |
| | processor = SAMAudioProcessor.from_pretrained("mlx-community/sam-audio-base") |
| | model = SAMAudio.from_pretrained("mlx-community/sam-audio-base") |
| | |
| | # Process inputs |
| | batch = processor( |
| | descriptions=["speech"], |
| | audios=["path/to/audio.mp3"], |
| | # anchors=[[("+ ", 0.2, 0.5)]], # Optional: temporal anchors |
| | ) |
| | |
| | # Separate audio |
| | result = model.separate( |
| | audios=batch.audios, |
| | descriptions=batch.descriptions, |
| | sizes=batch.sizes, |
| | anchor_ids=batch.anchor_ids, |
| | anchor_alignment=batch.anchor_alignment, |
| | ode_decode_chunk_size=50, # Chunked decoding for memory efficiency |
| | ) |
| | |
| | # For long audio files, use separate_long() instead. |
| | # Note: it is slower than separate() but more memory-efficient. |
| | # result = model.separate_long( |
| | # audios=batch.audios, |
| | # descriptions=batch.descriptions, |
| | # chunk_seconds=10.0, |
| | # overlap_seconds=3.0, |
| | # anchor_ids=batch.anchor_ids, |
| | # anchor_alignment=batch.anchor_alignment, |
| | # ode_decode_chunk_size=50, # Chunked decoding for memory efficiency |
| | # ) |
| | |
| | # Save output |
| | ## Isolated speech |
| | save_audio(result.target[0], "separated.wav", sample_rate=model.sample_rate) |
| | |
| | ## Residual audio (background music/noise/other sounds) |
| | save_audio(result.residual[0], "residual.wav", sample_rate=model.sample_rate) |
| | |
| | # Check memory usage |
| | print(f"Peak memory: {result.peak_memory:.2f} GB") |
| | ``` |
| |
|