Text-to-Audio
Diffusers
English
text-video-to-audio
text-controlled-video-to-audio
audio-controlled-video-to-audio
audio-generation
Instructions to use YJX-Xiaomi/ControlFoley with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use YJX-Xiaomi/ControlFoley with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("YJX-Xiaomi/ControlFoley", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
- Notebooks
- Google Colab
- Kaggle
| { | |
| "ext_weights": { | |
| "cav_mae_st": { | |
| "path": "ext_weights/cav_mae_st.pth", | |
| "size": "727M", | |
| "description": "CAV-MAE-ST pretrained weights" | |
| }, | |
| "empty_string": { | |
| "path": "ext_weights/empty_string.pth", | |
| "size": "310K", | |
| "description": "Empty string embedding for classifier-free guidance" | |
| }, | |
| "music_speech_audioset_clap": { | |
| "path": "ext_weights/music_speech_audioset_epoch_15_esc_89.98.pt", | |
| "size": "2.2G", | |
| "description": "CLAP model trained on music/speech/AudioSet, ESC-50 acc=89.98%" | |
| }, | |
| "synchformer": { | |
| "path": "ext_weights/synchformer_state_dict.pth", | |
| "size": "907M", | |
| "description": "Synchformer audio-visual synchronization model weights" | |
| }, | |
| "v1-44": { | |
| "path": "ext_weights/v1-44.pth", | |
| "size": "1.2G", | |
| "description": "VAE checkpoint (44kHz)" | |
| } | |
| }, | |
| "weights": { | |
| "controlfoley": { | |
| "path": "weights/controlfoley.pth", | |
| "size": "11G", | |
| "description": "ControlFoley main model checkpoint" | |
| } | |
| } | |
| } | |