zenlm
/

zen-foley

@@ -1,6 +1,6 @@
 ---
 library_name: diffusers
-pipeline_tag: video-to-audio
 language:
   - en
 license: other
@@ -59,14 +59,12 @@ client = OpenAI(
     api_key='your-api-key',
 )
-# Generate foley audio from video
-with open('video.mp4', 'rb') as f:
-    response = client.audio.speech.create(
-        model='zen-foley',
-        input='footsteps on gravel with ambient wind',
-        extra_body={'video': f.read()},
-        voice='foley',
-    )
 response.stream_to_file('foley.wav')
 ```
@@ -86,7 +84,11 @@ foley_model = torch.load(
 # Load auxiliary models
 vae = torch.load('vae_128d_48k.pth', map_location=device, weights_only=False)
-sync_encoder = torch.load('synchformer_state_dict.pth', map_location=device, weights_only=False)
 ```
 See [github.com/zenlm/zen-audio](https://github.com/zenlm/zen-audio) for the full inference pipeline.

 ---
 library_name: diffusers
+pipeline_tag: text-to-audio
 language:
   - en
 license: other
     api_key='your-api-key',
 )
+# Generate foley audio from a text description of the video
+response = client.audio.speech.create(
+    model='zen-foley',
+    input='footsteps on gravel with ambient wind',
+    voice='foley',
+)
 response.stream_to_file('foley.wav')
 ```
 # Load auxiliary models
 vae = torch.load('vae_128d_48k.pth', map_location=device, weights_only=False)
+sync_encoder = torch.load(
+    'synchformer_state_dict.pth',
+    map_location=device,
+    weights_only=False,
+)
 ```
 See [github.com/zenlm/zen-audio](https://github.com/zenlm/zen-audio) for the full inference pipeline.