ivillar
/

lp-music-caps

music-captioning

Model card · Files and versions

ivillar committed on Mar 12, 2024

Commit

5e4e6f5

·

1 Parent(s): ef418fe

Change request logic

Files changed (1) hide show

handler.py +14 -14

handler.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Dict, List, Any
 import numpy as np
 import librosa
 import os
 def preprocess_audio(audio_signal, sr, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
     audio = librosa.to_mono(audio_signal)
@@ -51,14 +51,14 @@ class EndpointHandler:
             inference += f"{time}\n{text} \n \n"
         return inference
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        audio_bytes = data["audio_bytes"]
-        audio_shape = tuple([int(x) for x in data["audio_shape"].split(', ')])
-        audio_dtype = data["audio_dtype"]
         sr = data["sampling_rate"]
-        input_audio = np.frombuffer(audio_bytes, dtype=audio_dtype).reshape(audio_shape)
         preprocessed_audio = preprocess_audio(input_audio, sr)
         return self._captioning(preprocessed_audio)
@@ -72,17 +72,17 @@ if __name__ == "__main__":
     audio_path = "folk.wav"
     np_audio, sr = librosa.load(audio_path, sr=44100)
-    np_bytes = np_audio.tobytes()
     np_shape = np_audio.shape
     np_dtype = np_audio.dtype.name
-    request = {
-        "audio_bytes": np_bytes,
-        "audio_shape": ', '.join(map(str, np_shape)),
         "audio_dtype": np_dtype,
         "sampling_rate": sr
-    }
     print(f"Loaded {audio_path} with sample rate {sr}")
-    print(handler.__call__(request))
 """

 import numpy as np
 import librosa
 import os
+import json
 def preprocess_audio(audio_signal, sr, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
     audio = librosa.to_mono(audio_signal)
             inference += f"{time}\n{text} \n \n"
         return inference
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        data = json.loads(data["payload"])
+        array = np.array(data['audio_list'], dtype=data["audio_dtype"])
+        array_shape = data['audio_shape']
+        input_audio = array.reshape(array_shape)
         sr = data["sampling_rate"]
         preprocessed_audio = preprocess_audio(input_audio, sr)
         return self._captioning(preprocessed_audio)
     audio_path = "folk.wav"
     np_audio, sr = librosa.load(audio_path, sr=44100)
+    np_list = np_audio.tolist()
     np_shape = np_audio.shape
     np_dtype = np_audio.dtype.name
+    request = json.dumps({
+        "audio_list": np_list,
+        "audio_shape": np_shape,
         "audio_dtype": np_dtype,
         "sampling_rate": sr
+    })
     print(f"Loaded {audio_path} with sample rate {sr}")
+    print(handler.__call__({"payload": request}))
 """