speecht5_tts

@@ -1,37 +1,37 @@
 from transformers import pipeline
 import torch
 import soundfile as sf
-import base64
 import io
 # Pre-change revision of the text-to-speech handler as shown by this diff:
 # lines with a "-" gutter are the ones the change removes. In this revision
 # __call__ answers with a JSON-style body holding the WAV data as base64 text.
 class EndpointHandler:
-    def __init__(self):
         # Builds the "text-to-speech" pipeline for microsoft/speecht5_tts.
         self.synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
         # NOTE(review): load_dataset is called here but none of the visible
         # imports provides it — presumably `from datasets import load_dataset`
         # is missing; confirm, otherwise this raises NameError.
         self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    def __call__(self, data):
         # The text to speak is read from the "inputs" key; "" when absent.
         text = data.get("inputs", "")
         # Row 7306 of the dataset is always used, so every request gets the
         # same "xvector"; unsqueeze(0) adds a leading dimension of size 1.
         speaker_embedding = torch.tensor(self.embeddings_dataset[7306]["xvector"]).unsqueeze(0)
         # Generate speech using the synthesiser
         speech = self.synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
-        # Convert numpy audio array to bytes
-        audio_bytes = io.BytesIO()
-        sf.write(audio_bytes, speech["audio"], samplerate=speech["sampling_rate"], format='WAV')
-        audio_bytes.seek(0)
-        audio_base64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
-        # Create response
         # The response always reports statusCode 200; no error path is visible.
         response = {
             "statusCode": 200,
-            "body": {
-                "audio": audio_base64,
-                "sampling_rate": speech["sampling_rate"]
-            },
-            "headers": {
-                "Content-Type": "audio/wav"
-            }
         }
         return response

+from typing import  Dict
 from transformers import pipeline
 import torch
 import soundfile as sf
 import io
 class EndpointHandler:
     """Text-to-speech request handler built on ``microsoft/speecht5_tts``.

     The handler is constructed once and then called per request; each call
     turns the request text into WAV-encoded audio bytes.
     """

     # Row of the x-vector dataset that supplies the (fixed) speaker voice.
     SPEAKER_INDEX = 7306

     def __init__(self, path=""):
         """Load the synthesis pipeline and the speaker embedding.

         Args:
             path: Accepted for interface compatibility; it is not used because
                 the model name is hard-coded below.
         """
         # `load_dataset` used to be referenced without any import, which
         # raised NameError on construction. Import it where it is needed.
         from datasets import load_dataset

         self.synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
         self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
         # The speaker never changes between requests, so build the tensor once
         # here instead of on every call. unsqueeze(0) adds a leading
         # dimension of size 1.
         self.speaker_embedding = torch.tensor(
             self.embeddings_dataset[self.SPEAKER_INDEX]["xvector"]
         ).unsqueeze(0)

     def __call__(self, data: Dict[str, object]) -> Dict[str, object]:
         """Synthesise speech for ``data["inputs"]`` and return it as WAV bytes.

         Args:
             data: Request payload. The text is read from the ``"inputs"`` key
                 and defaults to ``""`` when the key is absent.

         Returns:
             A dict with ``"statusCode"`` (always 200), ``"body"`` (the WAV
             file content as raw bytes) and ``"headers"`` (the content type).
         """
         text = data.get("inputs", "")
         # Generate speech using the synthesiser.
         speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
         # Encode the audio as WAV into an in-memory buffer.
         audio_buffer = io.BytesIO()
         sf.write(file=audio_buffer, data=speech["audio"], samplerate=speech["sampling_rate"], format='WAV')
         # getvalue() returns the whole buffer regardless of the current
         # position, so no seek/read round trip is needed.
         return {
             "statusCode": 200,
             "body": audio_buffer.getvalue(),
             "headers": {"Content-Type": "audio/wav"},
         }