Update handler.py

Browse files

Files changed (1) hide show

handler.py +79 -9

handler.py CHANGED Viewed

@@ -1,11 +1,39 @@
 import os
 import torch
 import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from snac import SNAC
 class EndpointHandler:
     def __init__(self, path=""):
         # Load the Orpheus model and tokenizer
         self.model_name = "hypaai/Hypa_Orpheus-3b-0.1-ft-unsloth-merged_16bit"
         self.model = AutoModelForCausalLM.from_pretrained(
@@ -16,13 +44,20 @@ class EndpointHandler:
         # Move model to GPU if available
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         # Load SNAC model for audio decoding
-        self.snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
-        self.snac_model.to(self.device)
         # Special tokens
         self.start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
@@ -31,14 +66,20 @@ class EndpointHandler:
         self.start_audio_token = 128257  # Start of Audio token
         self.end_audio_token = 128258  # End of Audio token
-        print(f"Model loaded on {self.device}")
     def preprocess(self, data):
         """
         Preprocess input data before inference
         """
         # HF Inference API format: 'inputs' is the text, 'parameters' contains the config
-        # Handle both direct access and standardized HF format
         if isinstance(data, dict) and "inputs" in data:
             # Standard HF format
             text = data["inputs"]
@@ -57,6 +98,7 @@ class EndpointHandler:
         # Format prompt with voice
         prompt = f"{voice}: {text}"
         # Tokenize
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
@@ -74,13 +116,18 @@ class EndpointHandler:
             "temperature": temperature,
             "top_p": top_p,
             "max_new_tokens": max_new_tokens,
-            "repetition_penalty": repetition_penalty
         }
     def inference(self, inputs):
         """
         Run model inference on the preprocessed inputs
         """
         # Extract parameters
         input_ids = inputs["input_ids"]
         attention_mask = inputs["attention_mask"]
@@ -89,6 +136,8 @@ class EndpointHandler:
         max_new_tokens = inputs["max_new_tokens"]
         repetition_penalty = inputs["repetition_penalty"]
         # Generate output tokens
         with torch.no_grad():
             generated_ids = self.model.generate(
@@ -103,20 +152,29 @@ class EndpointHandler:
                 eos_token_id=self.end_audio_token,
             )
         return generated_ids
     def postprocess(self, generated_ids):
         """
         Process generated tokens into audio
         """
         # Find Start of Audio token
         token_indices = (generated_ids == self.start_audio_token).nonzero(as_tuple=True)
         if len(token_indices[1]) > 0:
             last_occurrence_idx = token_indices[1][-1].item()
             cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
         else:
             cropped_tensor = generated_ids
         # Remove End of Audio tokens
         processed_rows = []
@@ -137,8 +195,16 @@ class EndpointHandler:
         # Generate audio from codes
         audio_samples = []
         for code_list in code_lists:
-            audio = self.redistribute_codes(code_list)
-            audio_samples.append(audio)
         # Return first (and only) audio sample
         audio_sample = audio_samples[0].detach().squeeze().cpu().numpy()
@@ -162,6 +228,7 @@ class EndpointHandler:
         # Encode as base64
         audio_b64 = base64.b64encode(wav_data).decode('utf-8')
         return {
             "audio_b64": audio_b64,
@@ -205,7 +272,8 @@ class EndpointHandler:
             logger.info(f"Received request: {type(data)}")
             # Check if we need to handle the health check route
-            if data == "ping" or data == {"inputs": "ping"}:
                 return {"status": "ok"}
             preprocessed_inputs = self.preprocess(data)
@@ -216,4 +284,6 @@ class EndpointHandler:
             logger.error(f"Error processing request: {str(e)}")
             import traceback
             logger.error(traceback.format_exc())
-            return {"error": str(e)}

+"""
+# Orpheus TTS Handler - Explanation & Deployment Guide
+This guide explains how to properly deploy the Orpheus TTS model with the custom
+handler on Hugging Face Inference Endpoints.
+## The Problem
+Based on the error messages you're seeing:
+1. The connection is working (you get responses).
+2. However, the responses contain text rather than audio data.
+3. The response format is the standard HF format: [{"generated_text": "..."}]
+This indicates that your endpoint is running the standard text-generation handler
+rather than the custom audio-generation handler you've defined.
+## Step 1: Properly package your handler
+Create a `handler.py` file with your custom handler code:
+"""
+# Code from your original handler, but with some fixes
 import os
 import torch
 import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from snac import SNAC
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 class EndpointHandler:
     def __init__(self, path=""):
+        logger.info("Initializing Orpheus TTS handler")
         # Load the Orpheus model and tokenizer
         self.model_name = "hypaai/Hypa_Orpheus-3b-0.1-ft-unsloth-merged_16bit"
         self.model = AutoModelForCausalLM.from_pretrained(
         # Move model to GPU if available
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
+        logger.info(f"Model loaded on {self.device}")
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        logger.info("Tokenizer loaded")
         # Load SNAC model for audio decoding
+        try:
+            self.snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+            self.snac_model.to(self.device)
+            logger.info("SNAC model loaded")
+        except Exception as e:
+            logger.error(f"Error loading SNAC: {str(e)}")
+            raise
         # Special tokens
         self.start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
         self.start_audio_token = 128257  # Start of Audio token
         self.end_audio_token = 128258  # End of Audio token
+        logger.info("Handler initialization complete")
     def preprocess(self, data):
         """
         Preprocess input data before inference
         """
+        logger.info(f"Preprocessing data: {type(data)}")
+        # Handle health check
+        if data == "ping" or (isinstance(data, dict) and data.get("inputs") == "ping"):
+            logger.info("Health check detected")
+            return {"health_check": True}
         # HF Inference API format: 'inputs' is the text, 'parameters' contains the config
         if isinstance(data, dict) and "inputs" in data:
             # Standard HF format
             text = data["inputs"]
         # Format prompt with voice
         prompt = f"{voice}: {text}"
+        logger.info(f"Formatted prompt with voice {voice}")
         # Tokenize
         input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
             "temperature": temperature,
             "top_p": top_p,
             "max_new_tokens": max_new_tokens,
+            "repetition_penalty": repetition_penalty,
+            "health_check": False
         }
     def inference(self, inputs):
         """
         Run model inference on the preprocessed inputs
         """
+        # Handle health check
+        if inputs.get("health_check", False):
+            return {"status": "ok"}
         # Extract parameters
         input_ids = inputs["input_ids"]
         attention_mask = inputs["attention_mask"]
         max_new_tokens = inputs["max_new_tokens"]
         repetition_penalty = inputs["repetition_penalty"]
+        logger.info(f"Running inference with max_new_tokens={max_new_tokens}")
         # Generate output tokens
         with torch.no_grad():
             generated_ids = self.model.generate(
                 eos_token_id=self.end_audio_token,
             )
+        logger.info(f"Generation complete, output shape: {generated_ids.shape}")
         return generated_ids
     def postprocess(self, generated_ids):
         """
         Process generated tokens into audio
         """
+        # Handle health check response
+        if isinstance(generated_ids, dict) and "status" in generated_ids:
+            return generated_ids
+        logger.info("Postprocessing generated tokens")
         # Find Start of Audio token
         token_indices = (generated_ids == self.start_audio_token).nonzero(as_tuple=True)
         if len(token_indices[1]) > 0:
             last_occurrence_idx = token_indices[1][-1].item()
             cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
+            logger.info(f"Found start audio token at position {last_occurrence_idx}")
         else:
             cropped_tensor = generated_ids
+            logger.warning("No start audio token found")
         # Remove End of Audio tokens
         processed_rows = []
         # Generate audio from codes
         audio_samples = []
         for code_list in code_lists:
+            logger.info(f"Processing code list of length {len(code_list)}")
+            if len(code_list) > 0:
+                audio = self.redistribute_codes(code_list)
+                audio_samples.append(audio)
+            else:
+                logger.warning("Empty code list, no audio to generate")
+        if not audio_samples:
+            logger.error("No audio samples generated")
+            return {"error": "No audio samples generated"}
         # Return first (and only) audio sample
         audio_sample = audio_samples[0].detach().squeeze().cpu().numpy()
         # Encode as base64
         audio_b64 = base64.b64encode(wav_data).decode('utf-8')
+        logger.info(f"Audio encoded as base64, length: {len(audio_b64)}")
         return {
             "audio_b64": audio_b64,
             logger.info(f"Received request: {type(data)}")
             # Check if we need to handle the health check route
+            if data == "ping" or (isinstance(data, dict) and data.get("inputs") == "ping"):
+                logger.info("Processing health check request")
                 return {"status": "ok"}
             preprocessed_inputs = self.preprocess(data)
             logger.error(f"Error processing request: {str(e)}")
             import traceback
             logger.error(traceback.format_exc())
+            return {"error": str(e)}
+"