handler.py for running SongFormer on HF inference endpoint

Files changed (2) hide show

handler.py +172 -0
requirements.txt +31 -0

handler.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""
+HuggingFace Inference Endpoint Handler for SongFormer
+Supports binary audio input (WAV, MP3, etc.) via base64 encoding or direct bytes
+"""
+import os
+import sys
+import io
+import base64
+import json
+import tempfile
+from typing import Dict, Any, Union
+import librosa
+import numpy as np
+import torch
+from transformers import AutoModel
+class EndpointHandler:
+    """
+    HuggingFace Inference Endpoint Handler for SongFormer model.
+    Accepts base64-encoded audio (WAV, MP3, FLAC, etc.)
+    """
+    def __init__(self, path: str = ""):
+        """
+        Initialize the handler and load the SongFormer model.
+        Args:
+            path: Path to the model directory (provided by HuggingFace)
+        """
+        # Set up environment
+        self.model_path = path or os.getcwd()
+        os.environ["SONGFORMER_LOCAL_DIR"] = self.model_path
+        sys.path.insert(0, self.model_path)
+        # Import after setting up path
+        # Load the model
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Loading SongFormer model on {self.device}...")
+        # Load model without device_map to avoid meta device initialization
+        # The SongFormerModel.__init__ now handles meta device detection
+        self.model = AutoModel.from_pretrained(
+            self.model_path,
+            trust_remote_code=True,
+            device_map=None,
+        )
+        self.model.to(self.device)
+        self.model.eval()
+        # Expected sampling rate for the model
+        self.target_sr = 24000
+        print("SongFormer model loaded successfully!")
+    def _decode_base64_audio(self, audio_b64: str) -> np.ndarray:
+        """
+        Decode base64-encoded audio to numpy array.
+        Args:
+            audio_b64: Base64-encoded audio string
+        Returns:
+            numpy array of audio samples at 24kHz
+        """
+        # Decode base64 string to bytes
+        try:
+            audio_bytes = base64.b64decode(audio_b64)
+        except Exception as e:
+            raise ValueError(f"Failed to decode base64 audio data: {e}")
+        # Load audio from bytes using librosa
+        # Create a file-like object from bytes
+        audio_io = io.BytesIO(audio_bytes)
+        # Load with librosa (automatically handles WAV, MP3, etc.)
+        audio_array, _ = librosa.load(audio_io, sr=self.target_sr, mono=True)
+        return audio_array
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Process inference request with base64-encoded audio.
+        Expected input:
+           {
+               "inputs": "<base64-encoded-audio-data>"
+           }
+        Returns:
+           {
+               "segments": [
+                   {
+                       "label": "intro",
+                       "start": 0.0,
+                       "end": 15.2
+                   },
+                   ...
+               ],
+               "duration": 180.5,
+               "num_segments": 8
+           }
+        """
+        try:
+            # Extract base64-encoded audio
+            audio_b64 = data.get("inputs")
+            if not audio_b64:
+                raise ValueError("Missing 'inputs' key with base64-encoded audio")
+            if not isinstance(audio_b64, str):
+                raise ValueError("Input must be a base64-encoded string")
+            # Decode audio
+            audio_array = self._decode_base64_audio(audio_b64)
+            # Run inference
+            with torch.no_grad():
+                result = self.model(audio_array)
+            # Calculate duration
+            duration = len(audio_array) / self.target_sr
+            # Format output
+            output = {
+                "segments": result,
+                "duration": float(duration),
+                "num_segments": len(result)
+            }
+            return output
+        except Exception as e:
+            # Return error in a structured format
+            return {
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "segments": [],
+                "duration": 0.0,
+                "num_segments": 0
+            }
+# For local testing
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Test SongFormer handler locally")
+    parser.add_argument("audio_file", help="Path to audio file to test")
+    parser.add_argument("--model-path", default=".", help="Path to model directory")
+    args = parser.parse_args()
+    # Initialize handler
+    handler = EndpointHandler(args.model_path)
+    # Read and encode audio file
+    with open(args.audio_file, "rb") as f:
+        audio_bytes = f.read()
+    audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
+    # Test with base64 input
+    print("\n=== Testing with base64-encoded audio ===")
+    result = handler({"inputs": audio_b64})
+    print(json.dumps(result, indent=2))
+    # Test with file path directly (for comparison)
+    print("\n=== Testing with direct file path (not typical for endpoint) ===")
+    result_direct = handler.model(args.audio_file)
+    print(json.dumps(result_direct, indent=2))

requirements.txt ADDED Viewed

	@@ -0,0 +1,31 @@

+# Requirements for HuggingFace Inference Endpoint
+# This file contains dependencies needed for the handler.py to work
+# Core ML frameworks
+transformers>=4.30.0
+torch>=2.0.0
+# Audio processing
+librosa>=0.10.0
+soundfile>=0.12.0
+audioread>=3.0.0
+# Numerical computing
+numpy>=1.24.0
+scipy>=1.10.0
+# Additional dependencies for SongFormer model
+# (these may already be installed by the model itself)
+einops>=0.7.0
+x-transformers>=1.0.0
+ema-pytorch>=0.2.0
+loguru>=0.7.0
+omegaconf>=2.3.0
+muq
+msaf
+# Note: For MP3 support, ffmpeg must be installed in the system
+# Add to Dockerfile:
+# RUN apt-get update && apt-get install -y ffmpeg