Commit 2034ad0
Parent(s): d992337

Add MultiTalk custom handler for HF Inference Endpoint

Files changed:
- README.md (+77 -0)
- handler.py (+139 -0)
- requirements.txt (+16 -0)
README.md
ADDED: @@ -0,0 +1,77 @@
# MultiTalk Hugging Face Endpoint Handler

This custom handler enables the MeiGen-AI/MeiGen-MultiTalk model to run on Hugging Face Inference Endpoints.

## Setup Instructions

1. **Create a new Inference Endpoint** on Hugging Face:
   - Go to https://huggingface.co/inference-endpoints
   - Click "New endpoint"

2. **Configure the endpoint**:
   - **Model repository**: `ajwestfield/multitalk-handler` (you'll need to upload this handler to your HF account)
   - **Task**: Custom
   - **Framework**: Custom
   - **Instance type**: GPU · A100 · 1x GPU (80 GB)

3. **Advanced Configuration**:
   - **Container type**: Custom
   - **Custom image**: `pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime`
   - **Autoscaling**:
     - Min replicas: 0
     - Max replicas: 1
     - Scale to zero after: 300 seconds (5 minutes)

4. **Environment Variables** (add these in Settings):
   ```
   PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
   CUDA_VISIBLE_DEVICES=0
   ```

## Uploading the Handler

1. Create a new model repository on Hugging Face:
   ```bash
   huggingface-cli repo create multitalk-handler --type model
   ```

2. Upload the handler files:
   ```bash
   cd huggingface-endpoint/multitalk-handler
   git init
   git add .
   git commit -m "Add MultiTalk custom handler"
   git remote add origin https://huggingface.co/ajwestfield/multitalk-handler
   git push -u origin main
   ```
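Alternatively, the same upload can be done without git, via the `huggingface_hub` Python API. A minimal sketch, assuming you have already authenticated (for example with `huggingface-cli login`) and that the repo id matches the one from step 2 above:

```python
from huggingface_hub import HfApi

api = HfApi()
# Create the repo if it does not exist yet, then upload the handler folder.
api.create_repo("ajwestfield/multitalk-handler", repo_type="model", exist_ok=True)
api.upload_folder(
    folder_path="huggingface-endpoint/multitalk-handler",
    repo_id="ajwestfield/multitalk-handler",
    repo_type="model",
)
```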

## Usage

Once deployed, you can call the endpoint with:

```python
import requests
import json

API_URL = "https://YOUR-ENDPOINT-URL.endpoints.huggingface.cloud"
headers = {
    "Authorization": "Bearer YOUR_HF_TOKEN",
    "Content-Type": "application/json"
}

data = {
    "inputs": {
        "prompt": "A person speaking naturally",
        "image": "base64_encoded_image_optional"
    },
    "parameters": {
        "num_frames": 16,
        "height": 480,
        "width": 640,
        "num_inference_steps": 25
    }
}

response = requests.post(API_URL, headers=headers, json=data)
result = response.json()
```
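The handler returns base64-encoded PNG frames rather than a finished video file. A minimal sketch of decoding them to disk, continuing from the `result` above (the filenames are illustrative; the frames could then be stitched together with `imageio` or `moviepy`, both of which are in requirements.txt):

```python
import base64
import io
from PIL import Image

# Decode each base64 PNG frame from the response and write it to disk.
for i, frame_b64 in enumerate(result["frames"]):
    frame = Image.open(io.BytesIO(base64.b64decode(frame_b64)))
    frame.save(f"frame_{i:03d}.png")
```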
handler.py
ADDED: @@ -0,0 +1,139 @@
```python
import torch
import base64
import io
from typing import Dict, Any
from PIL import Image


class EndpointHandler:
    def __init__(self, path=""):
        """
        Initialize the MultiTalk model handler
        """
        # Fail early with a clear message if dependencies are missing
        try:
            from diffusers import DiffusionPipeline
            import librosa  # noqa: F401 -- presence check for audio support
        except ImportError as e:
            print(f"Missing dependency: {e}")
            print("Please ensure all requirements are installed")
            raise

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Initialize the model, with memory-saving options where available
        try:
            self.pipeline = DiffusionPipeline.from_pretrained(
                path if path else "MeiGen-AI/MeiGen-MultiTalk",
                torch_dtype=torch.float16,
                device_map="auto"
            )

            # Enable memory-efficient attention and VAE slicing if supported
            if hasattr(self.pipeline, "enable_attention_slicing"):
                self.pipeline.enable_attention_slicing()

            if hasattr(self.pipeline, "enable_vae_slicing"):
                self.pipeline.enable_vae_slicing()

            print("Model loaded successfully")

        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process the inference request

        Args:
            data: Input data containing:
                - inputs: The input prompt or image
                - parameters: Additional generation parameters

        Returns:
            Dict containing the generated output
        """
        try:
            # Extract inputs
            inputs = data.get("inputs", "")
            parameters = data.get("parameters", {})

            # Handle different input types
            if isinstance(inputs, str):
                # Text prompt input
                prompt = inputs
                image = None
            elif isinstance(inputs, dict):
                prompt = inputs.get("prompt", "")
                # Handle base64 encoded image if provided
                if "image" in inputs:
                    image_data = base64.b64decode(inputs["image"])
                    image = Image.open(io.BytesIO(image_data))
                else:
                    image = None
            else:
                prompt = str(inputs)
                image = None

            # Set default parameters
            num_inference_steps = parameters.get("num_inference_steps", 25)
            guidance_scale = parameters.get("guidance_scale", 7.5)
            height = parameters.get("height", 480)
            width = parameters.get("width", 640)
            num_frames = parameters.get("num_frames", 16)

            # Generate video
            with torch.no_grad():
                if hasattr(self.pipeline, "__call__"):
                    result = self.pipeline(
                        prompt=prompt,
                        image=image,
                        height=height,
                        width=width,
                        num_frames=num_frames,
                        num_inference_steps=num_inference_steps,
                        guidance_scale=guidance_scale
                    )

                    # Handle the output
                    if hasattr(result, "frames"):
                        frames = result.frames[0] if len(result.frames) > 0 else []

                        # Convert frames to base64-encoded PNG images
                        encoded_frames = []
                        for frame in frames:
                            if isinstance(frame, Image.Image):
                                buffered = io.BytesIO()
                                frame.save(buffered, format="PNG")
                                img_str = base64.b64encode(buffered.getvalue()).decode()
                                encoded_frames.append(img_str)

                        return {
                            "frames": encoded_frames,
                            "num_frames": len(encoded_frames),
                            "message": "Video generated successfully"
                        }
                    else:
                        return {
                            "error": "Model output format not recognized",
                            "result": str(result)
                        }
                else:
                    return {
                        "error": "Model pipeline not properly initialized"
                    }

        except Exception as e:
            import traceback
            return {
                "error": str(e),
                "traceback": traceback.format_exc()
            }
```
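Before deploying, the handler can be exercised locally the same way Inference Endpoints invokes it. A minimal smoke test, under the assumption that the local machine has enough VRAM to load the model (otherwise `__init__` will raise):

```python
from handler import EndpointHandler

# Instantiate the handler (downloads/loads the model) and run one request.
handler = EndpointHandler()
output = handler({
    "inputs": {"prompt": "A person speaking naturally"},
    "parameters": {"num_frames": 8, "num_inference_steps": 10},
})
print(output.get("error") or f"Generated {output['num_frames']} frames")
```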
requirements.txt
ADDED: @@ -0,0 +1,16 @@
```
torch==2.4.1
torchvision==0.19.1
torchaudio==2.4.1
xformers==0.0.28
flash-attn==2.7.4.post1
diffusers
transformers
accelerate
librosa
ffmpeg-python
opencv-python-headless
numpy
Pillow
scipy
imageio
moviepy
```
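A practical caveat, stated as an assumption rather than something verified here: `flash-attn` builds its CUDA extensions from source when no prebuilt wheel matches the environment, and the suggested `pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime` image does not ship the CUDA compiler (`nvcc`). If installation fails at that step, the `-devel` variant of the same image is the usual fix. The `torch`, `torchvision`, and `torchaudio` pins match the 2.4.1 base image, so pip should leave the preinstalled copies in place.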