Commit ·
38ea731
0
Parent(s):
Duplicate from philschmid/multi-model-inference-endpoint
Browse filesCo-authored-by: Philipp Schmid <philschmid@users.noreply.huggingface.co>
- .gitattributes +34 -0
- README.md +56 -0
- handler.py +41 -0
- inference.png +0 -0
- mmie.png +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- endpoints-template
|
| 4 |
+
license: apache-2.0
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
# Multi-Model GPU Inference with Hugging Face Inference Endpoints
|
| 8 |
+
|
| 9 |
+
Multi-model Inference Endpoints provide a way to deploy multiple models onto the same infrastructure for a scalable and cost-effective inference. On multi-model Inference Endpoints, we load a list of models into memory, either CPU or GPU, and dynamically use them during inference time.
|
| 10 |
+
|
| 11 |
+
The following diagram shows how multi-model inference endpoints look.
|
| 12 |
+
|
| 13 |
+

|
| 14 |
+
|
| 15 |
+
This repository includes a [custom handler](handler.py) of a sample multi-model `EndpointHandler` implementation. This multi-model handler loads 5 different models for inference including:
|
| 16 |
+
- `DistilBERT` model for `sentiment-analysis`
|
| 17 |
+
- `Marian` model for `translation`
|
| 18 |
+
- `BART` model for `summarization`
|
| 19 |
+
- `BERT` model for `token-classification`
|
| 20 |
+
- `BERT` model for `text-classification`
|
| 21 |
+
|
| 22 |
+
If you want to learn more about multi-model inference endpoints checkout https://www.philschmid.de/multi-model-inference-endpoints
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Use with Inference Endpoints
|
| 26 |
+
|
| 27 |
+
Hugging Face Inference Endpoints can be used with an HTTP client in any language. We will use Python and the `requests` library to send our requests. (Make sure you have it installed: `pip install requests`.)
|
| 28 |
+
|
| 29 |
+

|
| 30 |
+
|
| 31 |
+
## Send requests with Python
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
import json
|
| 35 |
+
import requests as r
|
| 36 |
+
|
| 37 |
+
ENDPOINT_URL = "" # url of your endpoint
|
| 38 |
+
HF_TOKEN = "" # token of the account you deployed
|
| 39 |
+
|
| 40 |
+
# define model and payload
|
| 41 |
+
model_id = "facebook/bart-large-cnn"
|
| 42 |
+
text = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."
|
| 43 |
+
request_body = {"inputs": text, "model_id": model_id}
|
| 44 |
+
|
| 45 |
+
# HTTP headers for authorization
|
| 46 |
+
headers= {
|
| 47 |
+
"Authorization": f"Bearer {HF_TOKEN}",
|
| 48 |
+
"Content-Type": "application/json"
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
# send request
|
| 52 |
+
response = r.post(ENDPOINT_URL, headers=headers, json=request_body)
|
| 53 |
+
prediction = response.json()
|
| 54 |
+
|
| 55 |
+
# [{'summary_text': 'The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.'}]
|
| 56 |
+
```
|
handler.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from typing import Dict, List, Any
|
| 3 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
| 4 |
+
|
| 5 |
+
# check for GPU
|
| 6 |
+
device = 0 if torch.cuda.is_available() else -1
|
| 7 |
+
|
| 8 |
+
# multi-model list
|
| 9 |
+
multi_model_list = [
|
| 10 |
+
{"model_id": "distilbert-base-uncased-finetuned-sst-2-english", "task": "text-classification"},
|
| 11 |
+
{"model_id": "Helsinki-NLP/opus-mt-en-de", "task": "translation"},
|
| 12 |
+
{"model_id": "facebook/bart-large-cnn", "task": "summarization"},
|
| 13 |
+
{"model_id": "dslim/bert-base-NER", "task": "token-classification"},
|
| 14 |
+
{"model_id": "textattack/bert-base-uncased-ag-news", "task": "text-classification"},
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class EndpointHandler():
|
| 19 |
+
def __init__(self, path=""):
|
| 20 |
+
self.multi_model={}
|
| 21 |
+
# load all the models onto device
|
| 22 |
+
for model in multi_model_list:
|
| 23 |
+
self.multi_model[model["model_id"]] = pipeline(model["task"], model=model["model_id"], device=device)
|
| 24 |
+
|
| 25 |
+
def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
|
| 26 |
+
# deserialize incomin request
|
| 27 |
+
inputs = data.pop("inputs", data)
|
| 28 |
+
parameters = data.pop("parameters", None)
|
| 29 |
+
model_id = data.pop("model_id", None)
|
| 30 |
+
|
| 31 |
+
# check if model_id is in the list of models
|
| 32 |
+
if model_id is None or model_id not in self.multi_model:
|
| 33 |
+
raise ValueError(f"model_id: {model_id} is not valid. Available models are: {list(self.multi_model.keys())}")
|
| 34 |
+
|
| 35 |
+
# pass inputs with all kwargs in data
|
| 36 |
+
if parameters is not None:
|
| 37 |
+
prediction = self.multi_model[model_id](inputs, **parameters)
|
| 38 |
+
else:
|
| 39 |
+
prediction = self.multi_model[model_id](inputs)
|
| 40 |
+
# postprocess the prediction
|
| 41 |
+
return prediction
|
inference.png
ADDED
|
mmie.png
ADDED
|