Update handler.py
handler.py CHANGED (+37 -35)
New contents of handler.py:

```python
from typing import Dict, List, Any

import torch  # required for torch.bfloat16 below
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Need to set HF_TOKEN on the endpoint creation process for this to work
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"


class EndpointHandler:
    def __init__(self, path=""):
        # create inference pipeline
        self.pipeline = pipeline(
            "text-generation",
            model=model_name,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto",
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Input args:
            data: a dict with elements:
                inputs: List[List[Dict[str, str]]] or List[str], inputs to
                    batch-process in conversational format
                parameters: Any, generation parameters passed through to the model
        Outputs:
            a list with one {'next_chat_turn': ...} dict per input, holding the
            model's reply (the last message of the generated conversation)
        """

        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)

        # pass inputs with all kwargs in data
        if parameters is not None:
            predictions = self.pipeline(inputs, **parameters)
        else:
            predictions = self.pipeline(inputs)

        # postprocess the prediction: keep only the newly generated chat turn
        return [{'next_chat_turn': e[0]["generated_text"][-1]} for e in predictions]
```