push updates to handler

Files changed (3) hide show

Phi-3-medium-128k-instruct-IQ2_XS.gguf DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8c769c4137173dd434c070e116e4b0599af2b12752ba4c7188a1bf8bf5372a55
-size 4127405088

handler.py CHANGED Viewed

@@ -1,33 +1,30 @@
-from typing import Dict, List, Any
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 class EndpointHandler:
     def __init__(self, path=""):
-        # load model and processor from path
-        self.tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)
-        self.model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-medium-4k-instruct", trust_remote_code=True)
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        """
-        Args:
-            data (:obj:):
-                includes the deserialized image file as PIL.Image
-        """
-        # process input
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
-        # preprocess
-        input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids
         # pass inputs with all kwargs in data
         if parameters is not None:
-            outputs = self.model.generate(input_ids, **parameters)
         else:
-            outputs = self.model.generate(input_ids)
         # postprocess the prediction
-        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return [{"generated_text": prediction}]

 import torch
+from typing import Dict, List, Any
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 class EndpointHandler:
+    """Callable wrapper around a transformers "text-generation" pipeline
+    for microsoft/Phi-3-mini-128k-instruct."""
     def __init__(self, path=""):
+        """Load the tokenizer and model and build the generation pipeline.
+
+        ``path`` is accepted but not used: the weights are resolved from
+        the hard-coded model id below.
+        """
+        # load the model
+        tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            "microsoft/Phi-3-mini-128k-instruct",
+            # device_map must be passed exactly once: repeating the keyword
+            # (previously "auto" and "cuda") is a SyntaxError in Python.
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True
+        )
+        # create inference pipeline
+        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
+    def __call__(self, data: Any) -> List[Any]:
+        """Run the pipeline on one request payload.
+
+        ``data`` is read with ``pop``: key "inputs" is the prompt (the whole
+        payload is used when the key is absent) and optional key
+        "parameters" is forwarded to the pipeline as keyword arguments.
+        Returns the pipeline output unchanged.
+        NOTE(review): presumably a list of {"generated_text": str} dicts;
+        confirm against the pipeline documentation.
+        """
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
         # pass inputs with all kwargs in data
         if parameters is not None:
+            prediction = self.pipeline(inputs, **parameters)
         else:
+            prediction = self.pipeline(inputs)
         # postprocess the prediction
+        return prediction

requirements.txt DELETED Viewed