falcon-40b

@@ -8,7 +8,7 @@ class EndpointHandler:
     def __init__(self, path=""):
         tokenizer = AutoTokenizer.from_pretrained(path)
         model = AutoModelForCausalLM.from_pretrained(path,
-                                                     torch_dtype=torch.float16,
                                                      trust_remote_code=True)
         device = "cuda:0" if torch.cuda.is_available() else "cpu"
         self.pipeline = transformers.pipeline('text-generation',
@@ -19,6 +19,6 @@ class EndpointHandler:
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
-        with torch.autocast(self.pipeline.device.type, dtype=torch.float16):
             outputs = self.pipeline(inputs, **parameters, use_cache=True)
             return outputs

     def __init__(self, path=""):
         tokenizer = AutoTokenizer.from_pretrained(path)
         model = AutoModelForCausalLM.from_pretrained(path,
+                                                     torch_dtype=torch.bfloat16,
                                                      trust_remote_code=True)
         device = "cuda:0" if torch.cuda.is_available() else "cpu"
         self.pipeline = transformers.pipeline('text-generation',
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         """Run the text-generation pipeline on one request payload.

         Args:
             data: Request payload. The ``"inputs"`` entry is used as the
                 prompt; when it is absent the payload itself is passed
                 on. The optional ``"parameters"`` entry is a mapping of
                 extra keyword arguments forwarded to the pipeline. Both
                 keys are popped from ``data``.

         Returns:
             Whatever ``self.pipeline`` returns, unchanged.
             NOTE(review): the annotation says ``Dict[str, str]``; confirm
             it matches what the pipeline actually produces.
         """
         inputs = data.pop("inputs", data)
         # A payload may carry `"parameters": null`; treat that like an
         # absent key. Copy so the caller's mapping is not modified below.
         parameters = dict(data.pop("parameters", None) or {})
         # Passing use_cache=True next to **parameters raised TypeError
         # (duplicate keyword) whenever the request set use_cache itself;
         # apply it as a default instead so an explicit value wins.
         parameters.setdefault("use_cache", True)
         # Run under bfloat16 autocast on the pipeline's device type.
         with torch.autocast(self.pipeline.device.type, dtype=torch.bfloat16):
             return self.pipeline(inputs, **parameters)