falcon-40b

@@ -4,22 +4,34 @@ from typing import Any, Dict
 from transformers import AutoModelForCausalLM, AutoTokenizer
 class EndpointHandler:
     def __init__(self, path=""):
-        tokenizer = AutoTokenizer.from_pretrained(path)
-        model = AutoModelForCausalLM.from_pretrained(path,
-                                                     torch_dtype=torch.bfloat16,
-                                                     trust_remote_code=True)
-        device = "cuda:0" if torch.cuda.is_available() else "cpu"
-        self.pipeline = transformers.pipeline('text-generation',
-                                              model=model,
-                                              tokenizer=tokenizer,
-                                              device=device)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
-        torch.cuda.empty_cache()
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
         with torch.autocast(self.pipeline.device.type, dtype=torch.bfloat16):
-            outputs = self.pipeline(inputs, **parameters, use_cache=True)
-            return outputs

 from transformers import AutoModelForCausalLM, AutoTokenizer
+def load_pipeline(path):
+    """Build a text-generation pipeline from the checkpoint at ``path``.
+
+    The tokenizer is loaded first, then the causal-LM weights in bfloat16,
+    and the resulting pipeline is placed on ``cuda:0`` when CUDA is
+    available, otherwise on the CPU.
+
+    NOTE(review): ``trust_remote_code=True`` lets modelling code shipped
+    with the checkpoint be executed; only point ``path`` at checkpoints
+    you trust.
+
+    Args:
+        path: Checkpoint location forwarded unchanged to ``from_pretrained``.
+
+    Returns:
+        The object returned by ``transformers.pipeline``.
+    """
+    text_tokenizer = AutoTokenizer.from_pretrained(path)
+    causal_lm = AutoModelForCausalLM.from_pretrained(
+        path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+    )
+    target_device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    return transformers.pipeline(
+        'text-generation',
+        model=causal_lm,
+        tokenizer=text_tokenizer,
+        device=target_device,
+    )
 class EndpointHandler:
+    """Callable request handler wrapping a text-generation pipeline.
+
+    The pipeline is built once at construction time. If a request raises,
+    the handler discards the pipeline, frees what memory it can, builds a
+    fresh pipeline and then re-raises the original error to the caller.
+    """
+
     def __init__(self, path=""):
+        """Remember the checkpoint path and load the initial pipeline.
+
+        Args:
+            path: Checkpoint location handed to ``load_pipeline``; it is
+                kept on the instance so the pipeline can be rebuilt later.
+        """
+        self.path = path
+        self.pipeline = load_pipeline(self.path)
+
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
+        """Run generation for a single request payload.
+
+        Args:
+            data: Request payload. ``"inputs"`` is popped and used as the
+                prompt (the whole payload is used when the key is absent);
+                ``"parameters"`` is popped and forwarded to the pipeline
+                as keyword arguments. The dict is mutated by both pops.
+
+        Returns:
+            Whatever the pipeline returns for the given inputs.
+
+        Raises:
+            Exception: Any error raised by the pipeline is re-raised after
+                the pipeline has been rebuilt. If rebuilding fails, that
+                failure propagates instead (chained to the original error)
+                and ``self.pipeline`` is left as ``None``.
+        """
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
         with torch.autocast(self.pipeline.device.type, dtype=torch.bfloat16):
+            try:
+                return self.pipeline(inputs, **parameters, use_cache=True)
+            except Exception as exc:
+                print("Exception encountered. Reloading pipeline")
+                # Imported here because this is a cold recovery path and the
+                # file's import block is outside this change.
+                import gc
+                import traceback
+
+                # Release the failed pipeline BEFORE loading its replacement;
+                # otherwise two copies of the model must coexist and the
+                # reload is likely to run out of memory.
+                self.pipeline = None
+                # The in-flight traceback keeps the pipeline's stack frames
+                # (and the tensors they reference) alive; drop their locals.
+                traceback.clear_frames(exc.__traceback__)
+                gc.collect()
+                torch.cuda.empty_cache()
+                self.pipeline = load_pipeline(self.path)
+                # Bare raise re-raises the original error with its traceback.
+                raise