handraise-dev
/

gguf-inference

syberWolf committed on Jul 4, 2024

Commit

f96aa72

1 Parent(s): c676380

update handler and add flash attention

Files changed (2) hide show

handler.py CHANGED Viewed

@@ -9,8 +9,6 @@ class EndpointHandler:
         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             "microsoft/Phi-3-mini-128k-instruct",
-            torch_dtype=torch.bfloat16,
-            device_map="cuda",
             trust_remote_code=True
         )
         # create inference pipeline
@@ -18,6 +16,11 @@ class EndpointHandler:
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
         # pass inputs with all kwargs in data

         tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
         model = AutoModelForCausalLM.from_pretrained(
             "microsoft/Phi-3-mini-128k-instruct",
             trust_remote_code=True
         )
         # create inference pipeline
     def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         inputs = data.pop("inputs", data)
+        for key in ['stop_sequences', 'watermark', 'stop']:
+            if key in inputs:
+                del inputs[key]
         parameters = data.pop("parameters", None)
         # pass inputs with all kwargs in data

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ flash-attn==latest