handraise-dev
/

gguf-inference

Text Generation

Model card Files Files and versions

syberWolf committed on Jul 4, 2024

Commit

eff3ac4

·

1 Parent(s): e8628b3

update handler

Files changed (1) hide show

handler.py +18 -15

handler.py CHANGED Viewed

@@ -4,34 +4,37 @@ import torch
 class EndpointHandler:
     def __init__(self, path=""):
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        # load the model
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
         model = AutoModelForCausalLM.from_pretrained(
             "Qwen/Qwen2-1.5B-Instruct",
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-            device_map="auto"
         )
-        # create inference pipeline without specifying the device
-        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
-    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
-        # Ensure inputs are on the GPU if available
         if isinstance(inputs, str):
             inputs = [inputs]
-        # Tensor input handling
-        try:
-            inputs = torch.tensor(inputs).cuda() if torch.cuda.is_available() else torch.tensor(inputs)
-        except:
-            pass  # If inputs are not tensors (e.g., strings), continue without conversion
-        # pass inputs with all kwargs in data
         prediction = self.pipeline(inputs, **parameters)
         return prediction

 class EndpointHandler:
     """Text-generation request handler built on a ``transformers`` pipeline.

     Constructing the handler loads the tokenizer and model named by
     ``MODEL_ID`` and wraps them in a ``"text-generation"`` pipeline; calling
     the instance forwards a request payload to that pipeline.
     """

     # Checkpoint used for both the tokenizer and the model.
     MODEL_ID = "Qwen/Qwen2-1.5B-Instruct"

     def __init__(self, path=""):
         """Load the tokenizer and model and create the generation pipeline.

         Args:
             path: Accepted so the constructor signature stays unchanged; it
                 is not read, because the checkpoint is always loaded by
                 ``MODEL_ID``.
         """
         use_gpu = torch.cuda.is_available()
         # Pipeline device index: 0 is the first GPU, -1 is the CPU.
         device = 0 if use_gpu else -1

         tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
         # No ``device_map`` is passed here. The pipeline below is the single
         # place that decides placement: a hard-coded ``device_map="cuda"``
         # cannot work on a CPU-only host, and a model loaded with a
         # ``device_map`` is not meant to be combined with an explicit
         # pipeline ``device``.
         model = AutoModelForCausalLM.from_pretrained(
             self.MODEL_ID,
             # Half precision on GPU, full precision on CPU.
             torch_dtype=torch.float16 if use_gpu else torch.float32,
         )
         self.pipeline = pipeline(
             "text-generation",
             model=model,
             tokenizer=tokenizer,
             device=device,
         )

     def __call__(self, data: Any) -> List[List[Dict[str, Any]]]:
         """Run generation for one request payload.

         Args:
             data: Mapping holding the request. ``"inputs"`` (a string or a
                 list) and the optional ``"parameters"`` mapping are popped
                 from it, so the caller's mapping is modified. When
                 ``"inputs"`` is absent the mapping itself is used as the
                 input.

         Returns:
             Whatever ``self.pipeline`` returns for the inputs.
         """
         inputs = data.pop("inputs", data)
         # A JSON ``null`` for "parameters" arrives as None; treat it like a
         # missing key instead of failing on ``**None``.
         parameters = data.pop("parameters", None) or {}
         # Always hand the pipeline a batch, even for a single prompt.
         if isinstance(inputs, str):
             inputs = [inputs]
         return self.pipeline(inputs, **parameters)


 # Example usage
 if __name__ == "__main__":
     handler = EndpointHandler()
     data = {
         "inputs": "Hello, how can I",
         "parameters": {"max_length": 50, "num_return_sequences": 1},
     }
     result = handler(data)
     print(result)