handraise-dev
/

gguf-inference

Text Generation

Model card Files Files and versions

syberWolf committed on Jul 4, 2024

Commit

e126c73

·

1 Parent(s): 52afa01

update endpoint

Files changed (1) hide show

handler.py +4 -6

handler.py CHANGED Viewed

@@ -4,18 +4,16 @@ import torch
 class EndpointHandler:
     def __init__(self, path=""):
-        device = 0 if torch.cuda.is_available() else -1  # 0 for GPU, -1 for CPU
         # Load the model
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
         model = AutoModelForCausalLM.from_pretrained(
             "Qwen/Qwen2-1.5B-Instruct",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-            device_map="cuda" # for single instance one GPU
         )
-        # Create inference pipeline with the correct device
-        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
     def __call__(self, data: Any) -> List[List[Dict[str, Any]]]:
         inputs = data.pop("inputs", data)

 class EndpointHandler:
     def __init__(self, path=""):
         # Load the model
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct")
         model = AutoModelForCausalLM.from_pretrained(
             "Qwen/Qwen2-1.5B-Instruct",
             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            device_map="cuda" if torch.cuda.is_available() else "auto" # Include device_map for correct device allocation
         )
+        # Create inference pipeline without specifying the device
+        self.pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
     def __call__(self, data: Any) -> List[List[Dict[str, Any]]]:
         inputs = data.pop("inputs", data)