Marlon Wiprud committed on
Commit
4fbf700
·
1 Parent(s): 8b33989

sketch: multi-gpu

Browse files
Files changed (1) hide show
  1. handler.py +32 -8
handler.py CHANGED
@@ -4,6 +4,11 @@ from PIL import Image
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
 
 
 
 
 
7
 
8
 
9
  class EndpointHandler:
@@ -20,16 +25,35 @@ class EndpointHandler:
20
 
21
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
22
 
23
- self.model = (
24
- AutoModelForCausalLM.from_pretrained(
25
- "THUDM/cogvlm-chat-hf",
26
- torch_dtype=torch.bfloat16,
27
- low_cpu_mem_usage=True,
28
- trust_remote_code=True,
 
 
 
 
29
  )
30
- .to("cuda")
31
- .eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  )
 
33
 
34
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
35
  """
 
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
7
+ from accelerate import (
8
+ init_empty_weights,
9
+ infer_auto_device_map,
10
+ load_checkpoint_and_dispatch,
11
+ )
12
 
13
 
14
  class EndpointHandler:
 
25
 
26
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
27
 
28
+ with init_empty_weights():
29
+ self.model = (
30
+ AutoModelForCausalLM.from_pretrained(
31
+ "THUDM/cogvlm-chat-hf",
32
+ torch_dtype=torch.bfloat16,
33
+ low_cpu_mem_usage=True,
34
+ trust_remote_code=True,
35
+ )
36
+ .to("cuda")
37
+ .eval()
38
  )
39
+
40
+ device_map = infer_auto_device_map(
41
+ model,
42
+ max_memory={
43
+ 0: "16GiB",
44
+ 1: "16GiB",
45
+ 2: "16GiB",
46
+ 3: "16GiB",
47
+ "cpu": "180GiB",
48
+ },
49
+ no_split_module_classes="CogVLMDecoderLayer",
50
+ )
51
+ self.model = load_checkpoint_and_dispatch(
52
+ model,
53
+ "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots", # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
54
+ device_map=device_map,
55
  )
56
+ model = model.eval()
57
 
58
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
59
  """