Marlon Wiprud committed
Commit · 3740af9
1 Parent(s): 693f1b3

sketch: multigpu
handler.py +42 -31
handler.py CHANGED

@@ -4,11 +4,12 @@ from PIL import Image
 import requests
 from transformers import AutoModelForCausalLM, LlamaTokenizer
 import torch
-
-
-
-
-
+
+# from accelerate import (
+#     init_empty_weights,
+#     infer_auto_device_map,
+#     load_checkpoint_and_dispatch,
+# )
 
 
 class EndpointHandler:
@@ -25,35 +26,45 @@ class EndpointHandler:
 
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
 
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
+                "THUDM/cogvlm-chat-hf",
+                torch_dtype=torch.bfloat16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+            .to("cuda")
+            .eval()
+        )
+
+        # DISTRIBUTED GPUS
         # with init_empty_weights():
-        #     self.model = (
-        #         AutoModelForCausalLM.from_pretrained(
-        #             "THUDM/cogvlm-chat-hf",
-        #             torch_dtype=torch.bfloat16,
-        #             low_cpu_mem_usage=True,
-        #             trust_remote_code=True,
-        #         )
-        #         .to("cuda")
-        #         .eval()
+        #     self.model = AutoModelForCausalLM.from_pretrained(
+        #         "THUDM/cogvlm-chat-hf",
+        #         torch_dtype=torch.bfloat16,
+        #         low_cpu_mem_usage=True,
+        #         trust_remote_code=True,
         # )
 
-        device_map = infer_auto_device_map(
-            self.model,
-            max_memory={
-                0: "16GiB",
-                1: "16GiB",
-                2: "16GiB",
-                3: "16GiB",
-                "cpu": "180GiB",
-            },
-            no_split_module_classes="CogVLMDecoderLayer",
-        )
-        self.model = load_checkpoint_and_dispatch(
-            self.model,
-            "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
-            device_map=device_map,
-            no_split_module_classes=["CogVLMDecoderLayer"],
-        )
+        # device_map = infer_auto_device_map(
+        #     self.model,
+        #     max_memory={
+        #         0: "16GiB",
+        #         1: "16GiB",
+        #         2: "16GiB",
+        #         3: "16GiB",
+        #         "cpu": "180GiB",
+        #     },
+        #     no_split_module_classes="CogVLMDecoderLayer",
+        # )
+        # self.model = load_checkpoint_and_dispatch(
+        #     self.model,
+        #     "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",  # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
+        #     device_map=device_map,
+        #     no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+        # self.model = self.model.eval()
+        ## DISTRIBUTED GPUS
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
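The multi-GPU path stays commented out in this commit, and its pieces are spread across the two hunks (the accelerate imports in the first, the device-map sketch in the second). Below is a minimal assembled sketch of that path, not the committed implementation: it assumes four 16 GiB GPUs with CPU RAM as overflow, uses a placeholder snapshot directory in place of the hard-coded EC2 path, and passes no_split_module_classes as a list, since accelerate documents that parameter as a list of class names (the commented infer_auto_device_map call passes a bare string).

import torch
from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_and_dispatch,
)
from transformers import AutoModelForCausalLM

# Build the model skeleton on the meta device so no weight memory is
# allocated up front.
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "THUDM/cogvlm-chat-hf",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

# Plan a placement that never splits a decoder layer across devices.
# Assumption: four 16 GiB GPUs, as in the commented-out sketch above.
device_map = infer_auto_device_map(
    model,
    max_memory={0: "16GiB", 1: "16GiB", 2: "16GiB", 3: "16GiB", "cpu": "180GiB"},
    no_split_module_classes=["CogVLMDecoderLayer"],  # list, not a bare string
)

# Stream the real weights from a local snapshot into the planned placement.
# Placeholder path: substitute your own snapshot directory under
# ~/.cache/huggingface/hub/.
model = load_checkpoint_and_dispatch(
    model,
    "<local-cogvlm-snapshot-dir>",
    device_map=device_map,
    no_split_module_classes=["CogVLMDecoderLayer"],
).eval()

Once weights are dispatched this way, the single `.to("cuda")` call from the active path has to go: the layers already live on several devices, and accelerate's hooks move activations between them during the forward pass.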