chatGLM3-6B-Base

@@ -4,12 +4,11 @@ from PIL import Image
 import requests
 from transformers import AutoModelForCausalLM, LlamaTokenizer
 import torch
-# from accelerate import (
-#     init_empty_weights,
-#     infer_auto_device_map,
-#     load_checkpoint_and_dispatch,
-# )
 class EndpointHandler:
@@ -26,44 +25,44 @@ class EndpointHandler:
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
-        self.model = (
-            AutoModelForCausalLM.from_pretrained(
-                "THUDM/cogvlm-chat-hf",
-                torch_dtype=torch.bfloat16,
-                low_cpu_mem_usage=True,
-                trust_remote_code=True,
-            )
-            .to("cuda")
-            .eval()
-        )
-        # DISTRIBUTED GPUS
-        # with init_empty_weights():
-        #     self.model = AutoModelForCausalLM.from_pretrained(
         #         "THUDM/cogvlm-chat-hf",
         #         torch_dtype=torch.bfloat16,
         #         low_cpu_mem_usage=True,
         #         trust_remote_code=True,
         #     )
-        # device_map = infer_auto_device_map(
-        #     self.model,
-        #     max_memory={
-        #         0: "16GiB",
-        #         1: "16GiB",
-        #         2: "16GiB",
-        #         3: "16GiB",
-        #         "cpu": "180GiB",
-        #     },
-        #     no_split_module_classes="CogVLMDecoderLayer",
-        # )
-        # self.model = load_checkpoint_and_dispatch(
-        #     self.model,
-        #     "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",  # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
-        #     device_map=device_map,
-        #     no_split_module_classes=["CogVLMDecoderLayer"],
         # )
-        # self.model = self.model.eval()
         ## DISTRIBUTED GPUS
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:

 import requests
 from transformers import AutoModelForCausalLM, LlamaTokenizer
 import torch
+from accelerate import (
+    init_empty_weights,
+    infer_auto_device_map,
+    load_checkpoint_and_dispatch,
+)
 class EndpointHandler:
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+        # self.model = (
+        #     AutoModelForCausalLM.from_pretrained(
         #         "THUDM/cogvlm-chat-hf",
         #         torch_dtype=torch.bfloat16,
         #         low_cpu_mem_usage=True,
         #         trust_remote_code=True,
         #     )
+        #     .to("cuda")
+        #     .eval()
         # )
+        # DISTRIBUTED GPUS
+        with init_empty_weights():
+            self.model = AutoModelForCausalLM.from_pretrained(
+                "THUDM/cogvlm-chat-hf",
+                torch_dtype=torch.bfloat16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+        device_map = infer_auto_device_map(
+            self.model,
+            max_memory={
+                0: "16GiB",
+                1: "16GiB",
+                2: "16GiB",
+                3: "16GiB",
+                "cpu": "180GiB",
+            },
+            no_split_module_classes=["CogVLMDecoderLayer"],
+        )
+        self.model = load_checkpoint_and_dispatch(
+            self.model,
+            "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",  # local checkpoint snapshot directory; typically '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/<commit-hash>'
+            device_map=device_map,
+            no_split_module_classes=["CogVLMDecoderLayer"],
+        )
+        self.model = self.model.eval()
         ## DISTRIBUTED GPUS
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: