Marlon Wiprud committed
Commit 2e8c95f · Parent(s): a894d5f

single gpu

handler.py CHANGED: +35 -35
@@ -40,48 +40,48 @@ class EndpointHandler:
 
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
 
-        # self.model = (
-        # AutoModelForCausalLM.from_pretrained(
-        # "THUDM/cogvlm-chat-hf",
-        # torch_dtype=torch.bfloat16,
-        # low_cpu_mem_usage=True,
-        # trust_remote_code=True,
-        # )
-        # .to("cuda")
-        # .eval()
-        # )
-
-        # DISTRIBUTED GPUS
-        with init_empty_weights():
-            self.model = AutoModelForCausalLM.from_pretrained(
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
                 "THUDM/cogvlm-chat-hf",
                 torch_dtype=torch.bfloat16,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
             )
-
-        # print("LISTING FILES IN ", "/root/.cache/huggingface")
-        # list_files("/root/.cache/huggingface", 0, 5)
-
-        device_map = infer_auto_device_map(
-            self.model,
-            max_memory={
-                0: "12GiB",
-                1: "12GiB",
-                2: "12GiB",
-                3: "12GiB",
-                "cpu": "180GiB",
-            },
-            no_split_module_classes=["CogVLMDecoderLayer"],
+            .to("cuda")
+            .eval()
         )
 
-        self.model = load_checkpoint_and_dispatch(
-            self.model,
-            "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
-            device_map=device_map,
-            no_split_module_classes=["CogVLMDecoderLayer"],
-        )
-        self.model = self.model.eval()
+        # DISTRIBUTED GPUS
+        # with init_empty_weights():
+        # self.model = AutoModelForCausalLM.from_pretrained(
+        # "THUDM/cogvlm-chat-hf",
+        # torch_dtype=torch.bfloat16,
+        # low_cpu_mem_usage=True,
+        # trust_remote_code=True,
+        # )
+
+        # # print("LISTING FILES IN ", "/root/.cache/huggingface")
+        # # list_files("/root/.cache/huggingface", 0, 5)
+
+        # device_map = infer_auto_device_map(
+        # self.model,
+        # max_memory={
+        # 0: "12GiB",
+        # 1: "12GiB",
+        # 2: "12GiB",
+        # 3: "12GiB",
+        # "cpu": "180GiB",
+        # },
+        # no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+
+        # self.model = load_checkpoint_and_dispatch(
+        # self.model,
+        # "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
+        # device_map=device_map,
+        # no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+        # self.model = self.model.eval()
         ## DISTRIBUTED GPUS
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
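For context, the path this commit comments out is the standard accelerate big-model-inference recipe: materialize the module graph on the meta device, plan a per-device placement, then stream the checkpoint weights into place. The sketch below reassembles the commented-out lines into runnable form; the four 12 GiB GPUs, the 180 GiB CPU budget, and the hard-coded snapshot path are taken verbatim from the diff and assume the same machine and Hugging Face cache layout.

import torch
from accelerate import infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
from transformers import AutoModelForCausalLM

# Build the module structure without allocating any weight memory
# (every parameter lives on the meta device).
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "THUDM/cogvlm-chat-hf",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

# Plan the placement. no_split_module_classes keeps each CogVLMDecoderLayer
# on one device so a transformer block is never split across GPUs; whatever
# exceeds the per-GPU budgets spills over to CPU RAM.
device_map = infer_auto_device_map(
    model,
    max_memory={0: "12GiB", 1: "12GiB", 2: "12GiB", 3: "12GiB", "cpu": "180GiB"},
    no_split_module_classes=["CogVLMDecoderLayer"],
)

# Stream the cached checkpoint from disk, sending each weight to the device
# the plan assigned it.
model = load_checkpoint_and_dispatch(
    model,
    "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
    device_map=device_map,
    no_split_module_classes=["CogVLMDecoderLayer"],
).eval()

When the model fits on a single card, the from_pretrained(...).to("cuda").eval() path the commit switches to needs none of this dispatch machinery, at the cost of requiring the full bf16 model to fit in one GPU's memory.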