Marlon Wiprud committed on
Commit
4fbf700
·
1 Parent(s): 8b33989

sketch: multi-gpu

Browse files
Files changed (1) hide show
  1. handler.py +32 -8
handler.py CHANGED
@@ -4,6 +4,11 @@ from PIL import Image
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
 
 
 
 
 
7
 
8
 
9
  class EndpointHandler:
@@ -20,16 +25,35 @@ class EndpointHandler:
20
 
21
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
22
 
23
- self.model = (
24
- AutoModelForCausalLM.from_pretrained(
25
- "THUDM/cogvlm-chat-hf",
26
- torch_dtype=torch.bfloat16,
27
- low_cpu_mem_usage=True,
28
- trust_remote_code=True,
 
 
 
 
29
  )
30
- .to("cuda")
31
- .eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  )
 
33
 
34
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
35
  """
 
4
  import requests
5
  from transformers import AutoModelForCausalLM, LlamaTokenizer
6
  import torch
7
+ from accelerate import (
8
+ init_empty_weights,
9
+ infer_auto_device_map,
10
+ load_checkpoint_and_dispatch,
11
+ )
12
 
13
 
14
  class EndpointHandler:
 
25
 
26
  self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
27
 
28
+ with init_empty_weights():
29
+ self.model = (
30
+ AutoModelForCausalLM.from_pretrained(
31
+ "THUDM/cogvlm-chat-hf",
32
+ torch_dtype=torch.bfloat16,
33
+ low_cpu_mem_usage=True,
34
+ trust_remote_code=True,
35
+ )
36
+ .to("cuda")
37
+ .eval()
38
  )
39
+
40
+ device_map = infer_auto_device_map(
41
+ model,
42
+ max_memory={
43
+ 0: "16GiB",
44
+ 1: "16GiB",
45
+ 2: "16GiB",
46
+ 3: "16GiB",
47
+ "cpu": "180GiB",
48
+ },
49
+ no_split_module_classes="CogVLMDecoderLayer",
50
+ )
51
+ self.model = load_checkpoint_and_dispatch(
52
+ model,
53
+ "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots", # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
54
+ device_map=device_map,
55
  )
56
+ model = model.eval()
57
 
58
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
59
  """