QuanSun commited on
Commit
c193dc4
·
1 Parent(s): 827d192

add README.md

Browse files
Files changed (1) hide show
  1. README.md +256 -0
README.md ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ ---
5
+
6
+ [Demo](https://huggingface.co/spaces/BAAI/Emu2) | [Project Page](https://baaivision.github.io/emu2/)
7
+
8
+ ## Model Weights
9
+
10
+ | Model name | Weight |
11
+ | ------------------ | ------------------------------------------------------- |
12
+ | **Emu2** | [🤗 HF link](https://huggingface.co/BAAI/Emu2) |
13
+ | **Emu2-Chat** | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Chat) |
14
+ | **Emu2-Gen** | [🤗 HF link](https://huggingface.co/BAAI/Emu2-Gen) |
15
+
16
+
17
+ ## Inference (Hugging Face Version)
18
+
19
+ ### Emu2 & Emu2-Chat
20
+ #### Single GPU
21
+
22
+ ```python
23
+ from PIL import Image
24
+ import requests
25
+ import torch
26
+ from transformers import AutoModelForCausalLM, AutoTokenizer
27
+
28
+
29
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")
30
+
31
+ model = AutoModelForCausalLM.from_pretrained(
32
+ "BAAI/Emu2-Chat",
33
+ torch_dtype=torch.bfloat16,
34
+ low_cpu_mem_usage=True,
35
+ trust_remote_code=True).to('cuda').eval()
36
+
37
+
38
+ # `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
39
+ # the number of `[<IMG_PLH>]` should be equal to the number of input images
40
+
41
+ query = '[<IMG_PLH>]Describe the image in details:'
42
+ image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB')
43
+
44
+
45
+ inputs = model.build_input_ids(
46
+ text=[query],
47
+ tokenizer=tokenizer,
48
+ image=[image]
49
+ )
50
+
51
+ with torch.no_grad():
52
+ outputs = model.generate(
53
+ input_ids=inputs["input_ids"],
54
+ attention_mask=inputs["attention_mask"],
55
+ image=inputs["image"].to(torch.bfloat16),
56
+ max_new_tokens=64,
57
+ length_penalty=-1)
58
+
59
+ output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
60
+ ```
61
+
62
+ Interleaved image and text
63
+
64
+ ```python
65
+ from PIL import Image
66
+ import requests
67
+ import torch
68
+ from transformers import AutoModelForCausalLM, AutoTokenizer
69
+
70
+
71
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")
72
+
73
+ model = AutoModelForCausalLM.from_pretrained(
74
+ "BAAI/Emu2-Chat",
75
+ torch_dtype=torch.bfloat16,
76
+ low_cpu_mem_usage=True,
77
+ trust_remote_code=True).to('cuda').eval()
78
+
79
+ # `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
80
+ # the number of `[<IMG_PLH>]` should be equal to the number of input images
81
+
82
+ query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top right].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"
83
+
84
+ images = [
85
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/red_white_3_bottom_left.jpg?raw=true',stream=True).raw).convert('RGB'),
86
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/yellow_white_2_top_right.jpg?raw=true',stream=True).raw).convert('RGB'),
87
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/green_black_4_bottom_right.jpg?raw=true',stream=True).raw).convert('RGB'),
88
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB'),
89
+ ]
90
+
91
+ inputs = model.build_input_ids(
92
+ text=[query],
93
+ tokenizer=tokenizer,
94
+ image=images
95
+
96
+ )
97
+
98
+ with torch.no_grad():
99
+ outputs = model.generate(
100
+ input_ids=inputs["input_ids"],
101
+ attention_mask=inputs["attention_mask"],
102
+ image=inputs["image"].to(torch.bfloat16),
103
+ max_new_tokens=64,
104
+ length_penalty=-1)
105
+
106
+ output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
107
+ ```
108
+
109
+ #### Multi GPU
110
+
111
+
112
+ ```python
113
+ from PIL import Image
114
+ import requests
115
+ import torch
116
+ from transformers import AutoModelForCausalLM, AutoTokenizer
117
+ from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
118
+
119
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")
120
+
121
+ with init_empty_weights():
122
+ model = AutoModelForCausalLM.from_pretrained(
123
+ "BAAI/Emu2-Chat",
124
+ torch_dtype=torch.bfloat16,
125
+ low_cpu_mem_usage=True,
126
+ trust_remote_code=True)
127
+
128
+ device_map = infer_auto_device_map(model, max_memory={0:'38GiB',1:'38GiB',}, no_split_module_classes=['Block','LlamaDecoderLayer'])
129
+ # input and output logits should be on same device
130
+ device_map["model.decoder.lm.lm_head"] = 0
131
+
132
+ model = load_checkpoint_and_dispatch(
133
+ model,
134
+ 'local/path/to/hf/version/Emu2-Chat/model',
135
+ device_map=device_map).eval()
136
+
137
+ # `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
138
+ # the number of `[<IMG_PLH>]` should be equal to the number of input images
139
+
140
+ query = '[<IMG_PLH>]Describe the image in details:'
141
+ image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB')
142
+
143
+ inputs = model.build_input_ids(
144
+ text=[query],
145
+ tokenizer=tokenizer,
146
+ image=[image]
147
+
148
+ )
149
+
150
+ with torch.no_grad():
151
+ outputs = model.generate(
152
+ input_ids=inputs["input_ids"],
153
+ attention_mask=inputs["attention_mask"],
154
+ image=inputs["image"].to(torch.bfloat16),
155
+ max_new_tokens=64,
156
+ length_penalty=-1)
157
+
158
+ output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
159
+ ```
160
+
161
+ Interleaved image and text
162
+
163
+ ```python
164
+ from PIL import Image
165
+ import requests
166
+ import torch
167
+ from transformers import AutoModelForCausalLM, AutoTokenizer
168
+ from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
169
+
170
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")
171
+
172
+ with init_empty_weights():
173
+ model = AutoModelForCausalLM.from_pretrained(
174
+ "BAAI/Emu2-Chat",
175
+ torch_dtype=torch.bfloat16,
176
+ low_cpu_mem_usage=True,
177
+ trust_remote_code=True)
178
+
179
+ device_map = infer_auto_device_map(model, max_memory={0:'38GiB',1:'38GiB',}, no_split_module_classes=['Block','LlamaDecoderLayer'])
180
+ # input and output logits should be on same device
181
+ device_map["model.decoder.lm.lm_head"] = 0
182
+
183
+ model = load_checkpoint_and_dispatch(
184
+ model,
185
+ 'local/path/to/hf/version/Emu2-Chat/model',
186
+ device_map=device_map).eval()
187
+
188
+ # `[<IMG_PLH>]` is the image placeholder which will be replaced by image embeddings.
189
+ # the number of `[<IMG_PLH>]` should be equal to the number of input images
190
+ query = "[<IMG_PLH>][red, white, 3, bottom left].[<IMG_PLH>][yellow, white, 2, top right].[<IMG_PLH>][green, black, 4, bottom right][<IMG_PLH>]"
191
+
192
+ images = [
193
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/red_white_3_bottom_left.jpg?raw=true',stream=True).raw).convert('RGB'),
194
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/yellow_white_2_top_right.jpg?raw=true',stream=True).raw).convert('RGB'),
195
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/green_black_4_bottom_right.jpg?raw=true',stream=True).raw).convert('RGB'),
196
+ Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB'),
197
+ ]
198
+
199
+ inputs = model.build_input_ids(
200
+ text=[query],
201
+ tokenizer=tokenizer,
202
+ image=images
203
+
204
+ )
205
+
206
+ with torch.no_grad():
207
+ outputs = model.generate(
208
+ input_ids=inputs["input_ids"],
209
+ attention_mask=inputs["attention_mask"],
210
+ image=inputs["image"].to(torch.bfloat16),
211
+ max_new_tokens=64,
212
+ length_penalty=-1)
213
+
214
+ output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
215
+ ```
216
+
217
+ #### Quantization
218
+
219
+ Check quantization guidance at [transformers](https://huggingface.co/docs/transformers/v4.28.0/main_classes/quantization)
220
+
221
+
222
+ ```python
223
+ from PIL import Image
224
+ import requests
225
+ import torch
226
+ from transformers import AutoModelForCausalLM, AutoTokenizer
227
+
228
+
229
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/Emu2-Chat")
230
+
231
+ model = AutoModelForCausalLM.from_pretrained(
232
+ "BAAI/Emu2-Chat",
233
+ load_in_4bit=True,
234
+ trust_remote_code=True,
235
+ bnb_4bit_compute_dtype=torch.float16).eval()
236
+
237
+ query = '[<IMG_PLH>]Describe the image in details:'
238
+ image = Image.open(requests.get('https://github.com/baaivision/Emu/blob/main/Emu2/examples/blue_black_1_top_left.jpg?raw=true',stream=True).raw).convert('RGB')
239
+
240
+ inputs = model.build_input_ids(
241
+ text=[query],
242
+ tokenizer=tokenizer,
243
+ image=[image]
244
+
245
+ )
246
+
247
+ with torch.no_grad():
248
+ outputs = model.generate(
249
+ input_ids=inputs["input_ids"],
250
+ attention_mask=inputs["attention_mask"],
251
+ image=inputs["image"].to(torch.float16), # should be torch.float16
252
+ max_new_tokens=64,
253
+ length_penalty=-1)
254
+
255
+ output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
256
+ ```