Upload modeling_mplug_owl2.py with huggingface_hub
Browse files
- modeling_mplug_owl2.py +2 -1
modeling_mplug_owl2.py
CHANGED
|
@@ -282,9 +282,10 @@ class MPLUGOwl2LlamaForCausalLM(LlamaForCausalLM, MPLUGOwl2MetaForCausalLM):
|
|
| 282 |
return torch.softmax(output_logits, -1) @ self.weight_tensor
|
| 283 |
else:
|
| 284 |
video = [[expand2square(frame, tuple(int(x*255) for x in self.image_processor.image_mean)) for frame in vid] for vid in images]
|
|
|
|
| 285 |
with torch.inference_mode():
|
| 286 |
video_tensors = [self.image_processor.preprocess(vid, return_tensors="pt")["pixel_values"].half().to(self.model.device) for vid in video]
|
| 287 |
-
output_logits = self(
|
| 288 |
images=video_tensors)["logits"][:,-1, self.preferential_ids_]
|
| 289 |
return torch.softmax(output_logits, -1) @ self.weight_tensor
|
| 290 |
|
|
|
|
| 282 |
return torch.softmax(output_logits, -1) @ self.weight_tensor
|
| 283 |
else:
|
| 284 |
video = [[expand2square(frame, tuple(int(x*255) for x in self.image_processor.image_mean)) for frame in vid] for vid in images]
|
| 285 |
+
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
|
| 286 |
with torch.inference_mode():
|
| 287 |
video_tensors = [self.image_processor.preprocess(vid, return_tensors="pt")["pixel_values"].half().to(self.model.device) for vid in video]
|
| 288 |
+
output_logits = self(input_ids.repeat(len(video_tensors), 1),
|
| 289 |
images=video_tensors)["logits"][:,-1, self.preferential_ids_]
|
| 290 |
return torch.softmax(output_logits, -1) @ self.weight_tensor
|
| 291 |
|