# Hugging Face Space · Running on Zero (ZeroGPU) · File size: 2,571 Bytes
import spaces  # ZeroGPU: must precede torch/transformers imports
import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
# Hub checkpoint identifier shared by the processor and the model weights.
MODEL_ID = "openbmb/MiniCPM-V-4.6"

# Both objects are built once at import time; vision_infer reuses them as
# module-level globals on every call.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the weights in bfloat16 and switch to evaluation mode. nn.Module.eval()
# returns the module itself, so the call can be chained onto the load.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16
).eval()
def _to_pil(img) -> Image.Image:
    """Convert a supported image input into an RGB ``PIL.Image.Image``.

    Args:
        img: A PIL image, an object exposing ``__array__`` (for example a
            NumPy array), or a filesystem path given as ``str`` or
            ``os.PathLike``.

    Returns:
        The image in ``"RGB"`` mode.

    Raises:
        TypeError: If ``img`` is none of the supported input kinds.
    """
    import os  # local import: only needed for the path check below

    if isinstance(img, Image.Image):
        return img.convert("RGB")

    if hasattr(img, "__array__"):
        import numpy as np  # local import: only needed for array-like inputs

        # np.asarray hands ndarrays back as-is and otherwise goes through the
        # object's __array__ protocol.
        return Image.fromarray(np.asarray(img)).convert("RGB")

    if isinstance(img, (str, os.PathLike)):
        # Image.open is lazy and keeps the file open. convert() loads the
        # pixels and returns a separate image object, so the file can be
        # closed as soon as the conversion is done.
        with Image.open(img) as opened:
            return opened.convert("RGB")

    raise TypeError(f"Cannot convert {type(img)} to PIL Image")
@spaces.GPU(duration=120)
def vision_infer(
    images,
    instruction: str,
    json_mode: bool = False,
    max_tokens: int = 768,
    do_sample: bool = False,
    temperature: float = 0.7,
) -> str:
    """Run one image+text generation and return the decoded reply.

    Single GPU entrypoint: the module-level ``model`` is moved to CUDA for the
    duration of the call and moved back to the CPU afterwards, even when
    generation raises.

    Args:
        images: One image, or a list/tuple of images. Each item may be
            anything ``_to_pil`` accepts.
        instruction: Text prompt placed after the images in the user turn.
        json_mode: When true, a JSON-only directive is appended to
            ``instruction`` before it is sent to the model.
        max_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        do_sample: Forwarded to ``generate`` as ``do_sample``.
        temperature: Forwarded to ``generate`` only when ``do_sample`` is true.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens are
        stripped) for the single conversation that was submitted.

    Raises:
        TypeError: If an item in ``images`` cannot be converted by ``_to_pil``.
    """
    # Accept tuples as a sequence of images too; any other non-list value is
    # treated as a single image.
    if isinstance(images, tuple):
        images = list(images)
    elif not isinstance(images, list):
        images = [images]

    # Convert and assemble the prompt before touching the GPU so that invalid
    # input fails without paying for a device transfer.
    pil_images = [_to_pil(img) for img in images]

    if json_mode:
        instruction = (
            instruction
            + "\n\nRespond with ONLY valid JSON. No markdown fences, no prose, no explanation."
        )

    # One user turn: every image first, then the text instruction.
    content = [{"type": "image", "image": img} for img in pil_images]
    content.append({"type": "text", "text": instruction})
    messages = [{"role": "user", "content": content}]

    model.to("cuda")
    try:
        # NOTE(review): downsample_mode / max_slice_nums are forwarded as-is;
        # their meaning is defined by the checkpoint's processor — confirm
        # against the model card before changing them.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            downsample_mode="16x",
            max_slice_nums=36,
        ).to(model.device)

        gen_kwargs: dict = {
            "downsample_mode": "16x",
            "max_new_tokens": max_tokens,
            "do_sample": do_sample,
        }
        if do_sample:
            # Temperature is only passed along when sampling is enabled.
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            generated_ids = model.generate(**inputs, **gen_kwargs)

        # Slice off the first len(prompt) ids of each output sequence so only
        # the newly generated tokens are decoded.
        trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Always move the model back to the CPU, including on failure.
        model.to("cpu")
|