File size: 2,571 Bytes
98ceb88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e70669
 
98ceb88
 
 
 
 
cd2ed90
 
 
98ceb88
cd2ed90
 
 
 
 
98ceb88
cd2ed90
 
98ceb88
cd2ed90
98ceb88
cd2ed90
 
 
 
 
 
98ceb88
cd2ed90
 
 
3e70669
 
 
 
 
 
 
 
cd2ed90
3e70669
98ceb88
cd2ed90
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import spaces  # ZeroGPU: must precede torch/transformers imports

import torch
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

# Hub repository id shared by the processor and the model weights.
MODEL_ID = "openbmb/MiniCPM-V-4.6"

# Both objects are built once, at import time. No device is selected here:
# vision_infer() moves the model to CUDA for each call and back afterwards.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16
).eval()  # evaluation mode: this module only runs inference


def _to_pil(img) -> Image.Image:
    """Coerce a supported image input into an RGB ``PIL.Image.Image``.

    Inputs are checked in this order:

    * a PIL image: converted to RGB;
    * any object exposing ``__array__`` (for example a NumPy array): turned
      into an array and handed to ``Image.fromarray``;
    * a ``str`` or ``os.PathLike`` filesystem path: opened from disk.

    Args:
        img: The image-like value to convert.

    Returns:
        A new RGB image.

    Raises:
        TypeError: If ``img`` is none of the supported kinds.
    """
    # Function-scope import, matching the lazy NumPy import below.
    import os

    if isinstance(img, Image.Image):
        return img.convert("RGB")
    if hasattr(img, "__array__"):
        # Imported lazily so NumPy is only needed when array inputs are used.
        import numpy as np

        arr = img if isinstance(img, np.ndarray) else img.__array__()
        return Image.fromarray(arr).convert("RGB")
    if isinstance(img, (str, os.PathLike)):
        # Image.open() is lazy and keeps the file open; convert() yields a
        # separate, fully loaded image, so the handle is released right away
        # instead of waiting for garbage collection.
        with Image.open(img) as opened:
            return opened.convert("RGB")
    raise TypeError(f"Cannot convert {type(img)} to PIL Image")


@spaces.GPU(duration=120)
def vision_infer(
    images,
    instruction: str,
    json_mode: bool = False,
    max_tokens: int = 768,
    do_sample: bool = False,
    temperature: float = 0.7,
) -> str:
    """Run one image+text generation request on the GPU and return the reply.

    Args:
        images: A single image or a list/tuple of images. Each item may be
            anything ``_to_pil`` accepts (PIL image, array-like, or path).
        instruction: The text prompt placed after the images in the user turn.
        json_mode: When true, a fixed "respond with ONLY valid JSON" suffix
            is appended to ``instruction``.
        max_tokens: Passed to ``generate`` as ``max_new_tokens``.
        do_sample: Passed to ``generate`` as ``do_sample``.
        temperature: Forwarded to ``generate`` only when ``do_sample`` is
            true; ignored otherwise.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens are
        stripped, special tokens skipped) for the single request.

    Raises:
        TypeError: If an item in ``images`` cannot be converted by ``_to_pil``.
    """
    # A tuple is a sequence of images too; anything else is a single image.
    if not isinstance(images, (list, tuple)):
        images = [images]

    # Do all CPU-only preparation first so invalid input fails before the
    # model is moved to the GPU.
    pil_images = [_to_pil(img) for img in images]

    if json_mode:
        instruction = (
            instruction
            + "\n\nRespond with ONLY valid JSON. No markdown fences, no prose, no explanation."
        )

    # One user turn: every image first, then the text instruction.
    content = [{"type": "image", "image": img} for img in pil_images]
    content.append({"type": "text", "text": instruction})
    messages = [{"role": "user", "content": content}]

    # NOTE(review): "downsample_mode" is a model-specific option forwarded
    # as-is to both the processor and generate() -- confirm against the
    # model card before changing it.
    gen_kwargs: dict = {
        "downsample_mode": "16x",
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature

    try:
        # Inside the try so the finally clause restores the model to CPU
        # even if the transfer itself fails part-way.
        model.to("cuda")

        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
            downsample_mode="16x",
            max_slice_nums=36,
        ).to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, **gen_kwargs)

        # generate() returns prompt + completion; keep only the completion.
        trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        model.to("cpu")