|
|
import torch |
|
|
from transformers import AutoProcessor, LlavaForConditionalGeneration |
|
|
from PIL import Image |
|
|
import base64 |
|
|
from io import BytesIO |
|
|
|
|
|
class EndpointHandler:
    """Inference-endpoint handler serving a LLaVA model loaded from a local path.

    An instance is callable: it takes a request dict and returns either
    ``{"generated_text": ...}`` on success or ``{"error": ...}`` when the
    request cannot be processed.
    """

    _DEFAULT_PROMPT = "Describe the image in detail."
    _DEFAULT_MAX_NEW_TOKENS = 200

    def __init__(self, path=""):
        """Load the processor and the 4-bit quantized model from ``path``.

        Args:
            path: Directory (or model id) handed to ``from_pretrained``.
        """
        print("Loading model and processor from local path...")
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            path,
            load_in_4bit=True,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("✅ Model loaded successfully.")

    def __call__(self, data: dict) -> dict:
        """Run one generation request.

        Args:
            data: Request body. The payload is ``data["inputs"]`` when that key
                exists, otherwise ``data`` itself. The payload may be a plain
                prompt string or a dict with the optional keys ``prompt``,
                ``image_b64`` (base64-encoded image, optionally as a
                ``data:`` URI) and ``max_new_tokens``.

        Returns:
            ``{"generated_text": str}`` on success, or ``{"error": str}`` when
            the payload, ``max_new_tokens`` or the image is invalid.
        """
        # Read with .get() so the caller's request dict is never mutated.
        payload = data.get("inputs", data)
        if isinstance(payload, str):
            # A bare string under "inputs" is treated as the prompt.
            payload = {"prompt": payload}
        elif not isinstance(payload, dict):
            return {"error": "'inputs' must be a JSON object or a prompt string."}

        prompt_text = payload.get("prompt", self._DEFAULT_PROMPT)
        image_b64 = payload.get("image_b64")

        try:
            max_new_tokens = int(
                payload.get("max_new_tokens", self._DEFAULT_MAX_NEW_TOKENS)
            )
        except (TypeError, ValueError, OverflowError):
            return {"error": "max_new_tokens must be a positive integer."}
        if max_new_tokens < 1:
            return {"error": "max_new_tokens must be a positive integer."}

        image = None
        if image_b64:
            try:
                image = self._decode_image(image_b64)
            except Exception as e:
                # Request boundary: any decode/open failure is reported to the
                # client instead of crashing the endpoint.
                return {"error": f"Failed to decode or open base64 image: {e}"}

        inputs = self._build_inputs(prompt_text, image)

        with torch.no_grad():
            output = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # generate() returns prompt tokens followed by new tokens; decode only
        # the new ones rather than splitting the text on "ASSISTANT:", which
        # breaks when the generated text itself contains that marker.
        prompt_length = inputs["input_ids"].shape[1]
        assistant_response = self.processor.decode(
            output[0][prompt_length:], skip_special_tokens=True
        ).strip()

        return {"generated_text": assistant_response}

    @staticmethod
    def _decode_image(image_b64):
        """Decode a base64 (or ``data:`` URI) payload into an RGB PIL image."""
        if isinstance(image_b64, str) and image_b64.startswith("data:"):
            # Drop the "data:<mime>;base64," header; base64 never contains ','.
            image_b64 = image_b64.partition(",")[2]
        image_bytes = base64.b64decode(image_b64)
        # convert() forces the lazy load (so truncated files fail here) and
        # normalizes RGBA / palette / grayscale inputs to three channels.
        return Image.open(BytesIO(image_bytes)).convert("RGB")

    def _build_inputs(self, prompt_text, image):
        """Build model inputs on the model's device for either request type."""
        device = self.model.device

        if image is not None:
            print("Processing multimodal request...")
            prompt = f"USER: <image>\n{prompt_text} ASSISTANT:"
            features = self.processor(text=prompt, images=image, return_tensors="pt")
            # Cast floating tensors (pixel_values) to the float16 dtype the
            # model was loaded with; integer tensors are only moved.
            return features.to(device, torch.float16)

        print("Processing text-only request...")
        prompt = f"USER: {prompt_text} ASSISTANT:"
        # Text-only: tokenize directly and pass no pixel_values. The prompt has
        # no <image> placeholder, so dummy pixel values would have no matching
        # image tokens to merge into.
        return self.processor.tokenizer(prompt, return_tensors="pt").to(device)