MultiModalModel / app.py
marveljo's picture
Update app.py
7d0535a verified
raw
history blame
1.11 kB
from fastapi import FastAPI, UploadFile
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
import io
# Hugging Face Hub identifier of the vision-language checkpoint served by this app.
model_id = "HPAI-BSC/Aloe-Vision-7B-AR"

# Processor and model are created once at import time and shared by all requests.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
    dtype=torch.bfloat16,
)

# ASGI application object that the route decorators below attach to.
app = FastAPI()
@app.post("/analyze")
async def analyze(file: UploadFile):
    """Describe an uploaded image using the module-level processor and model.

    Args:
        file: Uploaded file whose bytes are decoded with Pillow and converted
            to RGB before being handed to the processor.

    Returns:
        A dict ``{"result": <text>}`` containing only the newly generated
        text; the prompt tokens are stripped before decoding.

    Raises:
        HTTPException: With status 400 when the upload cannot be decoded as
            an image.
    """
    # Function-scope import so this handler does not depend on changes to
    # the file's import block; fastapi is already a dependency of the file.
    from fastapi import HTTPException

    payload = await file.read()
    try:
        image = Image.open(io.BytesIO(payload)).convert("RGB")
    except OSError as err:
        # Pillow's UnidentifiedImageError is an OSError subclass; truncated or
        # corrupt files also raise OSError while decoding in convert().
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a valid image",
        ) from err

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # The decoded PIL image is passed straight to the processor.
    # `process_vision_info` is a helper from the separate `qwen_vl_utils`
    # package rather than a processor method, so it is not called here.
    inputs = processor(text=[text], images=[image], return_tensors="pt").to(
        model.device
    )

    # NOTE(review): generate() is a synchronous call inside an async handler,
    # so other requests wait while it runs — consider offloading if needed.
    generated = model.generate(**inputs, max_new_tokens=256)

    # generate() returns prompt + completion; keep only the completion so the
    # response does not echo the chat template and instruction text.
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = generated[:, prompt_length:]
    output_text = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]
    return {"result": output_text}