import io
import threading

import torch
from fastapi import FastAPI, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# ASGI application object; the route handlers below register themselves on it.
app = FastAPI()

# Hub identifier shared by the processor and the model weights.
model_id = "HPAI-BSC/Aloe-Vision-7B-AR"

# Both objects are created once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",  # device placement is delegated to the library
    dtype=torch.bfloat16,  # weights are loaded in bfloat16
)

# Generation runs in worker threads so the event loop stays responsive; this
# lock keeps it to one request at a time, matching the serialized behaviour
# the endpoint had when generation ran directly on the event loop.
_generation_lock = threading.Lock()


def _describe_image(image: Image.Image) -> str:
    """Return the model's description of *image*.

    Builds a single-turn chat prompt containing the image and the fixed
    instruction, runs generation, and decodes only the newly generated
    tokens (the prompt is not echoed back).

    Args:
        image: The decoded RGB image to describe.

    Returns:
        The generated text with special tokens stripped.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image"},
            ],
        }
    ]
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # The decoded PIL image is handed to the processor through `images=`.
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(
        model.device
    )

    with _generation_lock:
        generated = model.generate(**inputs, max_new_tokens=256)

    # `generate` returns prompt tokens followed by the new tokens; drop the
    # prompt part so the response contains only the model's answer.
    prompt_length = inputs["input_ids"].shape[1]
    answer_tokens = generated[:, prompt_length:]
    return processor.batch_decode(answer_tokens, skip_special_tokens=True)[0]


@app.post("/analyze")
async def analyze(file: UploadFile):
    """Describe an uploaded image with the vision-language model.

    Args:
        file: The uploaded image file.

    Returns:
        A dict ``{"result": <generated description>}``.

    Raises:
        HTTPException: 400 if the upload cannot be decoded as an image.
    """
    payload = await file.read()
    try:
        image = Image.open(io.BytesIO(payload)).convert("RGB")
    except OSError as exc:
        # Pillow signals unreadable, truncated, or non-image data with OSError
        # (UnidentifiedImageError is a subclass); that is a client error.
        raise HTTPException(
            status_code=400, detail="Uploaded file is not a valid image"
        ) from exc

    # Generation is blocking and slow; keep it off the event loop.
    output_text = await run_in_threadpool(_describe_image, image)
    return {"result": output_text}