marveljo committed
Commit 039ebfd · verified · 1 parent: 616d85c

Update app.py

Files changed (1): app.py (+27 −13)
app.py CHANGED
@@ -5,8 +5,8 @@ from fastapi.responses import JSONResponse
 from io import BytesIO
 from PIL import Image
 
-# --- Load model and processor ---
 model_id = "HPAI-BSC/Aloe-Vision-7B-AR"
+
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 model = AutoModelForVision2Seq.from_pretrained(
     model_id,
@@ -17,25 +17,39 @@ model = AutoModelForVision2Seq.from_pretrained(
 
 app = FastAPI(title="Aloe Vision 7B AR API")
 
-# --- Inference endpoint ---
 @app.post("/predict")
 async def predict(
-    file: UploadFile = File(...),
-    question: str = Form("What do you see?")
+    file: UploadFile = File(None),
+    question: str = Form(None)
 ):
     try:
-        image = Image.open(BytesIO(await file.read())).convert("RGB")
+        # --- Case 1: both image and text ---
+        if file and question:
+            image = Image.open(BytesIO(await file.read())).convert("RGB")
+            messages = [
+                {"role": "user", "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": question}
+                ]}
+            ]
+
+        # --- Case 2: text only ---
+        elif question and not file:
+            messages = [{"role": "user", "content": [{"type": "text", "text": question}]}]
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": question},
-                ],
-            }
-        ]
+        # --- Case 3: image only ---
+        elif file and not question:
+            image = Image.open(BytesIO(await file.read())).convert("RGB")
+            messages = [
+                {"role": "user", "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": "Describe this image briefly."}
+                ]}
+            ]
+        else:
+            return JSONResponse({"error": "You must provide an image, text, or both."}, status_code=400)
 
+        # --- Process ---
         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs = processor.process_vision_info(messages)
         inputs = processor(text=[text], **image_inputs, return_tensors="pt").to(model.device)
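
For reference, a minimal client sketch against the updated /predict endpoint might look like the following. The host/port (7860 is the usual Spaces default) and the local file name scan.png are assumptions; only the endpoint path and the form fields file and question come from the diff above.

import requests

API_URL = "http://localhost:7860/predict"  # assumed host/port, not confirmed by the commit

# Case 1: image + question
with open("scan.png", "rb") as f:  # hypothetical local image
    r = requests.post(API_URL, files={"file": f}, data={"question": "What do you see?"})
print(r.json())

# Case 2: text only
r = requests.post(API_URL, data={"question": "What is shown in a typical chest X-ray?"})
print(r.json())

# Case 3: image only -- the server falls back to "Describe this image briefly."
with open("scan.png", "rb") as f:
    r = requests.post(API_URL, files={"file": f})
print(r.json())

# Neither input: the server now returns an error payload with HTTP 400
r = requests.post(API_URL)
print(r.status_code, r.json())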