# colt12/maxcushion

import io

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVisionEncoderDecoder
# Load the processor and the model once, at import time, so every call to
# predict() reuses the same objects. Importing this module therefore runs
# both from_pretrained() calls.
# NOTE(review): confirm that the installed transformers release exports
# AutoModelForVisionEncoderDecoder; if it does not, the import at the top of
# the file fails before this point is reached.
model_name = "colt12/maxcushion"  # checkpoint identifier passed to from_pretrained()
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVisionEncoderDecoder.from_pretrained(model_name)
def predict(image_bytes: bytes) -> str:
    """Generate a caption for an encoded image.

    Args:
        image_bytes: The raw bytes of an image file that PIL can open.

    Returns:
        The first caption decoded from the model output, with special
        tokens removed.

    Raises:
        Any exception raised by PIL, the processor or the model propagates
        to the caller unchanged.
    """
    # Image.open() is lazy and keeps the underlying stream open. convert()
    # forces a full decode and returns an independent image, so the source
    # can be closed as soon as the conversion is done.
    with Image.open(io.BytesIO(image_bytes)) as source:
        # Normalise grayscale / palette / RGBA inputs to three channels.
        # NOTE(review): assumes the processor expects RGB input - confirm.
        image = source.convert("RGB")

    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    # max_length bounds the length of the generated token sequence.
    generated_ids = model.generate(pixel_values, max_length=50)

    # One image goes in, so only the first decoded sequence is returned.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
def run(raw_image: bytes) -> dict:
    """Caption an image and report the outcome as a dictionary.

    Args:
        raw_image: The encoded image bytes handed on to predict().

    Returns:
        ``{"caption": <text>}`` when captioning succeeds, or
        ``{"error": <message>}`` when predict() raises.

    Raises:
        ValueError: If ``raw_image`` is empty or otherwise falsy.
    """
    import logging  # local import: keeps this block independent of the file header

    # A missing payload is a caller error, so it is raised rather than being
    # folded into the error dictionary.
    if not raw_image:
        raise ValueError("No image provided")

    try:
        result = predict(raw_image)
    except Exception as e:
        # Callers only receive the message, so log the traceback here;
        # otherwise it would be lost.
        logging.getLogger(__name__).exception("Caption generation failed")
        return {"error": str(e)}
    return {"caption": result}