zesquirrelnator
/

idefics2-8b-docvqa-finetuned-tutorial

@@ -1,47 +1,56 @@
-import requests
-from typing import Dict, Any
 from PIL import Image
 import torch
-import base64
 from io import BytesIO
-from transformers import AutoProcessor, AutoModelForVision2Seq
-from transformers.image_utils import load_image
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-class EndpointHandler():
-    def __init__(self, path=""):
-        self.processor = AutoProcessor.from_pretrained("zesquirrelnator/idefics2-8b-docvqa-finetuned-tutorial")
-        self.model = AutoModelForVision2Seq.from_pretrained(
-            "zesquirrelnator/idefics2-8b-docvqa-finetuned-tutorial"
-        ).to(device)
-        self.model.eval()
-    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
-        input_data = data.get("inputs", {})
-        encoded_images = input_data.get("images")
-        if not encoded_images:
-            return {"captions": [], "error": "No images provided"}
-        texts = input_data.get("texts", ["move to red ball"] * len(encoded_images))
-        try:
-            raw_images = [Image.open(BytesIO(base64.b64decode(img))).convert("RGB") for img in encoded_images]
-            processed_inputs = [
-                self.processor(image, text, return_tensors="pt") for image, text in zip(raw_images, texts)
-            ]
-            processed_inputs = {
-                "pixel_values": torch.cat([inp["pixel_values"] for inp in processed_inputs], dim=0).to(device),
-                "input_ids": torch.cat([inp["input_ids"] for inp in processed_inputs], dim=0).to(device),
-                "attention_mask": torch.cat([inp["attention_mask"] for inp in processed_inputs], dim=0).to(device)
-            }
-            with torch.no_grad():
-                out = self.model.generate(**processed_inputs)
-            captions = self.processor.batch_decode(out, skip_special_tokens=True)
-            return {"captions": captions}
-        except Exception as e:
-            print(f"Error during processing: {str(e)}")
-            return {"captions": [], "error": str(e)}

+from transformers import AutoModelForCausalLM, AutoTokenizer
 from PIL import Image
 import torch
 from io import BytesIO
+import base64
+# Initialize the model and tokenizer
+model_id = "HuggingFaceM4/idefics2-8b"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Check if CUDA (GPU support) is available and then set the device to GPU or CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model.to(device)
+def preprocess_image(encoded_image):
+    """Decode and preprocess the input image."""
+    decoded_image = base64.b64decode(encoded_image)
+    img = Image.open(BytesIO(decoded_image)).convert("RGB")
+    return img
+def handler(event, context):
+    """Handle the incoming request."""
+    try:
+        # Extract the base64-encoded image and question from the event
+        input_image = event['body']['image']
+        question = event['body'].get('question', "What is this image about?")
+        # Preprocess the image
+        img = preprocess_image(input_image)
+        # Perform inference
+        enc_image = model.encode_image(img).to(device)
+        answer = model.answer_question(enc_image, question, tokenizer)
+        # If the output is a tensor, move it back to CPU and convert to list
+        if isinstance(answer, torch.Tensor):
+            answer = answer.cpu().numpy().tolist()
+        # Create the response
+        response = {
+            "statusCode": 200,
+            "body": {
+                "answer": answer
+            }
+        }
+        return response
+    except Exception as e:
+        # Handle any errors
+        response = {
+            "statusCode": 500,
+            "body": {
+                "error": str(e)
+            }
+        }
+        return response