caiofabio1
/

GLM-OCR-endpoint

@@ -1,8 +1,14 @@
 import torch
 import base64
 import io
-import re
 from typing import Dict, List, Any
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
@@ -18,11 +24,11 @@ class EndpointHandler:
             trust_remote_code=True,
         ).eval()
         self.device = next(self.model.parameters()).device
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         inputs_data = data.get("inputs", data)
-        # Accept base64 image
         if isinstance(inputs_data, dict):
             image_b64 = inputs_data.get("image", "")
             prompt = inputs_data.get("prompt", "Text Recognition:")
@@ -30,27 +36,22 @@ class EndpointHandler:
             image_b64 = inputs_data
             prompt = "Text Recognition:"
         else:
-            return [{"error": "Invalid input format"}]
-        # Decode image
         try:
             image_bytes = base64.b64decode(image_b64)
             image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         except Exception as e:
             return [{"error": f"Failed to decode image: {str(e)}"}]
-        # Build messages
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ]
-        # Process
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
@@ -59,7 +60,6 @@ class EndpointHandler:
         )
         proc_inputs = {k: v.to(self.device) for k, v in proc_inputs.items()}
-        # Generate
         with torch.no_grad():
             output = self.model.generate(
                 **proc_inputs,

+import subprocess
+import sys
+# Force-install the latest transformers from its GitHub source (with glm_ocr support), plus accelerate
+subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade",
+    "git+https://github.com/huggingface/transformers.git", "accelerate"],
+    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
 import torch
 import base64
 import io
 from typing import Dict, List, Any
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
             trust_remote_code=True,
         ).eval()
         self.device = next(self.model.parameters()).device
+        print(f"Model loaded on {self.device}")
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         inputs_data = data.get("inputs", data)
         if isinstance(inputs_data, dict):
             image_b64 = inputs_data.get("image", "")
             prompt = inputs_data.get("prompt", "Text Recognition:")
             image_b64 = inputs_data
             prompt = "Text Recognition:"
         else:
+            return [{"error": "Invalid input. Send {inputs: {image: base64, prompt: str}}"}]
         try:
             image_bytes = base64.b64decode(image_b64)
             image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         except Exception as e:
             return [{"error": f"Failed to decode image: {str(e)}"}]
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ],
+        }]
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
         )
         proc_inputs = {k: v.to(self.device) for k, v in proc_inputs.items()}
         with torch.no_grad():
             output = self.model.generate(
                 **proc_inputs,