BoghdadyJR committed (verified)
Commit 1471372 · Parent: ee1fa58

Add inference handler for HF Endpoints

Files changed (2):
  1. handler.py +95 -68
  2. requirements.txt +6 -5
handler.py CHANGED
@@ -1,20 +1,43 @@
 from typing import Dict, List, Any
-from unsloth import FastVisionModel
+from transformers import AutoModelForVision2Seq, AutoTokenizer, AutoProcessor
 from PIL import Image
 import torch
+import io
+import base64
+from peft import PeftModel
 
 class EndpointHandler():
-    def __init__(self, path=""):
-        # Load model and tokenizer
-        self.model, self.tokenizer = FastVisionModel.from_pretrained(
-            path,
+    def __init__(self):
+        self.path = "BoghdadyJR/Qwen_UI_final"
+        # Load base model and tokenizer
+        base_model_id = "Qwen/Qwen2-VL-2B-Instruct"
+
+        # Load tokenizer/processor
+        self.processor = AutoProcessor.from_pretrained(
+            self.path,
+            trust_remote_code=True
+        )
+
+        # Load base model
+        self.model = AutoModelForVision2Seq.from_pretrained(
+            base_model_id,
+            torch_dtype=torch.float16,
             device_map="auto",
-            load_in_4bit=False,  # Use 4bit to reduce memory use. False for 16bit LoRA
-            use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
+            trust_remote_code=True
         )
 
-        # Enable for inference
-        FastVisionModel.for_inference(self.model)
+        # Load LoRA adapter
+        self.model = PeftModel.from_pretrained(
+            self.model,
+            self.path,
+            device_map="auto"
+        )
+
+        # Merge and unload for faster inference
+        self.model = self.model.merge_and_unload()
+
+        # Set to eval mode
+        self.model.eval()
 
         # Store the instruction template
         self.instruction = """
@@ -46,77 +69,81 @@ Brief Structured Report:
 </answer>
 """
 
-    def __call__(self, data: Any) -> List[Dict[str, Any]]:
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
         """
-        Args:
-            data (:obj:):
-                includes the input data and the parameters for the inference.
-                Expected format:
-                {
-                    "inputs": {
-                        "image": PIL.Image object,
-                        "instruction": optional_custom_instruction
-                    },
-                    "parameters": {
-                        "max_new_tokens": 512,
-                        "temperature": 0.7,
-                        "top_p": 0.9,
-                        ...
-                    }
-                }
+        data args:
+            inputs (:obj: `str` | `PIL.Image` | `np.array`)
+            parameters (:obj: `Dict[str, Any]`, *optional*)
         Return:
-            A :obj:`list`. The list contains a dictionary with:
-                - "generated_text": The model's response
+            A :obj:`list` | `dict`: will be serialized and returned
         """
+        # Extract inputs and parameters
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
 
-        # Extract image and instruction
-        image = inputs.get("image")
-        custom_instruction = inputs.get("instruction", self.instruction)
+        # Handle different input formats
+        if isinstance(inputs, str):
+            # Base64 encoded image
+            image_bytes = base64.b64decode(inputs)
+            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+        elif isinstance(inputs, dict):
+            # Dictionary with image key
+            image_data = inputs.get("image", inputs.get("inputs", ""))
+            if isinstance(image_data, str):
+                image_bytes = base64.b64decode(image_data)
+                image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+            else:
+                image = image_data
+        else:
+            # Direct image
+            image = inputs
+
+        # Ensure image is RGB
+        if image.mode != "RGB":
+            image = image.convert("RGB")
 
-        # Prepare messages
+        # Prepare messages in Qwen format
         messages = [
-            {"role": "user", "content": [
-                {"type": "image"},
-                {"type": "text", "text": custom_instruction}
-            ]}
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": self.instruction}
+                ]
+            }
         ]
 
-        # Apply chat template
-        input_text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True)
+        # Process inputs
+        text = self.processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
 
-        # Tokenize inputs
-        model_inputs = self.tokenizer(
-            image,
-            input_text,
-            add_special_tokens=False,
-            return_tensors="pt",
+        # Prepare inputs for model
+        inputs = self.processor(
+            text=[text],
+            images=[image],
+            padding=True,
+            return_tensors="pt"
         ).to(self.model.device)
 
-        # Set default parameters
-        generation_params = {
-            "max_new_tokens": parameters.get("max_new_tokens", 512),
-            "temperature": parameters.get("temperature", 0.7),
-            "top_p": parameters.get("top_p", 0.9),
-            "min_p": parameters.get("min_p", 0.1),
-            "use_cache": True,
-            "do_sample": parameters.get("do_sample", True),
-            "repetition_penalty": parameters.get("repetition_penalty", 1.1),
-        }
-
-        output_ids = self.model.generate(
-            **model_inputs,
-            **generation_params
-        )
-
-        # Decode output
-        generated_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Generate response
+        with torch.no_grad():
+            output_ids = self.model.generate(
+                **inputs,
+                max_new_tokens=parameters.get("max_new_tokens", 512),
+                temperature=parameters.get("temperature", 0.7),
+                top_p=parameters.get("top_p", 0.9),
+                do_sample=True,
+                pad_token_id=self.processor.tokenizer.pad_token_id,
+                eos_token_id=self.processor.tokenizer.eos_token_id,
+            )
 
-        # Extract only the generated response (remove the prompt)
-        response = generated_text.split(custom_instruction)[-1].strip()
+        # Decode output - only the generated part
+        output_text = self.processor.batch_decode(
+            output_ids[:, inputs.input_ids.shape[1]:],
+            skip_special_tokens=True
+        )[0]
 
-        return [{
-            "generated_text": response
-        }]
+        return [{"generated_text": output_text}]
 
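For reference, a minimal client-side sketch of calling the handler once deployed. The endpoint URL and token are placeholders, and the payload mirrors the `__call__` contract above: a base64-encoded image string under "inputs", plus optional generation "parameters".

import base64
import requests

# Placeholders: substitute your Inference Endpoint URL and HF token.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = "hf_..."

# Base64-encode a screenshot, matching the str branch of __call__.
with open("screenshot.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "inputs": image_b64,
    "parameters": {"max_new_tokens": 512, "temperature": 0.7, "top_p": 0.9},
}

resp = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json=payload,
)
# The handler returns [{"generated_text": ...}].
print(resp.json()[0]["generated_text"])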
requirements.txt CHANGED
@@ -1,5 +1,6 @@
-torch>=2.0.0
-transformers>=4.36.0
-unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
-Pillow>=9.0.0
-accelerate>=0.25.0
+transformers==4.52.4
+accelerate==1.7.0
+peft==0.15.2
+pillow==11.2.1
+torch==2.7.1
+torchvision==0.22.1
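
With the pinned requirements installed, the handler can be smoke-tested locally before deploying. A minimal sketch, assuming a CUDA GPU for the float16 weights and a placeholder image path:

from PIL import Image
from handler import EndpointHandler

# Loads the base Qwen2-VL model plus the LoRA adapter (downloads on first run).
handler = EndpointHandler()

# "screenshot.png" is a placeholder; the dict form exercises the
# {"inputs": {"image": ...}} branch of __call__.
image = Image.open("screenshot.png")
result = handler({"inputs": {"image": image}})
print(result[0]["generated_text"])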