CanerDedeoglu committed
Commit 18475c7 · verified · 1 Parent(s): 27bc9ca

Update handler.py

Files changed (1)
  1. handler.py +254 -151
handler.py CHANGED
@@ -1,25 +1,24 @@
- # -*- coding: utf-8 -*-
- # handler.py — PULSE-7B / LLaVA endpoint (using mm_utils_local)
- # - Fetches the LLaVA sources at runtime via git clone (model builder, conv, constants)
- # - Image processing: mm_utils_local.process_images / tokenizer_image_token
- # - image_processor fallback (AutoProcessor / vision_tower)
- # - anyres -> pad safe fallback (mm_utils_local is already robust)
- # - forward patch (silently drop cache_position/input_positions)
- # - attention_mask: only sent if the model supports it (conditional, to avoid unused-kwargs errors)

- import os, io, sys, subprocess, base64, inspect
- from typing import Any, Dict, List, Optional, Tuple

  import torch
  from PIL import Image
  import requests

- # ===== Model ID =====
  MODEL_ID = os.getenv("HF_MODEL_ID", "PULSE-ECG/PULSE-7B")

- # ===== Fetch the LLaVA sources at runtime =====
  LLAVA_GIT_URL = os.getenv("LLAVA_GIT_URL", "https://github.com/haotian-liu/LLaVA.git")
- LLAVA_GIT_REF = os.getenv("LLAVA_GIT_REF", "v1.2.2.post1")
  LLAVA_SRC_DIR = os.getenv("LLAVA_SRC_DIR", "/tmp/llava_src/LLaVA")

  def _ensure_llava():
@@ -34,7 +33,141 @@ def _ensure_llava():

  _ensure_llava()

- # ---- LLaVA components ----
  from llava.model.builder import load_pretrained_model
  from llava.constants import (
      IMAGE_TOKEN_INDEX,
@@ -45,53 +178,42 @@ from llava.constants import (
  from llava.conversation import conv_templates
  from llava.utils import disable_torch_init

- # ---- mm_utils_local (your own file) ----
- from mm_utils_local import (
-     tokenizer_image_token,
-     process_images,
-     get_model_name_from_path,
- )
-
- # HF processor fallbacks
- from transformers import AutoProcessor, AutoImageProcessor, CLIPImageProcessor
-

  class EndpointHandler:
      """
      Input:
-     {
-       "inputs": { "query": "...", "image": "<url|dataurl|path>" },
-       "parameters": {
-         "max_new_tokens": 256, "temperature": 0.0, "top_p": 1.0,
-         "repetition_penalty": 1.0, "do_sample": false, "use_cache": true
-       },
-       "conv_mode": "llava_v2"  # optional
-     }
      Output: [ { "generated_text": "..." } ]
      """
      def __init__(self, path: str = "") -> None:
          disable_torch_init()

-         # Model path priority: HF_MODEL_LOCAL_DIR > HF_MODEL_ID > MODEL_ID
          if os.getenv("HF_MODEL_LOCAL_DIR", "").strip():
              model_path = os.getenv("HF_MODEL_LOCAL_DIR").strip()
          elif os.getenv("HF_MODEL_ID", "").strip():
              model_path = os.getenv("HF_MODEL_ID").strip()
          else:
-             model_path = MODEL_ID
-         if not model_path:
-             raise RuntimeError("Could not determine a model path. Set HF_MODEL_LOCAL_DIR / HF_MODEL_ID / MODEL_ID.")

          self.model_name = get_model_name_from_path(model_path)

-         # Attention implementation selection
          try:
-             import flash_attn  # noqa: F401
              attn_impl = "flash_attention_2"
-         except Exception:
              attn_impl = "sdpa"
-
-         # Load the model
          self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
              model_path=model_path,
              model_base=None,
@@ -102,15 +224,18 @@ class EndpointHandler:
          )
          self.model.eval()

-         # ---- forward patch: compatibility with newer HF kwargs ----
          def _patch_forward(obj, label="model"):
              try:
-                 if not hasattr(obj, "forward"): return False
-                 orig = obj.forward
                  def patched_forward(*args, **kwargs):
                      kwargs.pop("cache_position", None)
                      kwargs.pop("input_positions", None)
-                     return orig(*args, **kwargs)
                  obj.forward = patched_forward
                  print(f"[hotfix] Patched forward on {label}")
                  return True
@@ -118,64 +243,34 @@ class EndpointHandler:
              print(f"[warn] forward patch failed on {label}: {e}")
              return False

          _patch_forward(self.model, "self.model")
-         if hasattr(self.model, "model"): _patch_forward(self.model.model, "self.model.model")
-         if hasattr(self.model, "base_model"): _patch_forward(self.model.base_model, "self.model.base_model")

-         # ---- image_processor fallback ----
-         if self.image_processor is None:
-             print("[hotfix] image_processor is None, trying the AutoProcessor fallback...")
-             try:
-                 proc = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
-                 self.image_processor = getattr(proc, "image_processor", proc)
-             except Exception as e:
-                 print(f"[warn] AutoProcessor failed: {e}")
-                 vt = getattr(self.model.config, "vision_tower", None)
-                 if vt:
-                     try:
-                         self.image_processor = AutoImageProcessor.from_pretrained(vt, trust_remote_code=True)
-                     except Exception:
-                         self.image_processor = CLIPImageProcessor.from_pretrained(vt)
-
-         # anyres -> pad fallback (when processor/crop_size is missing)
-         iar = getattr(self.model.config, "mm_image_aspect_ratio", None) or \
-               getattr(self.model.config, "image_aspect_ratio", None)
-         needs_crop = (self.image_processor is None) or (getattr(self.image_processor, "crop_size", None) is None)
-         if iar == "anyres" and needs_crop:
-             print("[hotfix] image_aspect_ratio:anyres -> pad (processor/crop_size missing)")
-             if hasattr(self.model.config, "image_aspect_ratio"):
-                 self.model.config.image_aspect_ratio = "pad"
-             if hasattr(self.model.config, "mm_image_aspect_ratio"):
-                 self.model.config.mm_image_aspect_ratio = "pad"
-
-         # multimodal flags
          self.use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
-         self.is_multimodal = ('llava' in self.model_name.lower()) or ('pulse' in self.model_name.lower())

-         # Defaults
-         self.DEFAULT_CONV_MODE = os.getenv("LLAVA_CONV_MODE", "llava_v1")
-         self.MAX_NEW_TOKENS_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))
-
-         # Detect attention_mask support once
-         self._supports_attention_mask = False
-         try:
-             sig = inspect.signature(self.model.forward)
-             self._supports_attention_mask = ("attention_mask" in sig.parameters)
-         except Exception:
-             self._supports_attention_mask = False
-
-     # -------------------------
-     # Internal helpers
-     # -------------------------
      def _load_image(self, img_field: str) -> Optional[Image.Image]:
          """URL / base64 / path -> PIL.Image"""
-         if not img_field: return None
          try:
              if img_field.startswith("data:image"):
                  _, b64 = img_field.split(",", 1)
                  return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
              if img_field.startswith(("http://", "https://")):
-                 r = requests.get(img_field, timeout=20); r.raise_for_status()
                  return Image.open(io.BytesIO(r.content)).convert("RGB")
              return Image.open(img_field).convert("RGB")
          except Exception as e:
@@ -183,88 +278,106 @@ class EndpointHandler:
              return None

      def _build_prompt(self, user_text: str, conv_mode: str) -> str:
-         """Build the prompt in LLaVA model-worker style."""
          if conv_mode not in conv_templates:
-             conv_mode = self.DEFAULT_CONV_MODE
          conv = conv_templates[conv_mode].copy()
          conv.append_message(conv.roles[0], user_text)
          conv.append_message(conv.roles[1], None)
          return conv.get_prompt()

-     # -------------------------
-     # Inference entry
-     # -------------------------
      def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-         inputs: Dict[str, Any] = data.get("inputs") or {}
-         params: Dict[str, Any] = data.get("parameters") or {}
-         conv_mode_req: Optional[str] = data.get("conv_mode")

-         conv_mode = conv_mode_req if conv_mode_req in conv_templates else self.DEFAULT_CONV_MODE
          query_text = inputs.get("query", "") or inputs.get("text", "") or inputs.get("prompt", "")
          image_f = inputs.get("image") or inputs.get("image_url") or inputs.get("image_base64")

-         # 1) Prompt
          prompt = self._build_prompt(query_text, conv_mode)
-
-         # 2) Image processing
          images = None
          image_sizes = None
          if image_f and self.is_multimodal:
              try:
                  pil_image = self._load_image(image_f)
-                 if pil_image is not None and self.image_processor is not None:
                      images_list = [pil_image]
                      image_sizes = [pil_image.size]
-
                      processed_images = process_images(images_list, self.image_processor, self.model.config)
                      if isinstance(processed_images, list):
                          images = [img.to(self.model.device, dtype=torch.float16) for img in processed_images]
                      else:
                          images = processed_images.to(self.model.device, dtype=torch.float16)
-
-                     # Add the image token + wrap with im_start/end
-                     prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
                      replace_token = DEFAULT_IMAGE_TOKEN
                      if self.use_im_start_end:
                          replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
                      prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
-                     print("[info] Image processed successfully.")
                  else:
-                     print("[warn] Could not load image or image_processor is None.")
              except Exception as e:
                  print(f"[warn] Image processing failed: {e}")
-                 import traceback; traceback.print_exc()
-                 images = None; image_sizes = None

-         # 3) Tokenization
          try:
              input_ids = tokenizer_image_token(
                  prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
              ).unsqueeze(0).to(self.model.device)
          except Exception as e:
              print(f"[error] Tokenization failed: {e}")
-             enc = self.tokenizer(query_text, return_tensors="pt")
-             input_ids = enc.input_ids.to(self.model.device)
-             images = None; image_sizes = None
-
-         # attention_mask: built here; added later only if the model supports it
-         attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)

-         # 4) Generation params
          temperature = float(params.get("temperature", 0.0))
          top_p = float(params.get("top_p", 1.0))
          repetition_penalty = float(params.get("repetition_penalty", 1.0))
-         max_new_tokens = min(int(params.get("max_new_tokens", self.MAX_NEW_TOKENS_DEF)), 1024)
          do_sample = bool(params.get("do_sample", temperature > 0.001))
-
-         max_context_length = getattr(self.model.config, 'max_position_embeddings', 4096)
-         max_new_tokens = min(max_new_tokens, max(1, max_context_length - input_ids.shape[-1] - 50))
          if max_new_tokens < 1:
              return [{"generated_text": "Error: Input too long, exceeds max token length."}]

-         # 5) Gen kwargs
-         gen_kwargs: Dict[str, Any] = {
-             "inputs": input_ids,
              "max_new_tokens": max_new_tokens,
              "temperature": temperature,
              "top_p": top_p,
@@ -273,40 +386,30 @@ class EndpointHandler:
              "use_cache": bool(params.get("use_cache", True)),
              "pad_token_id": self.tokenizer.eos_token_id,
          }
-         if self._supports_attention_mask:
-             gen_kwargs["attention_mask"] = attention_mask

          if images is not None and image_sizes is not None:
              gen_kwargs["images"] = images
              gen_kwargs["image_sizes"] = image_sizes
-             print("[info] Using images in generation.")
          else:
-             # Strip any image tokens left in the prompt (text-only safety)
-             prompt_clean = prompt.replace(DEFAULT_IMAGE_TOKEN, "") \
-                                  .replace(DEFAULT_IM_START_TOKEN, "") \
-                                  .replace(DEFAULT_IM_END_TOKEN, "")
-             if prompt_clean != prompt:
-                 try:
-                     input_ids = self.tokenizer(prompt_clean, return_tensors="pt").input_ids.to(self.model.device)
-                     gen_kwargs["inputs"] = input_ids
-                     if self._supports_attention_mask:
-                         gen_kwargs["attention_mask"] = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)
-                 except Exception as e:
-                     print(f"[warn] prompt cleanup failed: {e}")
-             print("[info] Text-only generation.")
-
-         # 6) Generate
          try:
              with torch.inference_mode():
                  output_ids = self.model.generate(**gen_kwargs)
-             if output_ids.shape[-1] > gen_kwargs["inputs"].shape[-1]:
-                 response_ids = output_ids[:, gen_kwargs["inputs"].shape[-1]:]
                  text = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True)[0].strip()
              else:
                  text = "Error: No response generated"
          except Exception as e:
              print(f"Generation error: {e}")
-             import traceback; traceback.print_exc()
              text = f"Error during generation: {str(e)}"
-
-         return [{"generated_text": text}]
@@ -1,25 +1,24 @@

+ # -*- coding: utf-8 -*-
+ import os, io, sys, subprocess, base64
+ from typing import Any, Dict, List, Optional

  import torch
  from PIL import Image
  import requests
+ import math
+ import ast

+ # ===== HF model id to use =====
  MODEL_ID = os.getenv("HF_MODEL_ID", "PULSE-ECG/PULSE-7B")

+ # Environment variables for Flash Attention
+ os.environ.setdefault("FLASH_ATTENTION", "1")
+ os.environ.setdefault("ATTN_IMPLEMENTATION", "flash_attention_2")
+
+ # ===== Fetch the LLaVA source code at runtime (no pip install) =====
  LLAVA_GIT_URL = os.getenv("LLAVA_GIT_URL", "https://github.com/haotian-liu/LLaVA.git")
+ LLAVA_GIT_REF = os.getenv("LLAVA_GIT_REF", "v1.2.2.post1")  # known-good, stable
  LLAVA_SRC_DIR = os.getenv("LLAVA_SRC_DIR", "/tmp/llava_src/LLaVA")

  def _ensure_llava():
@@ -34,7 +33,141 @@ def _ensure_llava():

  _ensure_llava()

+ # ---- Try to import the mm_utils functions; otherwise fall back to our own implementation ----
+ try:
+     from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path, load_image_from_base64
+ except ImportError:
+     # Fallback: use our own implementation
+     from llava.constants import IMAGE_TOKEN_INDEX
+
+     def expand2square(pil_img, background_color):
+         width, height = pil_img.size
+         if width == height:
+             return pil_img
+         elif width > height:
+             result = Image.new(pil_img.mode, (width, width), background_color)
+             result.paste(pil_img, (0, (width - height) // 2))
+             return result
+         else:
+             result = Image.new(pil_img.mode, (height, height), background_color)
+             result.paste(pil_img, ((height - width) // 2, 0))
+             return result
+
+     def select_best_resolution(original_size, possible_resolutions):
+         original_width, original_height = original_size
+         best_fit = None
+         max_effective_resolution = 0
+         min_wasted_resolution = float('inf')
+
+         for width, height in possible_resolutions:
+             scale = min(width / original_width, height / original_height)
+             downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+             effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+             wasted_resolution = (width * height) - effective_resolution
+
+             if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+                 max_effective_resolution = effective_resolution
+                 min_wasted_resolution = wasted_resolution
+                 best_fit = (width, height)
+         return best_fit
+
+     def resize_and_pad_image(image, target_resolution):
+         original_width, original_height = image.size
+         target_width, target_height = target_resolution
+
+         scale_w = target_width / original_width
+         scale_h = target_height / original_height
+
+         if scale_w < scale_h:
+             new_width = target_width
+             new_height = min(math.ceil(original_height * scale_w), target_height)
+         else:
+             new_height = target_height
+             new_width = min(math.ceil(original_width * scale_h), target_width)
+
+         resized_image = image.resize((new_width, new_height))
+         new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
+         paste_x = (target_width - new_width) // 2
+         paste_y = (target_height - new_height) // 2
+         new_image.paste(resized_image, (paste_x, paste_y))
+         return new_image
+
+     def divide_to_patches(image, patch_size):
+         patches = []
+         width, height = image.size
+         for i in range(0, height, patch_size):
+             for j in range(0, width, patch_size):
+                 box = (j, i, j + patch_size, i + patch_size)
+                 patch = image.crop(box)
+                 patches.append(patch)
+         return patches
+
+     def process_anyres_image(image, processor, grid_pinpoints):
+         if type(grid_pinpoints) is list:
+             possible_resolutions = grid_pinpoints
+         else:
+             possible_resolutions = ast.literal_eval(grid_pinpoints)
+         best_resolution = select_best_resolution(image.size, possible_resolutions)
+         image_padded = resize_and_pad_image(image, best_resolution)
+         patches = divide_to_patches(image_padded, processor.crop_size['height'])
+         image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
+         image_patches = [image_original_resize] + patches
+         image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
+                          for image_patch in image_patches]
+         return torch.stack(image_patches, dim=0)
+
+     def process_images(images, image_processor, model_cfg):
+         """CRITICAL: full mm_utils.py implementation"""
+         image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+         new_images = []
+         if image_aspect_ratio == 'pad':
+             for image in images:
+                 image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
+                 image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+                 new_images.append(image)
+         elif image_aspect_ratio == "anyres":
+             for image in images:
+                 image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
+                 new_images.append(image)
+         else:
+             return image_processor(images, return_tensors='pt')['pixel_values']
+         if all(x.shape == new_images[0].shape for x in new_images):
+             new_images = torch.stack(new_images, dim=0)
+         return new_images
+
+     def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+         prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+         def insert_separator(X, sep):
+             return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
+
+         input_ids = []
+         offset = 0
+         if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+             offset = 1
+             input_ids.append(prompt_chunks[0][0])
+
+         for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+             input_ids.extend(x[offset:])
+
+         if return_tensors is not None:
+             if return_tensors == 'pt':
+                 return torch.tensor(input_ids, dtype=torch.long)
+             raise ValueError(f'Unsupported tensor type: {return_tensors}')
+         return input_ids
+
+     def get_model_name_from_path(model_path):
+         model_path = model_path.strip("/")
+         model_paths = model_path.split("/")
+         if model_paths[-1].startswith('checkpoint-'):
+             return model_paths[-2] + "_" + model_paths[-1]
+         else:
+             return model_paths[-1]
+
+     def load_image_from_base64(image):
+         return Image.open(io.BytesIO(base64.b64decode(image)))
+
+ # ---- LLaVA components (taken from the model worker) ----
  from llava.model.builder import load_pretrained_model
  from llava.constants import (
      IMAGE_TOKEN_INDEX,
@@ -45,53 +178,42 @@ from llava.constants import (
  from llava.conversation import conv_templates
  from llava.utils import disable_torch_init

+ # Defaults
+ DEFAULT_CONV_MODE = os.getenv("LLAVA_CONV_MODE", "llava_v1")
+ MAX_NEW_TOKENS_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))

  class EndpointHandler:
      """
      Input:
+     {
+       "inputs": { "query": "...", "image": "<url|dataurl|path>" },
+       "parameters": { "max_new_tokens": 256, "temperature": 0.0, "top_p": 1.0,
+                       "repetition_penalty": 1.0, "do_sample": false, "use_cache": true },
+       "conv_mode": "llava_v2"  # optional
+     }
      Output: [ { "generated_text": "..." } ]
      """
      def __init__(self, path: str = "") -> None:
          disable_torch_init()

+         # Determine the model path, whether PULSE-7B comes from the HF Hub or a local dir
          if os.getenv("HF_MODEL_LOCAL_DIR", "").strip():
              model_path = os.getenv("HF_MODEL_LOCAL_DIR").strip()
          elif os.getenv("HF_MODEL_ID", "").strip():
              model_path = os.getenv("HF_MODEL_ID").strip()
          else:
+             model_path = MODEL_ID  # default: PULSE-7B from the HF Hub

          self.model_name = get_model_name_from_path(model_path)

+         # Pick the attention implementation automatically
          try:
+             import flash_attn
              attn_impl = "flash_attention_2"
+         except ImportError:
              attn_impl = "sdpa"
+
+         # PULSE is LLaVA-based, so it is loaded with the LLaVA loader
          self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
              model_path=model_path,
              model_base=None,
@@ -102,15 +224,18 @@ class EndpointHandler:
          )
          self.model.eval()

          def _patch_forward(obj, label="model"):
              try:
+                 if not hasattr(obj, "forward"):
+                     return False
+                 orig_forward = obj.forward
+
                  def patched_forward(*args, **kwargs):
+                     # Newer kwargs to drop silently
                      kwargs.pop("cache_position", None)
                      kwargs.pop("input_positions", None)
+                     return orig_forward(*args, **kwargs)
+
                  obj.forward = patched_forward
                  print(f"[hotfix] Patched forward on {label}")
                  return True
@@ -118,64 +243,34 @@ class EndpointHandler:
              print(f"[warn] forward patch failed on {label}: {e}")
              return False

+         # Try on the top-level model first
          _patch_forward(self.model, "self.model")

+         # In some versions the forward chain also goes through an inner module
+         if hasattr(self.model, "model"):
+             _patch_forward(self.model.model, "self.model.model")
+         if hasattr(self.model, "base_model"):
+             _patch_forward(self.model.base_model, "self.model.base_model")
+         # =======================================================================
+
+         # From the model worker: multimodal check
+         self.is_multimodal = 'llava' in self.model_name.lower() or 'pulse' in self.model_name.lower()
+
+         # Image token markers (LLaVA config)
          self.use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)

+     # ---- helpers ----
      def _load_image(self, img_field: str) -> Optional[Image.Image]:
          """URL / base64 / path -> PIL.Image"""
+         if not img_field:
+             return None
          try:
              if img_field.startswith("data:image"):
                  _, b64 = img_field.split(",", 1)
                  return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
              if img_field.startswith(("http://", "https://")):
+                 r = requests.get(img_field, timeout=20)
+                 r.raise_for_status()
                  return Image.open(io.BytesIO(r.content)).convert("RGB")
              return Image.open(img_field).convert("RGB")
          except Exception as e:
@@ -183,88 +278,106 @@ class EndpointHandler:
              return None

      def _build_prompt(self, user_text: str, conv_mode: str) -> str:
+         """Build the prompt in model-worker style"""
          if conv_mode not in conv_templates:
+             conv_mode = DEFAULT_CONV_MODE
          conv = conv_templates[conv_mode].copy()
+
+         # In the model worker, image tokens are substituted in later;
+         # start with text only for now
          conv.append_message(conv.roles[0], user_text)
          conv.append_message(conv.roles[1], None)
          return conv.get_prompt()

+     # ---- inference ----
      def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         inputs = data.get("inputs") or {}
+         params = data.get("parameters") or {}
+         conv_mode_req = data.get("conv_mode")

+         conv_mode = conv_mode_req if conv_mode_req in conv_templates else DEFAULT_CONV_MODE
          query_text = inputs.get("query", "") or inputs.get("text", "") or inputs.get("prompt", "")
          image_f = inputs.get("image") or inputs.get("image_url") or inputs.get("image_base64")

+         # 1) Build the initial prompt (without the image)
          prompt = self._build_prompt(query_text, conv_mode)
+
+         # 2) Image processing (model-worker style)
          images = None
          image_sizes = None
+
          if image_f and self.is_multimodal:
              try:
                  pil_image = self._load_image(image_f)
+                 if pil_image is not None:
                      images_list = [pil_image]
                      image_sizes = [pil_image.size]
+
+                     # Process as in the model worker
                      processed_images = process_images(images_list, self.image_processor, self.model.config)
+
                      if isinstance(processed_images, list):
                          images = [img.to(self.model.device, dtype=torch.float16) for img in processed_images]
                      else:
                          images = processed_images.to(self.model.device, dtype=torch.float16)
+
+                     # Adjust the prompt as in the model worker:
+                     # prepend DEFAULT_IMAGE_TOKEN
+                     prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
+
+                     # Compute the replace token (from the model worker)
                      replace_token = DEFAULT_IMAGE_TOKEN
                      if self.use_im_start_end:
                          replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+
+                     # Replace the image tokens in the prompt
                      prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
+
+                     print(f"[info] Image processed successfully")
+                     print(f"[debug] Final prompt: {repr(prompt[:200])}")
                  else:
+                     print("[warn] Could not load image")
              except Exception as e:
                  print(f"[warn] Image processing failed: {e}")
+                 import traceback
+                 traceback.print_exc()
+                 images = None
+                 image_sizes = None

+         # 3) Tokenize (model-worker style)
          try:
              input_ids = tokenizer_image_token(
                  prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
              ).unsqueeze(0).to(self.model.device)
+
+             print(f"[debug] input_ids shape: {input_ids.shape}")
+             print(f"[debug] Has images: {images is not None}")
+
          except Exception as e:
              print(f"[error] Tokenization failed: {e}")
+             # Fallback to text-only
+             input_ids = self.tokenizer(query_text, return_tensors="pt").input_ids
+             input_ids = input_ids.to(self.model.device)
+             images = None
+             image_sizes = None

+         # 4) Generation parameters (model-worker style)
          temperature = float(params.get("temperature", 0.0))
          top_p = float(params.get("top_p", 1.0))
          repetition_penalty = float(params.get("repetition_penalty", 1.0))
+         max_new_tokens = min(int(params.get("max_new_tokens", MAX_NEW_TOKENS_DEF)), 1024)
          do_sample = bool(params.get("do_sample", temperature > 0.001))
+
+         # Context length check
+         max_context_length = getattr(self.model.config, 'max_position_embeddings', 2048)
+         max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - 50)
+
          if max_new_tokens < 1:
              return [{"generated_text": "Error: Input too long, exceeds max token length."}]

+         # 5) Generation kwargs (model-worker style)
+         gen_kwargs = {
+             "inputs": input_ids,  # the model worker uses 'inputs'
              "max_new_tokens": max_new_tokens,
              "temperature": temperature,
              "top_p": top_p,
@@ -273,40 +386,30 @@ class EndpointHandler:
              "use_cache": bool(params.get("use_cache", True)),
              "pad_token_id": self.tokenizer.eos_token_id,
          }

+         # Image args (model-worker style)
          if images is not None and image_sizes is not None:
              gen_kwargs["images"] = images
              gen_kwargs["image_sizes"] = image_sizes
+             print(f"[info] Using images in generation")
          else:
+             print("[info] Text-only generation")
+
          try:
              with torch.inference_mode():
                  output_ids = self.model.generate(**gen_kwargs)
+
+             # Separate the generated output from the input
+             if output_ids.shape[-1] > input_ids.shape[-1]:
+                 response_ids = output_ids[:, input_ids.shape[-1]:]
                  text = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True)[0].strip()
              else:
                  text = "Error: No response generated"
+
          except Exception as e:
              print(f"Generation error: {e}")
+             import traceback
+             traceback.print_exc()
              text = f"Error during generation: {str(e)}"
+
+         return [{"generated_text": text}]