CanerDedeoglu committed on
Commit 83e2bc9 · verified · 1 Parent(s): 6d1697f

Update handler.py

Files changed (1)
  1. handler.py +296 -248
handler.py CHANGED
@@ -1,24 +1,31 @@
-
  # -*- coding: utf-8 -*-
  import os, io, sys, subprocess, base64
- from typing import Any, Dict, List, Optional

  import torch
  from PIL import Image
  import requests
  import math
- import ast

  # ===== HF model id to use =====
  MODEL_ID = os.getenv("HF_MODEL_ID", "PULSE-ECG/PULSE-7B")

- # Environment settings for Flash Attention
  os.environ.setdefault("FLASH_ATTENTION", "1")
  os.environ.setdefault("ATTN_IMPLEMENTATION", "flash_attention_2")

- # ===== Fetch the LLaVA source at runtime (no pip) =====
  LLAVA_GIT_URL = os.getenv("LLAVA_GIT_URL", "https://github.com/haotian-liu/LLaVA.git")
- LLAVA_GIT_REF = os.getenv("LLAVA_GIT_REF", "v1.2.2.post1")  # proven, stable
  LLAVA_SRC_DIR = os.getenv("LLAVA_SRC_DIR", "/tmp/llava_src/LLaVA")

  def _ensure_llava():
@@ -33,141 +40,7 @@ def _ensure_llava():

  _ensure_llava()

- # ---- Try to import the mm_utils functions; fall back to our own implementations ----
- try:
-     from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path, load_image_from_base64
- except ImportError:
-     # Fallback: use our own implementation
-     from llava.constants import IMAGE_TOKEN_INDEX
-
-     def expand2square(pil_img, background_color):
-         width, height = pil_img.size
-         if width == height:
-             return pil_img
-         elif width > height:
-             result = Image.new(pil_img.mode, (width, width), background_color)
-             result.paste(pil_img, (0, (width - height) // 2))
-             return result
-         else:
-             result = Image.new(pil_img.mode, (height, height), background_color)
-             result.paste(pil_img, ((height - width) // 2, 0))
-             return result
-
-     def select_best_resolution(original_size, possible_resolutions):
-         original_width, original_height = original_size
-         best_fit = None
-         max_effective_resolution = 0
-         min_wasted_resolution = float('inf')
-
-         for width, height in possible_resolutions:
-             scale = min(width / original_width, height / original_height)
-             downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-             effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-             wasted_resolution = (width * height) - effective_resolution
-
-             if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
-                 max_effective_resolution = effective_resolution
-                 min_wasted_resolution = wasted_resolution
-                 best_fit = (width, height)
-         return best_fit
-
-     def resize_and_pad_image(image, target_resolution):
-         original_width, original_height = image.size
-         target_width, target_height = target_resolution
-
-         scale_w = target_width / original_width
-         scale_h = target_height / original_height
-
-         if scale_w < scale_h:
-             new_width = target_width
-             new_height = min(math.ceil(original_height * scale_w), target_height)
-         else:
-             new_height = target_height
-             new_width = min(math.ceil(original_width * scale_h), target_width)
-
-         resized_image = image.resize((new_width, new_height))
-         new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
-         paste_x = (target_width - new_width) // 2
-         paste_y = (target_height - new_height) // 2
-         new_image.paste(resized_image, (paste_x, paste_y))
-         return new_image
-
-     def divide_to_patches(image, patch_size):
-         patches = []
-         width, height = image.size
-         for i in range(0, height, patch_size):
-             for j in range(0, width, patch_size):
-                 box = (j, i, j + patch_size, i + patch_size)
-                 patch = image.crop(box)
-                 patches.append(patch)
-         return patches
-
-     def process_anyres_image(image, processor, grid_pinpoints):
-         if type(grid_pinpoints) is list:
-             possible_resolutions = grid_pinpoints
-         else:
-             possible_resolutions = ast.literal_eval(grid_pinpoints)
-         best_resolution = select_best_resolution(image.size, possible_resolutions)
-         image_padded = resize_and_pad_image(image, best_resolution)
-         patches = divide_to_patches(image_padded, processor.crop_size['height'])
-         image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
-         image_patches = [image_original_resize] + patches
-         image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
-                          for image_patch in image_patches]
-         return torch.stack(image_patches, dim=0)
-
-     def process_images(images, image_processor, model_cfg):
-         """CRITICAL: full mm_utils.py implementation"""
-         image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
-         new_images = []
-         if image_aspect_ratio == 'pad':
-             for image in images:
-                 image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
-                 image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
-                 new_images.append(image)
-         elif image_aspect_ratio == "anyres":
-             for image in images:
-                 image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
-                 new_images.append(image)
-         else:
-             return image_processor(images, return_tensors='pt')['pixel_values']
-         if all(x.shape == new_images[0].shape for x in new_images):
-             new_images = torch.stack(new_images, dim=0)
-         return new_images
-
-     def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
-         prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
-
-         def insert_separator(X, sep):
-             return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
-
-         input_ids = []
-         offset = 0
-         if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
-             offset = 1
-             input_ids.append(prompt_chunks[0][0])
-
-         for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
-             input_ids.extend(x[offset:])
-
-         if return_tensors is not None:
-             if return_tensors == 'pt':
-                 return torch.tensor(input_ids, dtype=torch.long)
-             raise ValueError(f'Unsupported tensor type: {return_tensors}')
-         return input_ids
-
-     def get_model_name_from_path(model_path):
-         model_path = model_path.strip("/")
-         model_paths = model_path.split("/")
-         if model_paths[-1].startswith('checkpoint-'):
-             return model_paths[-2] + "_" + model_paths[-1]
-         else:
-             return model_paths[-1]
-
-     def load_image_from_base64(image):
-         return Image.open(io.BytesIO(base64.b64decode(image)))
-
- # ---- LLaVA components (taken from the model worker) ----
  from llava.model.builder import load_pretrained_model
  from llava.constants import (
      IMAGE_TOKEN_INDEX,
@@ -178,42 +51,206 @@ from llava.constants import (
  from llava.conversation import conv_templates
  from llava.utils import disable_torch_init

- # Defaults
- DEFAULT_CONV_MODE = os.getenv("LLAVA_CONV_MODE", "llava_v1")
- MAX_NEW_TOKENS_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))

  class EndpointHandler:
      """
      Input:
-       {
-         "inputs": { "query": "...", "image": "<url|dataurl|path>" },
-         "parameters": { "max_new_tokens": 256, "temperature": 0.0, "top_p": 1.0,
-                         "repetition_penalty": 1.0, "do_sample": false, "use_cache": true },
-         "conv_mode": "llava_v2"  # optional
-       }
      Output: [ { "generated_text": "..." } ]
      """
      def __init__(self, path: str = "") -> None:
          disable_torch_init()

-         # Determine where PULSE-7B is loaded from (HF Hub or local)
          if os.getenv("HF_MODEL_LOCAL_DIR", "").strip():
              model_path = os.getenv("HF_MODEL_LOCAL_DIR").strip()
          elif os.getenv("HF_MODEL_ID", "").strip():
              model_path = os.getenv("HF_MODEL_ID").strip()
          else:
-             model_path = MODEL_ID  # default: HF Hub PULSE-7B

          self.model_name = get_model_name_from_path(model_path)

-         # Auto-select the attention implementation
          try:
-             import flash_attn
              attn_impl = "flash_attention_2"
-         except ImportError:
              attn_impl = "sdpa"
-
-         # PULSE is LLaVA-based, so it is loaded with the LLaVA loader
          self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
              model_path=model_path,
              model_base=None,
@@ -224,18 +261,15 @@ class EndpointHandler:
          )
          self.model.eval()

          def _patch_forward(obj, label="model"):
              try:
-                 if not hasattr(obj, "forward"):
-                     return False
-                 orig_forward = obj.forward
-
                  def patched_forward(*args, **kwargs):
-                     # New kwargs to drop silently
                      kwargs.pop("cache_position", None)
                      kwargs.pop("input_positions", None)
-                     return orig_forward(*args, **kwargs)
-
                  obj.forward = patched_forward
                  print(f"[hotfix] Patched forward on {label}")
                  return True
@@ -243,34 +277,56 @@ class EndpointHandler:
              print(f"[warn] forward patch failed on {label}: {e}")
              return False

-         # Try on the main model
          _patch_forward(self.model, "self.model")

-         # In some versions the forward chain also goes through an inner module
-         if hasattr(self.model, "model"):
-             _patch_forward(self.model.model, "self.model.model")
-         if hasattr(self.model, "base_model"):
-             _patch_forward(self.model.base_model, "self.model.base_model")
-         # =======================================================================
-
-         # From the model worker: multimodal check
-         self.is_multimodal = 'llava' in self.model_name.lower() or 'pulse' in self.model_name.lower()
-
-         # Image token markers (LLaVA config)
          self.use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)

-     # ---- helpers ----
      def _load_image(self, img_field: str) -> Optional[Image.Image]:
          """URL / base64 / path -> PIL.Image"""
-         if not img_field:
-             return None
          try:
              if img_field.startswith("data:image"):
                  _, b64 = img_field.split(",", 1)
                  return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
              if img_field.startswith(("http://", "https://")):
-                 r = requests.get(img_field, timeout=20)
-                 r.raise_for_status()
                  return Image.open(io.BytesIO(r.content)).convert("RGB")
              return Image.open(img_field).convert("RGB")
          except Exception as e:
@@ -278,106 +334,90 @@ class EndpointHandler:
              return None

      def _build_prompt(self, user_text: str, conv_mode: str) -> str:
-         """Build the prompt in model-worker style"""
          if conv_mode not in conv_templates:
-             conv_mode = DEFAULT_CONV_MODE
          conv = conv_templates[conv_mode].copy()
-
-         # In the model worker, images are substituted later;
-         # for now, start with text only
          conv.append_message(conv.roles[0], user_text)
          conv.append_message(conv.roles[1], None)
          return conv.get_prompt()

-     # ---- inference ----
      def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-         inputs = data.get("inputs") or {}
-         params = data.get("parameters") or {}
-         conv_mode_req = data.get("conv_mode")

-         conv_mode = conv_mode_req if conv_mode_req in conv_templates else DEFAULT_CONV_MODE
          query_text = inputs.get("query", "") or inputs.get("text", "") or inputs.get("prompt", "")
          image_f = inputs.get("image") or inputs.get("image_url") or inputs.get("image_base64")

-         # 1) Build the initial prompt (without the image)
          prompt = self._build_prompt(query_text, conv_mode)
-
-         # 2) Image processing (model-worker style)
          images = None
          image_sizes = None
-
          if image_f and self.is_multimodal:
              try:
                  pil_image = self._load_image(image_f)
-                 if pil_image is not None:
                      images_list = [pil_image]
                      image_sizes = [pil_image.size]
-
-                     # Process as in the model worker
                      processed_images = process_images(images_list, self.image_processor, self.model.config)
-
                      if isinstance(processed_images, list):
                          images = [img.to(self.model.device, dtype=torch.float16) for img in processed_images]
                      else:
                          images = processed_images.to(self.model.device, dtype=torch.float16)
-
-                     # Adjust the prompt as in the model worker:
-                     # prepend DEFAULT_IMAGE_TOKEN to the prompt
-                     prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
-
-                     # Compute the replace token (from the model worker)
                      replace_token = DEFAULT_IMAGE_TOKEN
                      if self.use_im_start_end:
                          replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
-
-                     # Replace the image tokens in the prompt
                      prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
-
-                     print(f"[info] Image processed successfully")
-                     print(f"[debug] Final prompt: {repr(prompt[:200])}")
                  else:
-                     print("[warn] Could not load image")
              except Exception as e:
                  print(f"[warn] Image processing failed: {e}")
-                 import traceback
-                 traceback.print_exc()
-                 images = None
-                 image_sizes = None

-         # 3) Tokenize (model-worker style)
          try:
              input_ids = tokenizer_image_token(
                  prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
              ).unsqueeze(0).to(self.model.device)
-
-             print(f"[debug] input_ids shape: {input_ids.shape}")
-             print(f"[debug] Has images: {images is not None}")
-
          except Exception as e:
              print(f"[error] Tokenization failed: {e}")
-             # Fallback to text-only
-             input_ids = self.tokenizer(query_text, return_tensors="pt").input_ids
-             input_ids = input_ids.to(self.model.device)
-             images = None
-             image_sizes = None

-         # 4) Generation parameters (model-worker style)
          temperature = float(params.get("temperature", 0.0))
          top_p = float(params.get("top_p", 1.0))
          repetition_penalty = float(params.get("repetition_penalty", 1.0))
-         max_new_tokens = min(int(params.get("max_new_tokens", MAX_NEW_TOKENS_DEF)), 1024)
          do_sample = bool(params.get("do_sample", temperature > 0.001))
-
-         # Context length check
-         max_context_length = getattr(self.model.config, 'max_position_embeddings', 2048)
-         max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - 50)
-
          if max_new_tokens < 1:
              return [{"generated_text": "Error: Input too long, exceeds max token length."}]

-         # 5) Generation kwargs (model-worker style)
-         gen_kwargs = {
-             "inputs": input_ids,  # the model worker uses 'inputs'
              "max_new_tokens": max_new_tokens,
              "temperature": temperature,
              "top_p": top_p,
@@ -387,29 +427,37 @@ class EndpointHandler:
              "pad_token_id": self.tokenizer.eos_token_id,
          }

-         # Image args (model-worker style)
          if images is not None and image_sizes is not None:
              gen_kwargs["images"] = images
              gen_kwargs["image_sizes"] = image_sizes
-             print(f"[info] Using images in generation")
          else:
-             print("[info] Text-only generation")
-
          try:
              with torch.inference_mode():
                  output_ids = self.model.generate(**gen_kwargs)
-
-             # Separate the output from the input
-             if output_ids.shape[-1] > input_ids.shape[-1]:
-                 response_ids = output_ids[:, input_ids.shape[-1]:]
                  text = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True)[0].strip()
              else:
                  text = "Error: No response generated"
-
          except Exception as e:
              print(f"Generation error: {e}")
-             import traceback
-             traceback.print_exc()
              text = f"Error during generation: {str(e)}"
-
-         return [{"generated_text": text}]
 
  # -*- coding: utf-8 -*-
+ # handler.py — PULSE-7B / LLaVA robust endpoint
+ # - fetches the LLaVA source at runtime via git clone
+ # - image_processor fallback (AutoProcessor / vision_tower)
+ # - safe fallback from anyres -> pad
+ # - abstracts over the preprocess/__call__ difference
+ # - attention_mask made mandatory (fixes HF generate NoneType.new_ones)
+ # - forward patch (silently drops cache_position/input_positions)
+ # - robust image pipeline (pad_to_multiple, crop_size/shortest_edge detection)
+
  import os, io, sys, subprocess, base64
+ from typing import Any, Dict, List, Optional, Tuple

  import torch
  from PIL import Image
  import requests
  import math

  # ===== HF model id to use =====
  MODEL_ID = os.getenv("HF_MODEL_ID", "PULSE-ECG/PULSE-7B")

+ # Flash Attention / attention impl settings (used when available)
  os.environ.setdefault("FLASH_ATTENTION", "1")
  os.environ.setdefault("ATTN_IMPLEMENTATION", "flash_attention_2")

+ # ===== Fetch the LLaVA source at runtime (no pip available!) =====
  LLAVA_GIT_URL = os.getenv("LLAVA_GIT_URL", "https://github.com/haotian-liu/LLaVA.git")
+ LLAVA_GIT_REF = os.getenv("LLAVA_GIT_REF", "v1.2.2.post1")  # a stable release
  LLAVA_SRC_DIR = os.getenv("LLAVA_SRC_DIR", "/tmp/llava_src/LLaVA")

  def _ensure_llava():

  _ensure_llava()

+ # ---- LLaVA components ----

  from llava.model.builder import load_pretrained_model
  from llava.constants import (
      IMAGE_TOKEN_INDEX,
  from llava.conversation import conv_templates
  from llava.utils import disable_torch_init

+ # HF processor fallbacks
+ from transformers import AutoProcessor, AutoImageProcessor, CLIPImageProcessor
+
+ # ==========================
+ # Helper Functions
+ # ==========================
+
+ def get_model_name_from_path(model_path: str) -> str:
+     p = model_path.strip("/").split("/")
+     return (p[-2] + "_" + p[-1]) if p[-1].startswith("checkpoint-") else p[-1]
+
+ def load_image_from_base64(image: str) -> Image.Image:
+     return Image.open(io.BytesIO(base64.b64decode(image)))
+
+ def expand2square(pil_img: Image.Image, background_color: Tuple[int,int,int]) -> Image.Image:
+     w, h = pil_img.size
+     if w == h:
+         return pil_img
+     if w > h:
+         result = Image.new(pil_img.mode, (w, w), background_color); result.paste(pil_img, (0, (w - h)//2)); return result
+     result = Image.new(pil_img.mode, (h, h), background_color); result.paste(pil_img, ((h - w)//2, 0)); return result
+
+ def select_best_resolution(original_size: Tuple[int,int], possible_resolutions: List[Tuple[int,int]]) -> Tuple[int,int]:
+     ow, oh = original_size
+     best, max_eff, min_waste = None, 0, float("inf")
+     for W, H in possible_resolutions:
+         s = min(W/ow, H/oh)
+         dw, dh = int(ow*s), int(oh*s)
+         eff = min(dw*dh, ow*oh)
+         waste = (W*H) - eff
+         if (eff > max_eff) or (eff == max_eff and waste < min_waste):
+             max_eff, min_waste, best = eff, waste, (W, H)
+     return best
+
+ def resize_and_pad_image(image: Image.Image, target_resolution: Tuple[int,int]) -> Image.Image:
+     ow, oh = image.size
+     W, H = target_resolution
+     sw, sh = W/ow, H/oh
+     if sw < sh:
+         nw, nh = W, min(math.ceil(oh*sw), H)
+     else:
+         nh, nw = H, min(math.ceil(ow*sh), W)
+     resized = image.resize((nw, nh))
+     canvas = Image.new("RGB", (W, H), (0,0,0))
+     canvas.paste(resized, ((W - nw)//2, (H - nh)//2))
+     return canvas
+
+ def pad_to_multiple(image: Image.Image, multiple: int) -> Image.Image:
+     w, h = image.size
+     W = math.ceil(w / multiple) * multiple
+     H = math.ceil(h / multiple) * multiple
+     if (W, H) == (w, h):
+         return image
+     canvas = Image.new(image.mode, (W, H), (0,0,0))
+     canvas.paste(image, (0,0))
+     return canvas
+
+ def divide_to_patches(image: Image.Image, patch_size: int) -> List[Image.Image]:
+     patches = []
+     W, H = image.size
+     for y in range(0, H, patch_size):
+         for x in range(0, W, patch_size):
+             patches.append(image.crop((x, y, x+patch_size, y+patch_size)))
+     return patches
+
+ def _get_crop_size(processor: Any, default: int = 224) -> int:
+     cs = getattr(processor, "crop_size", None)
+     if cs is None:
+         sz = getattr(processor, "size", None)
+         if isinstance(sz, dict): return int(sz.get("shortest_edge", default))
+         if isinstance(sz, int): return int(sz)
+         return int(default)
+     if isinstance(cs, dict):
+         if "height" in cs: return int(cs["height"])
+         if "shortest_edge" in cs: return int(cs["shortest_edge"])
+         for v in cs.values(): return int(v)
+     return int(cs)
+
+ def _get_shortest_edge(processor: Any, fallback: Optional[int] = None) -> int:
+     sz = getattr(processor, "size", None)
+     if isinstance(sz, dict) and "shortest_edge" in sz: return int(sz["shortest_edge"])
+     if isinstance(sz, int): return int(sz)
+     return _get_crop_size(processor, default=(fallback or 224))
+
+ def _preprocess_one(processor: Any, img: Image.Image) -> torch.Tensor:
+     if hasattr(processor, "preprocess"):
+         out = processor.preprocess(img, return_tensors="pt")
+     else:
+         out = processor(img, return_tensors="pt")
+     return out["pixel_values"][0]
+
+ def process_anyres_image(image: Image.Image, processor: Any, grid_pinpoints: Any) -> torch.Tensor:
+     if isinstance(grid_pinpoints, list):
+         poss = grid_pinpoints
+     else:
+         import ast
+         poss = ast.literal_eval(grid_pinpoints)
+     patch_size = _get_crop_size(processor, 224)
+     shortest = _get_shortest_edge(processor, fallback=patch_size)
+     best = select_best_resolution(image.size, poss)
+     padded = resize_and_pad_image(image, best)
+     padded = pad_to_multiple(padded, patch_size)
+     patches = divide_to_patches(padded, patch_size)
+     resized_orig = image.resize((shortest, shortest))
+     tensors = [_preprocess_one(processor, resized_orig)] + [_preprocess_one(processor, p) for p in patches]
+     return torch.stack(tensors, dim=0)
+
+ def process_images(images: List[Image.Image], image_processor: Any, model_cfg: Any) -> torch.Tensor:
+     iar = getattr(model_cfg, "image_aspect_ratio", None) or getattr(model_cfg, "mm_image_aspect_ratio", None)
+     new_images: List[torch.Tensor] = []
+
+     if iar == "pad":
+         for img in images:
+             img_mean = getattr(image_processor, "image_mean", [0.5,0.5,0.5])
+             bg = tuple(int(x*255) for x in img_mean)
+             sq = expand2square(img, bg)
+             new_images.append(_preprocess_one(image_processor, sq))
+
+     elif iar == "anyres":
+         grid = getattr(model_cfg, "image_grid_pinpoints", "[(336,336)]")
+         for img in images:
+             new_images.append(process_anyres_image(img, image_processor, grid))
+
+     else:
+         # if the batched call fails, fall back to one-by-one
+         try:
+             out = image_processor(images, return_tensors="pt")
+             return out["pixel_values"]
+         except TypeError:
+             outs = [image_processor(im, return_tensors="pt") for im in images]
+             pix = [o["pixel_values"][0] for o in outs]
+             return torch.stack(pix, dim=0)
+
+     if all(x.shape == new_images[0].shape for x in new_images):
+         return torch.stack(new_images, dim=0)
+     return new_images
+
+ def tokenizer_image_token(prompt: str, tokenizer: Any, image_token_index: int = IMAGE_TOKEN_INDEX,
+                           return_tensors: Optional[str] = None):
+     chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+     def insert_sep(X, sep):
+         return [e for sub in zip(X, [sep]*len(X)) for e in sub][:-1]
+
+     ids: List[int] = []
+     offset = 0
+     if len(chunks) > 0 and len(chunks[0]) > 0 and chunks[0][0] == tokenizer.bos_token_id:
+         offset = 1
+         ids.append(chunks[0][0])
+
+     for x in insert_sep(chunks, [image_token_index]*(offset+1)):
+         ids.extend(x[offset:])
+
+     if return_tensors is not None:
+         if return_tensors == "pt":
+             return torch.tensor(ids, dtype=torch.long)
+         raise ValueError(f"Unsupported tensor type: {return_tensors}")
+     return ids
+
+ # ==========================
+ # Endpoint Handler
+ # ==========================

  class EndpointHandler:
      """
      Input:
+     {
+       "inputs": { "query": "...", "image": "<url|dataurl|path>" },
+       "parameters": {
+           "max_new_tokens": 256, "temperature": 0.0, "top_p": 1.0,
+           "repetition_penalty": 1.0, "do_sample": false, "use_cache": true
+       },
+       "conv_mode": "llava_v2"  # optional
+     }
      Output: [ { "generated_text": "..." } ]
      """
      def __init__(self, path: str = "") -> None:
          disable_torch_init()

+         # Model path priority: HF_MODEL_LOCAL_DIR > HF_MODEL_ID > MODEL_ID
          if os.getenv("HF_MODEL_LOCAL_DIR", "").strip():
              model_path = os.getenv("HF_MODEL_LOCAL_DIR").strip()
          elif os.getenv("HF_MODEL_ID", "").strip():
              model_path = os.getenv("HF_MODEL_ID").strip()
          else:
+             model_path = MODEL_ID
+
+         if not model_path:
+             raise RuntimeError("Could not determine the model path. Set HF_MODEL_LOCAL_DIR / HF_MODEL_ID / MODEL_ID.")

          self.model_name = get_model_name_from_path(model_path)

+         # Attention implementation (flash if available, otherwise sdpa)
          try:
+             import flash_attn  # noqa: F401
              attn_impl = "flash_attention_2"
+         except Exception:
              attn_impl = "sdpa"
+
+         # Load the model (LLaVA loader)
          self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
              model_path=model_path,
              model_base=None,

          )
          self.model.eval()

+         # ---- forward patch (HF 4.43+ argument compatibility) ----
          def _patch_forward(obj, label="model"):
              try:
+                 if not hasattr(obj, "forward"): return False
+                 orig = obj.forward
                  def patched_forward(*args, **kwargs):
                      kwargs.pop("cache_position", None)
                      kwargs.pop("input_positions", None)
+                     return orig(*args, **kwargs)
                  obj.forward = patched_forward
                  print(f"[hotfix] Patched forward on {label}")
                  return True

                  print(f"[warn] forward patch failed on {label}: {e}")
                  return False

          _patch_forward(self.model, "self.model")
+         if hasattr(self.model, "model"): _patch_forward(self.model.model, "self.model.model")
+         if hasattr(self.model, "base_model"): _patch_forward(self.model.base_model, "self.model.base_model")

+         # ---- image_processor fallback ----
+         if self.image_processor is None:
+             print("[hotfix] image_processor is None, trying the AutoProcessor fallback...")
+             try:
+                 proc = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+                 self.image_processor = getattr(proc, "image_processor", proc)
+             except Exception as e:
+                 print(f"[warn] AutoProcessor failed: {e}")
+                 vt = getattr(self.model.config, "vision_tower", None)
+                 if vt:
+                     try:
+                         self.image_processor = AutoImageProcessor.from_pretrained(vt, trust_remote_code=True)
+                     except Exception:
+                         self.image_processor = CLIPImageProcessor.from_pretrained(vt)
+
+         # anyres -> pad fallback (if the processor/crop_size is missing)
+         iar = getattr(self.model.config, "mm_image_aspect_ratio", None) or \
+               getattr(self.model.config, "image_aspect_ratio", None)
+         needs_crop = (self.image_processor is None) or (getattr(self.image_processor, "crop_size", None) is None)
+         if iar == "anyres" and needs_crop:
+             print("[hotfix] image_aspect_ratio:anyres -> pad (processor/crop_size missing)")
+             if hasattr(self.model.config, "image_aspect_ratio"):
+                 self.model.config.image_aspect_ratio = "pad"
+             if hasattr(self.model.config, "mm_image_aspect_ratio"):
+                 self.model.config.mm_image_aspect_ratio = "pad"
+
+         # multimodal flags
          self.use_im_start_end = getattr(self.model.config, "mm_use_im_start_end", False)
+         self.is_multimodal = 'llava' in self.model_name.lower() or 'pulse' in self.model_name.lower()
+
+         # Defaults
+         self.DEFAULT_CONV_MODE = os.getenv("LLAVA_CONV_MODE", "llava_v1")
+         self.MAX_NEW_TOKENS_DEF = int(os.getenv("MAX_NEW_TOKENS", "1024"))

+     # -------------------------
+     # Internal helpers
+     # -------------------------
      def _load_image(self, img_field: str) -> Optional[Image.Image]:
          """URL / base64 / path -> PIL.Image"""
+         if not img_field: return None
          try:
              if img_field.startswith("data:image"):
                  _, b64 = img_field.split(",", 1)
                  return Image.open(io.BytesIO(base64.b64decode(b64))).convert("RGB")
              if img_field.startswith(("http://", "https://")):
+                 r = requests.get(img_field, timeout=20); r.raise_for_status()
                  return Image.open(io.BytesIO(r.content)).convert("RGB")
              return Image.open(img_field).convert("RGB")
          except Exception as e:

              return None

      def _build_prompt(self, user_text: str, conv_mode: str) -> str:
+         """Build a prompt in LLaVA model-worker style."""
          if conv_mode not in conv_templates:
+             conv_mode = self.DEFAULT_CONV_MODE
          conv = conv_templates[conv_mode].copy()
          conv.append_message(conv.roles[0], user_text)
          conv.append_message(conv.roles[1], None)
          return conv.get_prompt()

+     # -------------------------
+     # Inference Entry
+     # -------------------------
      def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         inputs: Dict[str, Any] = data.get("inputs") or {}
+         params: Dict[str, Any] = data.get("parameters") or {}
+         conv_mode_req: Optional[str] = data.get("conv_mode")

+         conv_mode = conv_mode_req if conv_mode_req in conv_templates else self.DEFAULT_CONV_MODE
          query_text = inputs.get("query", "") or inputs.get("text", "") or inputs.get("prompt", "")
          image_f = inputs.get("image") or inputs.get("image_url") or inputs.get("image_base64")

+         # 1) Prompt
          prompt = self._build_prompt(query_text, conv_mode)
+
+         # 2) Image processing
          images = None
          image_sizes = None
          if image_f and self.is_multimodal:
              try:
                  pil_image = self._load_image(image_f)
+                 if pil_image is not None and self.image_processor is not None:
                      images_list = [pil_image]
                      image_sizes = [pil_image.size]
+
                      processed_images = process_images(images_list, self.image_processor, self.model.config)
+                     # tensor/list to device + dtype
                      if isinstance(processed_images, list):
                          images = [img.to(self.model.device, dtype=torch.float16) for img in processed_images]
                      else:
                          images = processed_images.to(self.model.device, dtype=torch.float16)
+
+                     # Add the image token + wrap with im_start/end
+                     prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt
                      replace_token = DEFAULT_IMAGE_TOKEN
                      if self.use_im_start_end:
                          replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
                      prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
+                     print("[info] Image processed successfully.")
                  else:
+                     print("[warn] Could not load image or image_processor is None.")
              except Exception as e:
                  print(f"[warn] Image processing failed: {e}")
+                 import traceback; traceback.print_exc()
+                 images = None; image_sizes = None

+         # 3) Tokenization (+ attention_mask)
          try:
              input_ids = tokenizer_image_token(
                  prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
              ).unsqueeze(0).to(self.model.device)
          except Exception as e:
              print(f"[error] Tokenization failed: {e}")
+             enc = self.tokenizer(query_text, return_tensors="pt")
+             input_ids = enc.input_ids.to(self.model.device)
+             images = None; image_sizes = None
+
+         attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)

+         # 4) Generation params
          temperature = float(params.get("temperature", 0.0))
          top_p = float(params.get("top_p", 1.0))
          repetition_penalty = float(params.get("repetition_penalty", 1.0))
+         max_new_tokens = min(int(params.get("max_new_tokens", self.MAX_NEW_TOKENS_DEF)), 1024)
          do_sample = bool(params.get("do_sample", temperature > 0.001))
+
+         # Context length limit (safety margin)
+         max_context_length = getattr(self.model.config, 'max_position_embeddings', 4096)
+         max_new_tokens = min(max_new_tokens, max(1, max_context_length - input_ids.shape[-1] - 50))
          if max_new_tokens < 1:
              return [{"generated_text": "Error: Input too long, exceeds max token length."}]

+         # 5) Gen kwargs
+         gen_kwargs: Dict[str, Any] = {
+             "inputs": input_ids,
+             "attention_mask": attention_mask,
              "max_new_tokens": max_new_tokens,
              "temperature": temperature,
              "top_p": top_p,

              "pad_token_id": self.tokenizer.eos_token_id,
          }

          if images is not None and image_sizes is not None:
              gen_kwargs["images"] = images
              gen_kwargs["image_sizes"] = image_sizes
+             print("[info] Using images in generation.")
          else:
+             # Strip any image tokens left in the prompt (text-only safety)
+             prompt_clean = prompt.replace(DEFAULT_IMAGE_TOKEN, "") \
+                                  .replace(DEFAULT_IM_START_TOKEN, "") \
+                                  .replace(DEFAULT_IM_END_TOKEN, "")
+             if prompt_clean != prompt:
+                 try:
+                     input_ids = self.tokenizer(prompt_clean, return_tensors="pt").input_ids.to(self.model.device)
+                     attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device)
+                     gen_kwargs["inputs"] = input_ids
+                     gen_kwargs["attention_mask"] = attention_mask
+                 except Exception as e:
+                     print(f"[warn] prompt cleanup failed: {e}")
+             print("[info] Text-only generation.")
+
+         # 6) Generate
          try:
              with torch.inference_mode():
                  output_ids = self.model.generate(**gen_kwargs)
+             if output_ids.shape[-1] > gen_kwargs["inputs"].shape[-1]:
+                 response_ids = output_ids[:, gen_kwargs["inputs"].shape[-1]:]
                  text = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True)[0].strip()
              else:
                  text = "Error: No response generated"
          except Exception as e:
              print(f"Generation error: {e}")
+             import traceback; traceback.print_exc()
              text = f"Error during generation: {str(e)}"
+
+         return [{"generated_text": text}]
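
For reference, a minimal smoke test of the updated handler, a sketch based only on the input/output schema documented in the EndpointHandler docstring above. The question text, image URL, and local import are illustrative assumptions and not part of this commit:

    # hypothetical local smoke test -- not part of this commit
    from handler import EndpointHandler  # assumes handler.py is importable

    handler = EndpointHandler()  # loads PULSE-ECG/PULSE-7B unless HF_MODEL_LOCAL_DIR / HF_MODEL_ID override it
    payload = {
        "inputs": {
            "query": "Describe the key findings in this ECG.",  # placeholder question
            "image": "https://example.com/ecg.png",             # URL, data URL, or local path
        },
        "parameters": {"max_new_tokens": 256, "temperature": 0.0, "do_sample": False},
        "conv_mode": "llava_v1",  # optional
    }
    result = handler(payload)  # -> [{"generated_text": "..."}]
    print(result[0]["generated_text"])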