CanerDedeoglu
/

Rapid_ECG

@@ -1,3 +1,6 @@
 import os
 import datetime
 import torch
@@ -9,7 +12,7 @@ import requests
 from PIL import Image
 from io import BytesIO
-# Try to import cv2, but make it optional
 try:
     import cv2
     CV2_AVAILABLE = True
@@ -17,7 +20,7 @@ except ImportError:
     CV2_AVAILABLE = False
     print("Warning: cv2 (OpenCV) not available. Video processing will be disabled.")
-# Try to import llava modules, but make them optional
 try:
     from llava import conversation as conversation_lib
     from llava.constants import DEFAULT_IMAGE_TOKEN
@@ -41,15 +44,15 @@ except ImportError as e:
     LLAVA_AVAILABLE = False
     print(f"Warning: LLaVA modules not available: {e}")
-# Try to import transformers
 try:
-    from transformers import TextStreamer, TextIteratorStreamer
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
     print("Warning: Transformers not available")
-# Try to import huggingface_hub
 try:
     from huggingface_hub import HfApi, login
     HF_HUB_AVAILABLE = True
@@ -57,7 +60,7 @@ except ImportError:
     HF_HUB_AVAILABLE = False
     print("Warning: Hugging Face Hub not available")
-# Initialize Hugging Face API
 if HF_HUB_AVAILABLE and "HF_TOKEN" in os.environ:
     try:
         login(token=os.environ["HF_TOKEN"], write_permission=True)
@@ -71,21 +74,23 @@ else:
     api = None
     repo_name = ""
 external_log_dir = "./logs"
 LOGDIR = external_log_dir
 VOTEDIR = "./votes"
-# Global variables for model and tokenizer
 tokenizer = None
 model = None
 image_processor = None
 context_len = None
 args = None
 def get_conv_log_filename():
     t = datetime.datetime.now()
-    name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-user_conv.json")
-    return name
 def get_conv_vote_filename():
     t = datetime.datetime.now()
@@ -98,13 +103,7 @@ def vote_last_response(state, vote_type, model_selector):
     if api and repo_name:
         try:
             with open(get_conv_vote_filename(), "a") as fout:
-                data = {
-                    "type": vote_type,
-                    "model": model_selector,
-                    "state": state,
-                }
-                fout.write(json.dumps(data) + "\n")
             api.upload_file(
                 path_or_fileobj=get_conv_vote_filename(),
                 path_in_repo=get_conv_vote_filename().replace("./votes/", ""),
@@ -115,93 +114,48 @@ def vote_last_response(state, vote_type, model_selector):
 def is_valid_video_filename(name):
     if not CV2_AVAILABLE:
-        return False  # Video processing disabled
-    video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
-    ext = name.split(".")[-1].lower()
-    return ext in video_extensions
 def is_valid_image_filename(name):
-    image_extensions = ["jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp", "heic", "heif", "jfif", "svg", "eps", "raw"]
-    ext = name.split(".")[-1].lower()
-    return ext in image_extensions
-def sample_frames(video_file, num_frames):
-    if not CV2_AVAILABLE:
-        raise ImportError("cv2 (OpenCV) not available. Video processing is disabled.")
-    video = cv2.VideoCapture(video_file)
-    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-    interval = total_frames // num_frames
-    frames = []
-    for i in range(total_frames):
-        ret, frame = video.read()
-        if not ret:
-            continue
-        if i % interval == 0:
-            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-            frames.append(pil_img)
-    video.release()
-    return frames
 def load_image(image_file):
-    if image_file.startswith("http") or image_file.startswith("https"):
-        response = requests.get(image_file)
-        if response.status_code == 200:
-            image = Image.open(BytesIO(response.content)).convert("RGB")
-        else:
-            raise ValueError("Failed to load image from URL")
-    else:
-        print("Load image from local file")
-        print(image_file)
-        image = Image.open(image_file).convert("RGB")
-    return image
 def process_base64_image(base64_string):
-    """Process base64 encoded image string"""
-    try:
-        # Remove data URL prefix if present
-        if base64_string.startswith('data:image'):
-            base64_string = base64_string.split(',')[1]
-        # Decode base64 to bytes
-        image_data = base64.b64decode(base64_string)
-        # Convert to PIL Image
-        image = Image.open(BytesIO(image_data)).convert("RGB")
-        return image
-    except Exception as e:
-        raise ValueError(f"Failed to process base64 image: {e}")
 def process_image_input(image_input):
-    """Process different types of image input (file path, URL, or base64)"""
     if isinstance(image_input, str):
         if image_input.startswith("http"):
             return load_image(image_input)
         elif os.path.exists(image_input):
             return load_image(image_input)
         else:
-            # Try to process as base64
             return process_base64_image(image_input)
     elif isinstance(image_input, dict) and "image" in image_input:
-        # Handle base64 image from dict
         return process_base64_image(image_input["image"])
     else:
         raise ValueError("Unsupported image input format")
 class InferenceDemo(object):
     def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
         if not LLAVA_AVAILABLE:
             raise ImportError("LLaVA modules not available")
         disable_torch_init()
         self.tokenizer, self.model, self.image_processor, self.context_len = (
-            tokenizer,
-            model,
-            image_processor,
-            context_len,
         )
         model_name = get_model_name_from_path(model_path)
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"
@@ -213,30 +167,22 @@ class InferenceDemo(object):
             conv_mode = "qwen_1_5"
         else:
             conv_mode = "llava_v0"
         if args.conv_mode is not None and conv_mode != args.conv_mode:
-            print(
-                "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
-                    conv_mode, args.conv_mode, args.conv_mode
-                )
-            )
         else:
             args.conv_mode = conv_mode
-        self.conv_mode = conv_mode
-        self.conversation = conv_templates[args.conv_mode].copy()
         self.num_frames = args.num_frames
 class ChatSessionManager:
     def __init__(self):
         self.chatbot_instance = None
     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
     def reset_chatbot(self):
         self.chatbot_instance = None
     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         if self.chatbot_instance is None:
             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
@@ -245,237 +191,196 @@ class ChatSessionManager:
 chat_manager = ChatSessionManager()
 def clear_history():
-    """Clear conversation history"""
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
     try:
-        chatbot_instance = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
-        try:
-            if hasattr(chatbot_instance, 'conv_mode') and chatbot_instance.conv_mode and LLAVA_AVAILABLE:
-                chatbot_instance.conversation = conv_templates[chatbot_instance.conv_mode].copy()
-            else:
-                # Use default conversation template
-                chatbot_instance.conversation = chatbot_instance.conversation.__class__()
-        except Exception as e:
-            print(f"[DEBUG] Failed to reset conversation in clear_history: {e}")
         return {"status": "success", "message": "Conversation history cleared"}
     except Exception as e:
         return {"error": f"Failed to clear history: {str(e)}"}
-def add_message(message_text, image_input=None):
-    """Add a message to the conversation"""
-    return {"status": "success", "message": "Message added"}
-def generate_response(message_text, image_input, temperature=0.05, top_p=1.0, max_output_tokens=4096, repetition_penalty=1.0, conv_mode_override=None):
-    """Generate response for the given message and image"""
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
     try:
         if not message_text or not image_input:
             return {"error": "Both message text and image are required"}
-        our_chatbot = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
-        # Process image input
-        try:
-            image = process_image_input(image_input)
-        except Exception as e:
-            return {"error": f"Failed to process image: {str(e)}"}
-        # Save image for logging
-        all_image_hash = []
-        all_image_path = []
-        # Generate hash for the image
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format='JPEG')
-        img_byte_arr = img_byte_arr.getvalue()
-        image_hash = hashlib.md5(img_byte_arr).hexdigest()
-        all_image_hash.append(image_hash)
         # Save image to logs
         t = datetime.datetime.now()
-        filename = os.path.join(
-            LOGDIR,
-            "serve_images",
-            f"{t.year}-{t.month:02d}-{t.day:02d}",
-            f"{image_hash}.jpg",
-        )
-        all_image_path.append(filename)
-        if not os.path.isfile(filename):
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-            print("image save to", filename)
-            image.save(filename)
-        # Process image for model
-        try:
-            print(f"[DEBUG] Processing image for model...")
-            processed_images = process_images([image], our_chatbot.image_processor, our_chatbot.model.config)
-            print(f"[DEBUG] Processed images length: {len(processed_images)}")
-            if len(processed_images) == 0:
-                return {"error": "Image processing returned empty list"}
-            image_tensor = processed_images[0]
-            image_tensor = image_tensor.half().to(our_chatbot.model.device)
-            image_tensor = image_tensor.unsqueeze(0)
-            print(f"[DEBUG] Image tensor shape: {image_tensor.shape}")
-        except Exception as e:
-            print(f"[DEBUG] Image processing error: {str(e)}")
-            return {"error": f"Image processing failed: {str(e)}"}
-        # Prepare conversation - reset for each request to avoid history issues
-        try:
-            if hasattr(our_chatbot, 'conv_mode') and our_chatbot.conv_mode and LLAVA_AVAILABLE:
-                our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
-            else:
-                # Use default conversation template
-                our_chatbot.conversation = our_chatbot.conversation.__class__()
-        except Exception as e:
-            print(f"[DEBUG] Failed to reset conversation: {e}")
-            # Continue with existing conversation
         inp = DEFAULT_IMAGE_TOKEN + "\n" + message_text
-        our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
-        our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
-        prompt = our_chatbot.conversation.get_prompt()
-        # Tokenize input
-        input_ids = tokenizer_image_token(
-            prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
-        ).unsqueeze(0).to(our_chatbot.model.device)
-        # No stopping criteria - let model generate freely up to max_new_tokens
-        print(f"[DEBUG] No stopping criteria - free generation up to {max_output_tokens} tokens")
         stopping_criteria = None
-        # Generate response
         with torch.no_grad():
-            outputs = our_chatbot.model.generate(
                 inputs=input_ids,
                 images=image_tensor,
-                do_sample=False,
-                temperature=temperature,
-                top_p=top_p,
-                max_new_tokens=max_output_tokens,
-                repetition_penalty=repetition_penalty,
-                use_cache=False,
-                pad_token_id=our_chatbot.tokenizer.eos_token_id,
-                eos_token_id=our_chatbot.tokenizer.eos_token_id,
-                length_penalty=1.0,  # Don't penalize longer sequences
             )
-        # Decode response
-        try:
-            print(f"[DEBUG] Outputs shape: {outputs.shape if hasattr(outputs, 'shape') else 'No shape attr'}")
-            print(f"[DEBUG] Outputs length: {len(outputs) if hasattr(outputs, '__len__') else 'No length'}")
-            print(f"[DEBUG] Input IDs shape: {input_ids.shape}")
-            if len(outputs) == 0:
-                return {"error": "Model generated empty output"}
-            response = our_chatbot.tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
-            print(f"[DEBUG] Conversation messages length: {len(our_chatbot.conversation.messages)}")
-            if len(our_chatbot.conversation.messages) > 0:
-                last_message = our_chatbot.conversation.messages[-1]
-                print(f"[DEBUG] Last message: {last_message}")
-                if isinstance(last_message, list) and len(last_message) > 1:
-                    our_chatbot.conversation.messages[-1][-1] = response
-                    print(f"[DEBUG] Response added to conversation")
-                else:
-                    print(f"[DEBUG] Last message format unexpected: {last_message}")
-                    # Add response as new message if format is wrong
-                    our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
-            else:
-                print("[DEBUG] No conversation messages found")
-                # Add response as new message
-                our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
-            print(f"[DEBUG] Generated response length: {len(response)}")
-        except Exception as e:
-            print(f"[DEBUG] Response decoding error: {str(e)}")
-            return {"error": f"Response decoding failed: {str(e)}"}
-        # Log conversation
-        history = [(message_text, response)]
         with open(get_conv_log_filename(), "a") as fout:
-            data = {
                 "type": "chat",
                 "model": "PULSE-7b",
-                "state": history,
-                "images": all_image_hash,
-                "images_path": all_image_path
-            }
-            print("#### conv log", data)
-            fout.write(json.dumps(data) + "\n")
-        # Upload files to Hugging Face if configured
-        if api and repo_name:
-            try:
-                for upload_img in all_image_path:
-                    api.upload_file(
-                        path_or_fileobj=upload_img,
-                        path_in_repo=upload_img.replace("./logs/", ""),
-                        repo_id=repo_name,
-                        repo_type="dataset",
-                    )
-                # Upload conversation log
-                api.upload_file(
-                    path_or_fileobj=get_conv_log_filename(),
-                    path_in_repo=get_conv_log_filename().replace("./logs/", ""),
-                    repo_id=repo_name,
-                    repo_type="dataset")
-            except Exception as e:
-                print(f"Failed to upload files: {e}")
-        return {
-            "status": "success",
-            "response": response,
-            "conversation_id": id(our_chatbot.conversation)
-        }
     except Exception as e:
         return {"error": f"Generation failed: {str(e)}"}
 def upvote_last_response(conversation_id):
-    """Upvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "upvote", "PULSE-7B")
-        return {"status": "success", "message": "Thank you for your voting!"}
     except Exception as e:
-        return {"error": f"Failed to upvote: {str(e)}"}
 def downvote_last_response(conversation_id):
-    """Downvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "downvote", "PULSE-7B")
-        return {"status": "success", "message": "Thank you for your voting!"}
     except Exception as e:
-        return {"error": f"Failed to downvote: {str(e)}"}
 def flag_response(conversation_id):
-    """Flag the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "flag", "PULSE-7B")
-        return {"status": "success", "message": "Response flagged successfully"}
     except Exception as e:
-        return {"error": f"Failed to flag response: {str(e)}"}
-# Initialize model when module is imported
 def initialize_model():
-    """Initialize the model and tokenizer"""
     global tokenizer, model, image_processor, context_len, args
     if not LLAVA_AVAILABLE:
         print("LLaVA modules not available, skipping model initialization")
         return False
     try:
-        # Set default arguments
         class Args:
             def __init__(self):
                 self.model_path = "PULSE-ECG/PULSE-7B"
@@ -488,95 +393,93 @@ def initialize_model():
                 self.load_8bit = False
                 self.load_4bit = False
                 self.debug = False
         args = Args()
-        # Load model
-        model_path = args.model_path
         model_name = get_model_name_from_path(args.model_path)
-        tokenizer, model, image_processor, context_len = load_pretrained_model(
             args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit
         )
-        print("### image_processor", image_processor)
-        print("### tokenizer", tokenizer)
-        # Move model to GPU if available
         if torch.cuda.is_available():
             model = model.to(torch.device('cuda'))
             print("Model moved to CUDA")
         else:
             print("CUDA not available, using CPU")
         return True
     except Exception as e:
         print(f"Failed to initialize model: {e}")
         return False
-# Don't initialize model on import - do it lazily
-model_initialized = False
-# Main endpoint function for Hugging Face
 def query(payload):
-    """Main endpoint function for Hugging Face inference API"""
     global model_initialized
-    # Lazy initialization - initialize model on first call
     if not model_initialized:
         print("Initializing model on first query...")
         model_initialized = initialize_model()
         if not model_initialized:
             return {"error": "Model initialization failed"}
     try:
         print(f"[DEBUG] query payload keys={list(payload.keys()) if hasattr(payload,'keys') else 'N/A'}")
-        # Extract prompt with multiple possible keys
-        message_text = (payload.get("message") or
-                       payload.get("query") or
-                       payload.get("prompt") or
-                       payload.get("istem") or "")
-        # Extract image with multiple possible keys
-        image_input = (payload.get("image") or
-                      payload.get("image_url") or
-                      payload.get("img") or None)
-        # Extract generation parameters with fallbacks
         temperature = float(payload.get("temperature", 0.05))
         top_p = float(payload.get("top_p", 1.0))
-        max_output_tokens = int(payload.get("max_output_tokens",
-                               payload.get("max_new_tokens",
-                               payload.get("max_tokens", 8192))))
         repetition_penalty = float(payload.get("repetition_penalty", 1.0))
         conv_mode_override = payload.get("conv_mode", None)
-        if not message_text or not message_text.strip():
-            return {"error": "Missing prompt text. Use 'message', 'query', 'prompt', or 'istem' key"}
         if not image_input:
-            return {"error": "Missing image. Use 'image', 'image_url', or 'img' key"}
-        # Generate response with all parameters
-        result = generate_response(
             message_text=message_text,
             image_input=image_input,
             temperature=temperature,
             top_p=top_p,
             max_output_tokens=max_output_tokens,
             repetition_penalty=repetition_penalty,
-            conv_mode_override=conv_mode_override
         )
-        return result
     except Exception as e:
         return {"error": f"Query failed: {str(e)}"}
-# Additional utility endpoints
 def health_check():
-    """Health check endpoint"""
     return {
         "status": "healthy",
         "model_initialized": model_initialized,
@@ -584,18 +487,12 @@ def health_check():
         "llava_available": LLAVA_AVAILABLE,
         "transformers_available": TRANSFORMERS_AVAILABLE,
         "cv2_available": CV2_AVAILABLE,
-        "lazy_loading": True  # Model will be loaded on first query
     }
 def get_model_info():
-    """Get model information"""
     if not model_initialized:
-        return {
-            "error": "Model not initialized yet",
-            "lazy_loading": True,
-            "note": "Model will be loaded on first query"
-        }
     return {
         "model_path": args.model_path if args else "Unknown",
         "model_type": "PULSE-7B",
@@ -603,37 +500,19 @@ def get_model_info():
         "device": str(model.device) if model else "Unknown"
     }
-# Hugging Face EndpointHandler class
 class EndpointHandler:
-    """Hugging Face endpoint handler class"""
     def __init__(self, model_dir):
-        """Initialize the endpoint handler"""
         self.model_dir = model_dir
         print(f"EndpointHandler initialized with model_dir: {model_dir}")
     def __call__(self, payload):
-        """Main endpoint function - handles Hugging Face payload format"""
-        # Hugging Face sends payload in "inputs" wrapper
         if "inputs" in payload:
-            # Extract the actual payload from inputs wrapper
-            actual_payload = payload["inputs"]
-            return query(actual_payload)
-        else:
-            # Direct payload (for backward compatibility)
-            return query(payload)
     def health_check(self):
-        """Health check endpoint"""
         return health_check()
     def get_model_info(self):
-        """Get model information"""
         return get_model_info()
-# For backward compatibility and testing
 if __name__ == "__main__":
-    print("Handler module loaded successfully!")
-    print("This handler is now ready for Hugging Face endpoints.")
-    print("Use the 'query' function as the main endpoint.")
-    print("Or use EndpointHandler class for Hugging Face compatibility.")

+# -*- coding: utf-8 -*-
+# handler.py — PULSE-7B / LLaVA endpoint (robust + deterministic-ready)
 import os
 import datetime
 import torch
 from PIL import Image
 from io import BytesIO
+# Optional cv2
 try:
     import cv2
     CV2_AVAILABLE = True
     CV2_AVAILABLE = False
     print("Warning: cv2 (OpenCV) not available. Video processing will be disabled.")
+# LLaVA stack
 try:
     from llava import conversation as conversation_lib
     from llava.constants import DEFAULT_IMAGE_TOKEN
     LLAVA_AVAILABLE = False
     print(f"Warning: LLaVA modules not available: {e}")
+# Transformers
 try:
+    from transformers import GenerationConfig
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
     print("Warning: Transformers not available")
+# HF Hub (optional)
 try:
     from huggingface_hub import HfApi, login
     HF_HUB_AVAILABLE = True
     HF_HUB_AVAILABLE = False
     print("Warning: Hugging Face Hub not available")
+# HF Hub init (optional)
 if HF_HUB_AVAILABLE and "HF_TOKEN" in os.environ:
     try:
         login(token=os.environ["HF_TOKEN"], write_permission=True)
     api = None
     repo_name = ""
+# Logs
 external_log_dir = "./logs"
 LOGDIR = external_log_dir
 VOTEDIR = "./votes"
+# Globals
 tokenizer = None
 model = None
 image_processor = None
 context_len = None
 args = None
+model_initialized = False
+# ----- Utils -----
 def get_conv_log_filename():
     """Return the path of today's conversation log inside ``LOGDIR``.

     The file name has the form ``<year>-<MM>-<DD>-user_conv.json`` and is
     built from the current local date.
     """
     now = datetime.datetime.now()
     log_name = "{:d}-{:02d}-{:02d}-user_conv.json".format(now.year, now.month, now.day)
     return os.path.join(LOGDIR, log_name)
 def get_conv_vote_filename():
     t = datetime.datetime.now()
     if api and repo_name:
         try:
             with open(get_conv_vote_filename(), "a") as fout:
+                fout.write(json.dumps({"type": vote_type, "model": model_selector, "state": state}) + "\n")
             api.upload_file(
                 path_or_fileobj=get_conv_vote_filename(),
                 path_in_repo=get_conv_vote_filename().replace("./votes/", ""),
 def is_valid_video_filename(name):
     """Return True when ``name`` ends in a supported video extension.

     Always returns False while ``CV2_AVAILABLE`` is falsy. The comparison is
     case-insensitive and requires a real ``.`` separator, so a bare name
     such as ``"mp4"`` is not mistaken for a video file.

     Args:
         name: File name or path to check.

     Returns:
         bool: Whether the extension is one of the recognised video types.
     """
     if not CV2_AVAILABLE:
         return False
     # rpartition leaves ``dot`` empty when the name contains no "." at all.
     _, dot, ext = name.rpartition(".")
     return bool(dot) and ext.lower() in {"avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"}
 def is_valid_image_filename(name):
     """Return True when ``name`` ends in a supported image extension.

     The comparison is case-insensitive and requires a real ``.`` separator,
     so a bare name such as ``"png"`` is not mistaken for an image file.

     Args:
         name: File name or path to check.

     Returns:
         bool: Whether the extension is one of the recognised image types.
     """
     # rpartition leaves ``dot`` empty when the name contains no "." at all.
     _, dot, ext = name.rpartition(".")
     return bool(dot) and ext.lower() in {
         "jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp",
         "heic", "heif", "jfif", "svg", "eps", "raw",
     }
 def load_image(image_file, timeout=30):
     """Load an image from an HTTP(S) URL or a local path and convert it to RGB.

     Args:
         image_file: Either an ``http://`` / ``https://`` URL or a path that
             ``Image.open`` can read.
         timeout: Seconds passed to ``requests.get`` so a stalled server
             cannot block the request forever.

     Returns:
         The image produced by ``Image.open(...).convert("RGB")``.

     Raises:
         ValueError: If the URL answers with a status code other than 200.
     """
     # Match the full scheme so local files such as "http_scan.png" are not
     # treated as URLs.
     if image_file.startswith(("http://", "https://")):
         response = requests.get(image_file, timeout=timeout)
         if response.status_code != 200:
             raise ValueError("Failed to load image from URL")
         return Image.open(BytesIO(response.content)).convert("RGB")
     return Image.open(image_file).convert("RGB")
 def process_base64_image(base64_string):
     """Decode a base64 string into an RGB image.

     The input may be raw base64 or a ``data:image...`` URL; for a data URL
     only the part after the first ``,`` is decoded.

     Args:
         base64_string: Base64 payload, optionally prefixed by a data-URL header.

     Returns:
         The image produced by ``Image.open(...).convert("RGB")``.

     Raises:
         ValueError: If a ``data:image`` URL has no ``,`` separating the
             header from the payload.
     """
     if base64_string.startswith("data:image"):
         _header, comma, payload = base64_string.partition(",")
         if not comma:
             # Report malformed input as ValueError instead of a bare IndexError.
             raise ValueError("Malformed data URL: missing ',' before the base64 payload")
         base64_string = payload
     image_data = base64.b64decode(base64_string)
     return Image.open(BytesIO(image_data)).convert("RGB")
 def process_image_input(image_input):
     """Turn a supported image input into an image object.

     Accepted inputs:
       * a string starting with ``http`` or naming an existing path, which is
         passed to ``load_image``;
       * any other string, which is passed to ``process_base64_image``;
       * a dict with an ``"image"`` key, whose value is passed to
         ``process_base64_image``.

     Raises:
         ValueError: For any other input type or shape.
     """
     if isinstance(image_input, dict) and "image" in image_input:
         return process_base64_image(image_input["image"])
     if not isinstance(image_input, str):
         raise ValueError("Unsupported image input format")

     # NOTE(review): a string that names an existing local path is opened from
     # disk; confirm this is intended when payloads come from untrusted callers.
     if image_input.startswith("http") or os.path.exists(image_input):
         return load_image(image_input)
     return process_base64_image(image_input)
+# ----- Chat session -----
 class InferenceDemo(object):
     def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
         if not LLAVA_AVAILABLE:
             raise ImportError("LLaVA modules not available")
         disable_torch_init()
         self.tokenizer, self.model, self.image_processor, self.context_len = (
+            tokenizer, model, image_processor, context_len
         )
         model_name = get_model_name_from_path(model_path)
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"
             conv_mode = "qwen_1_5"
         else:
             conv_mode = "llava_v0"
         if args.conv_mode is not None and conv_mode != args.conv_mode:
+            print(f"[WARNING] auto inferred conv_mode={conv_mode}, using {args.conv_mode}")
         else:
             args.conv_mode = conv_mode
+        self.conv_mode = args.conv_mode
+        self.conversation = conv_templates[self.conv_mode].copy()
         self.num_frames = args.num_frames
 class ChatSessionManager:
     def __init__(self):
         self.chatbot_instance = None
     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
     def reset_chatbot(self):
         self.chatbot_instance = None
     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         if self.chatbot_instance is None:
             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
 chat_manager = ChatSessionManager()
 def clear_history():
     """Replace the shared chatbot's conversation with a fresh one.

     The chatbot is obtained from ``chat_manager``. When it has a truthy
     ``conv_mode`` that is present in ``conv_templates``, the conversation
     becomes a copy of that template; otherwise a new instance of the current
     conversation's class is created.

     Returns:
         dict: ``{"status": "success", "message": ...}`` on success, or
         ``{"error": ...}`` when LLaVA is unavailable or any step raises.
     """
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
     try:
         model_path = args.model_path if args else "PULSE-ECG/PULSE-7B"
         chatbot = chat_manager.get_chatbot(
             args, model_path, tokenizer, model, image_processor, context_len
         )
         conv_mode = getattr(chatbot, "conv_mode", None)
         if not conv_mode or conv_mode not in conv_templates:
             # No usable template name: rebuild from the conversation's own class.
             chatbot.conversation = chatbot.conversation.__class__()
         else:
             chatbot.conversation = conv_templates[conv_mode].copy()
         return {"status": "success", "message": "Conversation history cleared"}
     except Exception as exc:
         return {"error": f"Failed to clear history: {str(exc)}"}
+# ----- Robust prefix stripper -----
+def _strip_prefix_relaxed(text: str, prefix: str) -> str:
+    try:
+        if text.startswith(prefix):
+            return text[len(prefix):]
+        t_norm = " ".join(text.split())
+        p_norm = " ".join(prefix.split())
+        if t_norm.startswith(p_norm):
+            idx = text.find(prefix.splitlines()[0]) if prefix.splitlines() else -1
+            if idx >= 0:
+                return text[idx + len(prefix.splitlines()[0]):]
+    except Exception:
+        pass
+    return text
+# ----- Core generate -----
+def generate_response(message_text,
+                      image_input,
+                      temperature=0.05,
+                      top_p=1.0,
+                      max_output_tokens=1024,
+                      repetition_penalty=1.0,
+                      conv_mode_override=None,
+                      do_sample=False,        # default greedy -> deterministik
+                      seed=None,
+                      use_stop=True):
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
     try:
         if not message_text or not image_input:
             return {"error": "Both message text and image are required"}
+        # Determinism knobs
+        if seed is not None:
+            try:
+                seed = int(seed)
+                torch.manual_seed(seed)
+                np.random.seed(seed)
+            except Exception:
+                pass
+        inst = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B",
+                                        tokenizer, model, image_processor, context_len)
+        # Image
+        image = process_image_input(image_input)
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format='JPEG')
+        image_hash = hashlib.md5(img_byte_arr.getvalue()).hexdigest()
         # Save image to logs
         t = datetime.datetime.now()
+        filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}.jpg")
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        image.save(filename)
+        # Preprocess
+        processed_images = process_images([image], inst.image_processor, inst.model.config)
+        if len(processed_images) == 0:
+            return {"error": "Image processing returned empty list"}
+        image_tensor = processed_images[0].half().to(inst.model.device).unsqueeze(0)
+        # Conversation
+        if conv_mode_override:
+            inst.conversation = conv_templates[conv_mode_override].copy()
+        else:
+            inst.conversation = conv_templates[inst.conv_mode].copy()
         inp = DEFAULT_IMAGE_TOKEN + "\n" + message_text
+        inst.conversation.append_message(inst.conversation.roles[0], inp)
+        inst.conversation.append_message(inst.conversation.roles[1], None)
+        prompt = inst.conversation.get_prompt()
+        # Tokenize
+        input_ids = tokenizer_image_token(prompt, inst.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(inst.model.device)
+        # Stop criteria
         stopping_criteria = None
+        stop_str = inst.conversation.sep if inst.conversation.sep_style != SeparatorStyle.TWO else inst.conversation.sep2
+        if use_stop:
+            stopping_criteria = KeywordsStoppingCriteria([stop_str], inst.tokenizer, input_ids)
+        # PAD/EOS safety
+        pad_id = inst.tokenizer.pad_token_id
+        eos_id = inst.tokenizer.eos_token_id if inst.tokenizer.eos_token_id is not None else pad_id
+        if pad_id is None:
+            # safety net (rare)
+            inst.tokenizer.add_special_tokens({"pad_token": inst.tokenizer.eos_token or "</s>"})
+            pad_id = inst.tokenizer.pad_token_id
+            eos_id = inst.tokenizer.eos_token_id or pad_id
+        gen_cfg = GenerationConfig(
+            do_sample=bool(do_sample),
+            temperature=float(temperature),
+            top_p=float(top_p),
+            max_new_tokens=int(max_output_tokens),
+            repetition_penalty=float(repetition_penalty),
+            pad_token_id=pad_id,
+            eos_token_id=eos_id
+        )
         with torch.no_grad():
+            outputs = inst.model.generate(
                 inputs=input_ids,
                 images=image_tensor,
+                generation_config=gen_cfg,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria] if stopping_criteria is not None else None,
+                return_dict_in_generate=True
             )
+        # Robust decode
+        sequences = outputs.sequences
+        gen_ids = sequences[0]
+        full_text = inst.tokenizer.decode(gen_ids, skip_special_tokens=True)
+        prompt_text = inst.tokenizer.decode(input_ids[0], skip_special_tokens=True)
+        if gen_ids.shape[0] > input_ids.shape[1]:
+            response = inst.tokenizer.decode(gen_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        else:
+            response = _strip_prefix_relaxed(full_text, prompt_text).strip()
+        if not response:
+            response = full_text.replace(stop_str, "").strip()
+        # Add to conversation
+        if len(inst.conversation.messages) > 0 and isinstance(inst.conversation.messages[-1], list) and len(inst.conversation.messages[-1]) > 1:
+            inst.conversation.messages[-1][-1] = response
+        else:
+            inst.conversation.append_message(inst.conversation.roles[1], response)
+        # Log
         with open(get_conv_log_filename(), "a") as fout:
+            fout.write(json.dumps({
                 "type": "chat",
                 "model": "PULSE-7b",
+                "state": [(message_text, response)],
+                "images": [image_hash],
+                "images_path": [filename]
+            }) + "\n")
+        return {"status": "success", "response": response, "conversation_id": id(inst.conversation)}
     except Exception as e:
         return {"error": f"Generation failed: {str(e)}"}
+# ----- Votes -----
 def upvote_last_response(conversation_id):
     """Record an "upvote" for the given conversation.

     Args:
         conversation_id: Identifier forwarded to ``vote_last_response`` inside
             a ``{"conversation_id": ...}`` state dict.

     Returns:
         dict: ``{"status": "success", "message": "Upvoted"}`` when the vote
         call completes, otherwise ``{"error": <exception text>}``.
     """
     vote_state = {"conversation_id": conversation_id}
     try:
         vote_last_response(vote_state, "upvote", "PULSE-7B")
     except Exception as exc:
         # Failures are reported in the response instead of being raised.
         return {"error": str(exc)}
     return {"status": "success", "message": "Upvoted"}
 def downvote_last_response(conversation_id):
     """Record a "downvote" for the given conversation.

     Args:
         conversation_id: Identifier forwarded to ``vote_last_response`` inside
             a ``{"conversation_id": ...}`` state dict.

     Returns:
         dict: ``{"status": "success", "message": "Downvoted"}`` when the vote
         call completes, otherwise ``{"error": <exception text>}``.
     """
     vote_state = {"conversation_id": conversation_id}
     try:
         vote_last_response(vote_state, "downvote", "PULSE-7B")
     except Exception as exc:
         # Failures are reported in the response instead of being raised.
         return {"error": str(exc)}
     return {"status": "success", "message": "Downvoted"}
 def flag_response(conversation_id):
     """Record a "flag" for the given conversation.

     Args:
         conversation_id: Identifier forwarded to ``vote_last_response`` inside
             a ``{"conversation_id": ...}`` state dict.

     Returns:
         dict: ``{"status": "success", "message": "Flagged"}`` when the vote
         call completes, otherwise ``{"error": <exception text>}``.
     """
     vote_state = {"conversation_id": conversation_id}
     try:
         vote_last_response(vote_state, "flag", "PULSE-7B")
     except Exception as exc:
         # Failures are reported in the response instead of being raised.
         return {"error": str(exc)}
     return {"status": "success", "message": "Flagged"}
+# ----- Init model (with PAD/EOS safety) -----
 def initialize_model():
     """Load the PULSE-7B checkpoint and publish it through the module globals.

     On success the module-level ``tokenizer``, ``model``, ``image_processor``,
     ``context_len`` and ``args`` are rebound to the freshly loaded objects, and
     the model is moved to CUDA when ``torch.cuda.is_available()`` is true.

     Returns:
         bool: True when loading completed; False when the LLaVA imports are
         unavailable or any step raised (the exception is printed, not
         propagated).
     """
     global tokenizer, model, image_processor, context_len, args
     if not LLAVA_AVAILABLE:
         print("LLaVA modules not available, skipping model initialization")
         return False
     try:
         class Args:
             def __init__(self):
                 self.model_path = "PULSE-ECG/PULSE-7B"
                 # args.model_base is passed to load_pretrained_model() below.
                 # Without this attribute that lookup raised AttributeError and
                 # the broad except turned every initialization into False.
                 # NOTE(review): None is assumed to mean "no separate base
                 # checkpoint" -- confirm against load_pretrained_model.
                 self.model_base = None
                 self.load_8bit = False
                 self.load_4bit = False
                 self.debug = False

         args = Args()
         model_name = get_model_name_from_path(args.model_path)
         tok, mdl, img_proc, ctx_len = load_pretrained_model(
             args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit
         )

         # PAD/EOS safety: generation needs both ids to be defined.
         if tok.eos_token_id is None and tok.eos_token is None:
             try:
                 tok.add_special_tokens({"eos_token": "</s>"})
             except Exception as exc:
                 # Best effort: keep going, but leave a trace of the failure.
                 print(f"Warning: could not register EOS token: {exc}")
         if tok.pad_token_id is None:
             if tok.eos_token is not None:
                 # Reuse EOS as PAD so no new vocabulary entry is needed.
                 tok.pad_token = tok.eos_token
             else:
                 if tok.unk_token is None:
                     try:
                         tok.add_special_tokens({"unk_token": "<unk>"})
                     except Exception as exc:
                         print(f"Warning: could not register UNK token: {exc}")
                 tok.pad_token = tok.unk_token or "</s>"
         # NOTE(review): if add_special_tokens() actually grew the vocabulary,
         # the model embeddings may need resizing -- confirm for this checkpoint.

         tokenizer, model, image_processor, context_len = tok, mdl, img_proc, ctx_len
         if torch.cuda.is_available():
             model = model.to(torch.device('cuda'))
             print("Model moved to CUDA")
         else:
             print("CUDA not available, using CPU")
         return True
     except Exception as e:
         print(f"Failed to initialize model: {e}")
         return False
+# ----- Query entrypoint -----
 def _payload_flag(payload, key, default):
     """Read ``payload[key]`` as a boolean, understanding textual spellings.

     ``bool("false")`` is True in Python, so string values are compared against
     the usual "off" spellings instead of being handed to ``bool()`` directly.
     Non-string values keep plain ``bool()`` semantics.
     """
     value = payload.get(key, default)
     if isinstance(value, str):
         return value.strip().lower() not in ("", "0", "false", "no", "off", "none", "null")
     return bool(value)


 def query(payload):
     """Validate a request payload and run one generation for it.

     The model is initialized lazily on the first call via ``initialize_model``.

     Args:
         payload: Mapping with the prompt under ``message`` / ``query`` /
             ``prompt`` / ``istem`` and the image under ``image`` /
             ``image_url`` / ``img``. Optional keys: ``temperature``,
             ``top_p``, ``max_output_tokens`` (or ``max_new_tokens`` /
             ``max_tokens``), ``repetition_penalty``, ``conv_mode``,
             ``do_sample``, ``seed`` and ``use_stop``.

     Returns:
         dict: The result of ``generate_response``, or ``{"error": ...}`` when
         initialization fails, the payload is unusable, or parsing raises.
     """
     global model_initialized
     if not model_initialized:
         print("Initializing model on first query...")
         model_initialized = initialize_model()
         if not model_initialized:
             return {"error": "Model initialization failed"}
     try:
         # Log incoming keys
         print(f"[DEBUG] query payload keys={list(payload.keys()) if hasattr(payload,'keys') else 'N/A'}")
         if not hasattr(payload, "get"):
             # A bare string or list cannot carry both a prompt and an image;
             # say so directly instead of surfacing an AttributeError message.
             return {"error": f"Query failed: payload must be a JSON object, got {type(payload).__name__}"}

         # Inputs
         message_text = (payload.get("message") or payload.get("query") or payload.get("prompt") or payload.get("istem") or "").strip()
         image_input = (payload.get("image") or payload.get("image_url") or payload.get("img") or None)

         # Generation parameters
         temperature = float(payload.get("temperature", 0.05))
         top_p = float(payload.get("top_p", 1.0))
         max_output_tokens = int(payload.get("max_output_tokens", payload.get("max_new_tokens", payload.get("max_tokens", 1024))))
         repetition_penalty = float(payload.get("repetition_penalty", 1.0))
         conv_mode_override = payload.get("conv_mode", None)

         # Determinism toggles
         do_sample = _payload_flag(payload, "do_sample", False)  # default: greedy decoding
         seed = payload.get("seed", None)
         use_stop = _payload_flag(payload, "use_stop", True)     # default: stop criteria enabled

         if not message_text:
             return {"error": "Missing prompt text. Provide 'message' (or 'query'/'prompt'/'istem')."}
         if not image_input:
             return {"error": "Missing image. Provide 'image' (url/base64/path) or 'image_url'/'img'."}

         return generate_response(
             message_text=message_text,
             image_input=image_input,
             temperature=temperature,
             top_p=top_p,
             max_output_tokens=max_output_tokens,
             repetition_penalty=repetition_penalty,
             conv_mode_override=conv_mode_override,
             do_sample=do_sample,
             seed=seed,
             use_stop=use_stop
         )
     except Exception as e:
         return {"error": f"Query failed: {str(e)}"}
+# ----- Health / Info -----
 def health_check():
     """Return a status dict describing the handler and its optional imports.

     The report always carries ``"status": "healthy"`` and
     ``"lazy_loading": True``; the remaining entries mirror the module-level
     ``model_initialized`` and ``*_AVAILABLE`` flags.
     """
     optional_modules = {
         "llava_available": LLAVA_AVAILABLE,
         "transformers_available": TRANSFORMERS_AVAILABLE,
         "cv2_available": CV2_AVAILABLE,
     }
     report = {"status": "healthy", "model_initialized": model_initialized}
     report.update(optional_modules)
     report["lazy_loading"] = True
     return report
 def get_model_info():
     """Describe the loaded model, or report that it has not been loaded yet.

     Returns:
         dict: ``model_path``, ``model_type`` and ``device`` once
         ``model_initialized`` is truthy; otherwise an ``error`` entry together
         with ``"lazy_loading": True``.
     """
     if not model_initialized:
         return {"error": "Model not initialized yet", "lazy_loading": True}

     # Fall back to "Unknown" when the corresponding global is falsy.
     if args:
         path = args.model_path
     else:
         path = "Unknown"
     if model:
         device_name = str(model.device)
     else:
         device_name = "Unknown"

     return {"model_path": path, "model_type": "PULSE-7B", "device": device_name}
+# ----- HF Endpoint handler -----
 class EndpointHandler:
     """Callable wrapper that routes request payloads to ``query``."""

     def __init__(self, model_dir):
         """Remember ``model_dir`` and announce construction on stdout."""
         self.model_dir = model_dir
         print(f"EndpointHandler initialized with model_dir: {model_dir}")

     def __call__(self, payload):
         """Run ``query`` on ``payload["inputs"]`` when present, else on ``payload``."""
         request = payload["inputs"] if "inputs" in payload else payload
         return query(request)

     def health_check(self):
         """Delegate to the module-level ``health_check``."""
         return health_check()

     def get_model_info(self):
         """Delegate to the module-level ``get_model_info``."""
         return get_model_info()
 if __name__ == "__main__":
     # Running the file directly only prints a banner; the model itself is
     # loaded lazily by the first query() call.
     print("Handler loaded and ready.")