CanerDedeoglu committed on
Commit 1b1a09c · verified · 1 Parent(s): 4e102e5

Update handler.py

Files changed (1):
  1. handler.py +112 -317

handler.py CHANGED
@@ -1,3 +1,9 @@
+# -*- coding: utf-8 -*-
+# handler.py — PULSE-7B / LLaVA robust endpoint
+# - Safe decode (empty output fix)
+# - PAD/EOS safety
+# - Hugging Face endpoint compatible
+
 import os
 import datetime
 import torch
@@ -43,7 +49,7 @@ except ImportError as e:

 # Try to import transformers
 try:
-    from transformers import TextStreamer, TextIteratorStreamer
+    from transformers import TextStreamer, TextIteratorStreamer, GenerationConfig
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
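Editor's note: the new GenerationConfig import supports the commit's "PAD/EOS safety" fix, where a missing eos_token_id falls back to the pad id. A minimal offline sketch of that pattern (the token ids below are illustrative, not from the commit; the real ones come from PULSE-7B's tokenizer in initialize_model()):

from transformers import GenerationConfig

# Illustrative token ids, not from the commit.
pad_id = 0
eos_id = 2
cfg = GenerationConfig(
    do_sample=True, temperature=0.05, top_p=1.0, max_new_tokens=64,
    repetition_penalty=1.0,
    pad_token_id=pad_id,
    eos_token_id=eos_id if eos_id is not None else pad_id,  # PAD/EOS fallback
)
print(cfg.eos_token_id)  # -> 2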
@@ -75,7 +81,7 @@ external_log_dir = "./logs"
 LOGDIR = external_log_dir
 VOTEDIR = "./votes"

-# Global variables for model and tokenizer
+# Global variables
 tokenizer = None
 model = None
 image_processor = None
@@ -115,7 +121,7 @@ def vote_last_response(state, vote_type, model_selector):

 def is_valid_video_filename(name):
     if not CV2_AVAILABLE:
-        return False  # Video processing disabled
+        return False
     video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
     ext = name.split(".")[-1].lower()
     return ext in video_extensions
@@ -127,8 +133,7 @@ def is_valid_image_filename(name):

 def sample_frames(video_file, num_frames):
     if not CV2_AVAILABLE:
-        raise ImportError("cv2 (OpenCV) not available. Video processing is disabled.")
-
+        raise ImportError("cv2 not available")
     video = cv2.VideoCapture(video_file)
     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
     interval = total_frames // num_frames
@@ -144,46 +149,32 @@ def sample_frames(video_file, num_frames):
     return frames

 def load_image(image_file):
-    if image_file.startswith("http") or image_file.startswith("https"):
+    if image_file.startswith("http"):
         response = requests.get(image_file)
         if response.status_code == 200:
             image = Image.open(BytesIO(response.content)).convert("RGB")
         else:
             raise ValueError("Failed to load image from URL")
     else:
-        print("Load image from local file")
-        print(image_file)
         image = Image.open(image_file).convert("RGB")
     return image

 def process_base64_image(base64_string):
-    """Process base64 encoded image string"""
-    try:
-        # Remove data URL prefix if present
-        if base64_string.startswith('data:image'):
-            base64_string = base64_string.split(',')[1]
-
-        # Decode base64 to bytes
-        image_data = base64.b64decode(base64_string)
-
-        # Convert to PIL Image
-        image = Image.open(BytesIO(image_data)).convert("RGB")
-        return image
-    except Exception as e:
-        raise ValueError(f"Failed to process base64 image: {e}")
+    if base64_string.startswith('data:image'):
+        base64_string = base64_string.split(',')[1]
+    image_data = base64.b64decode(base64_string)
+    image = Image.open(BytesIO(image_data)).convert("RGB")
+    return image

 def process_image_input(image_input):
-    """Process different types of image input (file path, URL, or base64)"""
     if isinstance(image_input, str):
         if image_input.startswith("http"):
             return load_image(image_input)
         elif os.path.exists(image_input):
             return load_image(image_input)
         else:
-            # Try to process as base64
             return process_base64_image(image_input)
     elif isinstance(image_input, dict) and "image" in image_input:
-        # Handle base64 image from dict
         return process_base64_image(image_input["image"])
     else:
         raise ValueError("Unsupported image input format")
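Editor's note: process_image_input() accepts three input forms — an http(s) URL, a local path, or a base64 string (optionally a data URL, or wrapped in a dict under "image"). A sketch of constructing each form (file paths are illustrative):

import base64

url_input = "https://example.com/ecg.png"     # fetched with requests
path_input = "./samples/ecg.png"              # opened from disk
with open("./samples/ecg.png", "rb") as f:    # any other string is tried as base64
    b64_input = "data:image/png;base64," + base64.b64encode(f.read()).decode()

# Each form resolves to a PIL RGB image via the handler:
# image = process_image_input(b64_input)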
@@ -194,14 +185,9 @@ class InferenceDemo(object):
             raise ImportError("LLaVA modules not available")

         disable_torch_init()
-
         self.tokenizer, self.model, self.image_processor, self.context_len = (
-            tokenizer,
-            model,
-            image_processor,
-            context_len,
+            tokenizer, model, image_processor, context_len
         )
-
         model_name = get_model_name_from_path(model_path)
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"
@@ -213,13 +199,8 @@ class InferenceDemo(object):
             conv_mode = "qwen_1_5"
         else:
             conv_mode = "llava_v0"
-
         if args.conv_mode is not None and conv_mode != args.conv_mode:
-            print(
-                "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
-                    conv_mode, args.conv_mode, args.conv_mode
-                )
-            )
+            print(f"[WARNING] auto inferred conv_mode={conv_mode}, using {args.conv_mode}")
         else:
             args.conv_mode = conv_mode
         self.conv_mode = conv_mode
@@ -229,14 +210,11 @@ class InferenceDemo(object):
 class ChatSessionManager:
     def __init__(self):
         self.chatbot_instance = None
-
     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
-
     def reset_chatbot(self):
         self.chatbot_instance = None
-
     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         if self.chatbot_instance is None:
             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
@@ -245,242 +223,139 @@ class ChatSessionManager:
 chat_manager = ChatSessionManager()

 def clear_history():
-    """Clear conversation history"""
     if not LLAVA_AVAILABLE:
-        return {"error": "LLaVA modules not available"}
-
+        return {"error": "LLaVA not available"}
     try:
         chatbot_instance = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
-        try:
-            if hasattr(chatbot_instance, 'conv_mode') and chatbot_instance.conv_mode and LLAVA_AVAILABLE:
-                chatbot_instance.conversation = conv_templates[chatbot_instance.conv_mode].copy()
-            else:
-                # Use default conversation template
-                chatbot_instance.conversation = chatbot_instance.conversation.__class__()
-        except Exception as e:
-            print(f"[DEBUG] Failed to reset conversation in clear_history: {e}")
-        return {"status": "success", "message": "Conversation history cleared"}
+        mode = getattr(chatbot_instance, 'conv_mode', None)
+        if mode and LLAVA_AVAILABLE and mode in conv_templates:
+            chatbot_instance.conversation = conv_templates[mode].copy()
+        else:
+            chatbot_instance.conversation = chatbot_instance.conversation.__class__()
+        return {"status": "success", "message": "Conversation cleared"}
     except Exception as e:
         return {"error": f"Failed to clear history: {str(e)}"}

-def add_message(message_text, image_input=None):
-    """Add a message to the conversation"""
-    return {"status": "success", "message": "Message added"}
+def _strip_prefix_relaxed(text: str, prefix: str) -> str:
+    try:
+        if text.startswith(prefix):
+            return text[len(prefix):]
+        t_norm = " ".join(text.split())
+        p_norm = " ".join(prefix.split())
+        if t_norm.startswith(p_norm):
+            idx = text.find(prefix.splitlines()[0]) if prefix.splitlines() else -1
+            if idx >= 0:
+                return text[idx + len(prefix.splitlines()[0]):]
+    except Exception:
+        pass
+    return text

 def generate_response(message_text, image_input, temperature=0.05, top_p=1.0, max_output_tokens=4096, repetition_penalty=1.0, conv_mode_override=None):
-    """Generate response for the given message and image"""
     if not LLAVA_AVAILABLE:
-        return {"error": "LLaVA modules not available"}
-
+        return {"error": "LLaVA not available"}
     try:
         if not message_text or not image_input:
-            return {"error": "Both message text and image are required"}
+            return {"error": "Both message and image required"}

         our_chatbot = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
+        image = process_image_input(image_input)

-        # Process image input
-        try:
-            image = process_image_input(image_input)
-        except Exception as e:
-            return {"error": f"Failed to process image: {str(e)}"}
-
-        # Save image for logging
-        all_image_hash = []
-        all_image_path = []
-
-        # Generate hash for the image
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format='JPEG')
-        img_byte_arr = img_byte_arr.getvalue()
-        image_hash = hashlib.md5(img_byte_arr).hexdigest()
-        all_image_hash.append(image_hash)
-
-        # Save image to logs
+        image_hash = hashlib.md5(img_byte_arr.getvalue()).hexdigest()
         t = datetime.datetime.now()
-        filename = os.path.join(
-            LOGDIR,
-            "serve_images",
-            f"{t.year}-{t.month:02d}-{t.day:02d}",
-            f"{image_hash}.jpg",
-        )
-        all_image_path.append(filename)
-        if not os.path.isfile(filename):
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-            print("image save to", filename)
-        image.save(filename)
+        filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{image_hash}.jpg")
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+        image.save(filename)

-        # Process image for model
-        try:
-            print(f"[DEBUG] Processing image for model...")
-            processed_images = process_images([image], our_chatbot.image_processor, our_chatbot.model.config)
-            print(f"[DEBUG] Processed images length: {len(processed_images)}")
-
-            if len(processed_images) == 0:
-                return {"error": "Image processing returned empty list"}
-
-            image_tensor = processed_images[0]
-            image_tensor = image_tensor.half().to(our_chatbot.model.device)
-            image_tensor = image_tensor.unsqueeze(0)
-            print(f"[DEBUG] Image tensor shape: {image_tensor.shape}")
-        except Exception as e:
-            print(f"[DEBUG] Image processing error: {str(e)}")
-            return {"error": f"Image processing failed: {str(e)}"}
+        processed_images = process_images([image], our_chatbot.image_processor, our_chatbot.model.config)
+        image_tensor = processed_images[0].half().to(our_chatbot.model.device).unsqueeze(0)

-        # Prepare conversation - reset for each request to avoid history issues
-        try:
-            if hasattr(our_chatbot, 'conv_mode') and our_chatbot.conv_mode and LLAVA_AVAILABLE:
-                our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
-            else:
-                # Use default conversation template
-                our_chatbot.conversation = our_chatbot.conversation.__class__()
-        except Exception as e:
-            print(f"[DEBUG] Failed to reset conversation: {e}")
-            # Continue with existing conversation
+        if conv_mode_override:
+            our_chatbot.conversation = conv_templates[conv_mode_override].copy()
+        else:
+            our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()

         inp = DEFAULT_IMAGE_TOKEN + "\n" + message_text
         our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
         our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
         prompt = our_chatbot.conversation.get_prompt()

-        # Tokenize input
-        input_ids = tokenizer_image_token(
-            prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
-        ).unsqueeze(0).to(our_chatbot.model.device)
+        input_ids = tokenizer_image_token(prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(our_chatbot.model.device)

-        # Set up stopping criteria
-        stop_str = (
-            our_chatbot.conversation.sep
-            if our_chatbot.conversation.sep_style != SeparatorStyle.TWO
-            else our_chatbot.conversation.sep2
-        )
-        keywords = [stop_str]
-        stopping_criteria = KeywordsStoppingCriteria(
-            keywords, our_chatbot.tokenizer, input_ids
-        )
+        stop_str = our_chatbot.conversation.sep if our_chatbot.conversation.sep_style != SeparatorStyle.TWO else our_chatbot.conversation.sep2
+        stopping_criteria = KeywordsStoppingCriteria([stop_str], our_chatbot.tokenizer, input_ids)
+
+        pad_id = our_chatbot.tokenizer.pad_token_id
+        eos_id = our_chatbot.tokenizer.eos_token_id if our_chatbot.tokenizer.eos_token_id is not None else pad_id
+        gen_cfg = GenerationConfig(
+            do_sample=True, temperature=float(temperature), top_p=float(top_p),
+            max_new_tokens=int(max_output_tokens), repetition_penalty=float(repetition_penalty),
+            pad_token_id=pad_id, eos_token_id=eos_id
+        )

-        # Generate response
         with torch.no_grad():
             outputs = our_chatbot.model.generate(
                 inputs=input_ids,
                 images=image_tensor,
-                do_sample=True,
-                temperature=temperature,
-                top_p=top_p,
-                max_new_tokens=max_output_tokens,
-                repetition_penalty=repetition_penalty,
-                use_cache=False,
+                generation_config=gen_cfg,
+                use_cache=True,
                 stopping_criteria=[stopping_criteria],
+                return_dict_in_generate=True
             )

-        # Decode response
-        try:
-            print(f"[DEBUG] Outputs shape: {outputs.shape if hasattr(outputs, 'shape') else 'No shape attr'}")
-            print(f"[DEBUG] Outputs length: {len(outputs) if hasattr(outputs, '__len__') else 'No length'}")
-            print(f"[DEBUG] Input IDs shape: {input_ids.shape}")
-
-            if len(outputs) == 0:
-                return {"error": "Model generated empty output"}
-
-            response = our_chatbot.tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
-
-            print(f"[DEBUG] Conversation messages length: {len(our_chatbot.conversation.messages)}")
-            if len(our_chatbot.conversation.messages) > 0:
-                last_message = our_chatbot.conversation.messages[-1]
-                print(f"[DEBUG] Last message: {last_message}")
-                if isinstance(last_message, list) and len(last_message) > 1:
-                    our_chatbot.conversation.messages[-1][-1] = response
-                    print(f"[DEBUG] Response added to conversation")
-                else:
-                    print(f"[DEBUG] Last message format unexpected: {last_message}")
-                    # Add response as new message if format is wrong
-                    our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
-            else:
-                print("[DEBUG] No conversation messages found")
-                # Add response as new message
-                our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
-
-            print(f"[DEBUG] Generated response length: {len(response)}")
-        except Exception as e:
-            print(f"[DEBUG] Response decoding error: {str(e)}")
-            return {"error": f"Response decoding failed: {str(e)}"}
+        sequences = outputs.sequences
+        gen_ids = sequences[0]
+        full_text = our_chatbot.tokenizer.decode(gen_ids, skip_special_tokens=True)
+        prompt_text = our_chatbot.tokenizer.decode(input_ids[0], skip_special_tokens=True)

+        if gen_ids.shape[0] > input_ids.shape[1]:
+            response = our_chatbot.tokenizer.decode(gen_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
+        else:
+            response = _strip_prefix_relaxed(full_text, prompt_text).strip()
+            if not response:
+                response = full_text.replace(stop_str, "").strip()

-        # Log conversation
-        history = [(message_text, response)]
-        with open(get_conv_log_filename(), "a") as fout:
-            data = {
-                "type": "chat",
-                "model": "PULSE-7b",
-                "state": history,
-                "images": all_image_hash,
-                "images_path": all_image_path
-            }
-            print("#### conv log", data)
-            fout.write(json.dumps(data) + "\n")
+        our_chatbot.conversation.messages[-1][-1] = response

-        # Upload files to Hugging Face if configured
-        if api and repo_name:
-            try:
-                for upload_img in all_image_path:
-                    api.upload_file(
-                        path_or_fileobj=upload_img,
-                        path_in_repo=upload_img.replace("./logs/", ""),
-                        repo_id=repo_name,
-                        repo_type="dataset",
-                    )
-
-                # Upload conversation log
-                api.upload_file(
-                    path_or_fileobj=get_conv_log_filename(),
-                    path_in_repo=get_conv_log_filename().replace("./logs/", ""),
-                    repo_id=repo_name,
-                    repo_type="dataset")
-            except Exception as e:
-                print(f"Failed to upload files: {e}")
+        history = [(message_text, response)]
+        with open(get_conv_log_filename(), "a") as fout:
+            fout.write(json.dumps({
+                "type": "chat", "model": "PULSE-7b", "state": history,
+                "images": [image_hash], "images_path": [filename]
+            }) + "\n")

-        return {
-            "status": "success",
-            "response": response,
-            "conversation_id": id(our_chatbot.conversation)
-        }
-
+        return {"status": "success", "response": response, "conversation_id": id(our_chatbot.conversation)}
     except Exception as e:
         return {"error": f"Generation failed: {str(e)}"}

 def upvote_last_response(conversation_id):
-    """Upvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "upvote", "PULSE-7B")
-        return {"status": "success", "message": "Thank you for your voting!"}
+        return {"status": "success", "message": "Upvoted"}
     except Exception as e:
-        return {"error": f"Failed to upvote: {str(e)}"}
+        return {"error": str(e)}

 def downvote_last_response(conversation_id):
-    """Downvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "downvote", "PULSE-7B")
-        return {"status": "success", "message": "Thank you for your voting!"}
+        return {"status": "success", "message": "Downvoted"}
     except Exception as e:
-        return {"error": f"Failed to downvote: {str(e)}"}
+        return {"error": str(e)}

 def flag_response(conversation_id):
-    """Flag the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "flag", "PULSE-7B")
-        return {"status": "success", "message": "Response flagged successfully"}
+        return {"status": "success", "message": "Flagged"}
     except Exception as e:
-        return {"error": f"Failed to flag response: {str(e)}"}
+        return {"error": str(e)}

-# Initialize model when module is imported
 def initialize_model():
-    """Initialize the model and tokenizer"""
     global tokenizer, model, image_processor, context_len, args
-
     if not LLAVA_AVAILABLE:
-        print("LLaVA modules not available, skipping model initialization")
+        print("LLaVA not available")
         return False
-
     try:
-        # Set default arguments
         class Args:
             def __init__(self):
                 self.model_path = "PULSE-ECG/PULSE-7B"
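Editor's note: the "safe decode" fix above is the core of this commit. When generate() returns at least as many ids as the prompt, the handler slices off the prompt ids; otherwise it decodes the full sequence and strips the prompt text with _strip_prefix_relaxed(). A standalone sketch of that fallback, with made-up strings:

def strip_prefix_relaxed(text: str, prefix: str) -> str:
    # Mirrors _strip_prefix_relaxed above: exact prefix first,
    # then a whitespace-normalized match as a fallback.
    if text.startswith(prefix):
        return text[len(prefix):]
    if " ".join(text.split()).startswith(" ".join(prefix.split())):
        first = prefix.splitlines()[0] if prefix.splitlines() else ""
        idx = text.find(first)
        if idx >= 0:
            return text[idx + len(first):]
    return text

full_text = "USER: what does this ECG show? ASSISTANT: Sinus rhythm."
prompt_text = "USER: what does this ECG show? ASSISTANT:"
print(strip_prefix_relaxed(full_text, prompt_text).strip())  # -> "Sinus rhythm."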
@@ -493,95 +368,45 @@ def initialize_model():
                 self.load_8bit = False
                 self.load_4bit = False
                 self.debug = False
-
         args = Args()
-
-        # Load model
-        model_path = args.model_path
-        model_name = get_model_name_from_path(args.model_path)
-        tokenizer, model, image_processor, context_len = load_pretrained_model(
-            args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit
-        )
-
-        print("### image_processor", image_processor)
-        print("### tokenizer", tokenizer)
-
-        # Move model to GPU if available
+        tok, mdl, img_proc, ctx_len = load_pretrained_model(args.model_path, args.model_base, get_model_name_from_path(args.model_path), args.load_8bit, args.load_4bit)
+        if tok.eos_token_id is None:
+            tok.add_special_tokens({"eos_token": "</s>"})
+        if tok.pad_token_id is None:
+            tok.pad_token = tok.eos_token
+        tokenizer, model, image_processor, context_len = tok, mdl, img_proc, ctx_len
         if torch.cuda.is_available():
             model = model.to(torch.device('cuda'))
-            print("Model moved to CUDA")
-        else:
-            print("CUDA not available, using CPU")
-
         return True
-
     except Exception as e:
-        print(f"Failed to initialize model: {e}")
+        print(f"Init model fail: {e}")
         return False

-# Don't initialize model on import - do it lazily
 model_initialized = False

-# Main endpoint function for Hugging Face
 def query(payload):
-    """Main endpoint function for Hugging Face inference API"""
     global model_initialized
-
-    # Lazy initialization - initialize model on first call
     if not model_initialized:
-        print("Initializing model on first query...")
         model_initialized = initialize_model()
         if not model_initialized:
-            return {"error": "Model initialization failed"}
-
+            return {"error": "Model init failed"}
     try:
-        print(f"[DEBUG] query payload keys={list(payload.keys()) if hasattr(payload,'keys') else 'N/A'}")
-
-        # Extract prompt with multiple possible keys
-        message_text = (payload.get("message") or
-                        payload.get("query") or
-                        payload.get("prompt") or
-                        payload.get("istem") or "")
-
-        # Extract image with multiple possible keys
-        image_input = (payload.get("image") or
-                       payload.get("image_url") or
-                       payload.get("img") or None)
-
-        # Extract generation parameters with fallbacks
+        message_text = payload.get("message") or payload.get("query") or payload.get("prompt") or payload.get("istem") or ""
+        image_input = payload.get("image") or payload.get("image_url") or payload.get("img") or None
         temperature = float(payload.get("temperature", 0.05))
         top_p = float(payload.get("top_p", 1.0))
-        max_output_tokens = int(payload.get("max_output_tokens",
-                                payload.get("max_new_tokens",
-                                payload.get("max_tokens", 4096))))
+        max_output_tokens = int(payload.get("max_output_tokens", payload.get("max_new_tokens", payload.get("max_tokens", 4096))))
         repetition_penalty = float(payload.get("repetition_penalty", 1.0))
         conv_mode_override = payload.get("conv_mode", None)
-
-        if not message_text or not message_text.strip():
-            return {"error": "Missing prompt text. Use 'message', 'query', 'prompt', or 'istem' key"}
-
+        if not message_text.strip():
+            return {"error": "Missing prompt text"}
         if not image_input:
-            return {"error": "Missing image. Use 'image', 'image_url', or 'img' key"}
-
-        # Generate response with all parameters
-        result = generate_response(
-            message_text=message_text,
-            image_input=image_input,
-            temperature=temperature,
-            top_p=top_p,
-            max_output_tokens=max_output_tokens,
-            repetition_penalty=repetition_penalty,
-            conv_mode_override=conv_mode_override
-        )
-
-        return result
-
+            return {"error": "Missing image"}
+        return generate_response(message_text, image_input, temperature, top_p, max_output_tokens, repetition_penalty, conv_mode_override)
     except Exception as e:
-        return {"error": f"Query failed: {str(e)}"}
+        return {"error": str(e)}

-# Additional utility endpoints
 def health_check():
-    """Health check endpoint"""
     return {
         "status": "healthy",
         "model_initialized": model_initialized,
@@ -589,56 +414,26 @@ def health_check():
         "llava_available": LLAVA_AVAILABLE,
         "transformers_available": TRANSFORMERS_AVAILABLE,
         "cv2_available": CV2_AVAILABLE,
-        "lazy_loading": True  # Model will be loaded on first query
+        "lazy_loading": True
     }

 def get_model_info():
-    """Get model information"""
     if not model_initialized:
-        return {
-            "error": "Model not initialized yet",
-            "lazy_loading": True,
-            "note": "Model will be loaded on first query"
-        }
-
-    return {
-        "model_path": args.model_path if args else "Unknown",
-        "model_type": "PULSE-7B",
-        "cuda_available": torch.cuda.is_available(),
-        "device": str(model.device) if model else "Unknown"
-    }
+        return {"error": "Not initialized", "lazy_loading": True}
+    return {"model_path": args.model_path if args else "Unknown", "model_type": "PULSE-7B", "cuda_available": torch.cuda.is_available(), "device": str(model.device) if model else "Unknown"}

-# Hugging Face EndpointHandler class
 class EndpointHandler:
-    """Hugging Face endpoint handler class"""
-
     def __init__(self, model_dir):
-        """Initialize the endpoint handler"""
         self.model_dir = model_dir
-        print(f"EndpointHandler initialized with model_dir: {model_dir}")
-
+        print(f"Handler init with model_dir={model_dir}")
     def __call__(self, payload):
-        """Main endpoint function - handles Hugging Face payload format"""
-        # Hugging Face sends payload in "inputs" wrapper
         if "inputs" in payload:
-            # Extract the actual payload from inputs wrapper
-            actual_payload = payload["inputs"]
-            return query(actual_payload)
-        else:
-            # Direct payload (for backward compatibility)
-            return query(payload)
-
+            return query(payload["inputs"])
+        return query(payload)
     def health_check(self):
-        """Health check endpoint"""
         return health_check()
-
     def get_model_info(self):
-        """Get model information"""
         return get_model_info()

-# For backward compatibility and testing
 if __name__ == "__main__":
-    print("Handler module loaded successfully!")
-    print("This handler is now ready for Hugging Face endpoints.")
-    print("Use the 'query' function as the main endpoint.")
-    print("Or use EndpointHandler class for Hugging Face compatibility.")
+    print("Handler loaded and ready.")
 