#36 by ismailhakki37 - opened

handler.py  (+368 -232)  CHANGED
@@ -1,5 +1,16 @@
-
-

 import os
 import datetime
@@ -12,7 +23,7 @@ import requests
 from PIL import Image
 from io import BytesIO

-#
 try:
     import cv2
     CV2_AVAILABLE = True
@@ -20,7 +31,7 @@ except ImportError:
     CV2_AVAILABLE = False
     print("Warning: cv2 (OpenCV) not available. Video processing will be disabled.")

-#
 try:
     from llava import conversation as conversation_lib
     from llava.constants import DEFAULT_IMAGE_TOKEN
@@ -44,15 +55,15 @@ except ImportError as e:
     LLAVA_AVAILABLE = False
     print(f"Warning: LLaVA modules not available: {e}")

-#
 try:
-    from transformers import
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
     print("Warning: Transformers not available")

-#
 try:
     from huggingface_hub import HfApi, login
     HF_HUB_AVAILABLE = True
@@ -60,7 +71,7 @@ except ImportError:
     HF_HUB_AVAILABLE = False
     print("Warning: Hugging Face Hub not available")

-#
 if HF_HUB_AVAILABLE and "HF_TOKEN" in os.environ:
     try:
         login(token=os.environ["HF_TOKEN"], write_permission=True)
@@ -74,23 +85,21 @@ else:
     api = None
     repo_name = ""

-# Logs
 external_log_dir = "./logs"
 LOGDIR = external_log_dir
 VOTEDIR = "./votes"

-#
 tokenizer = None
 model = None
 image_processor = None
 context_len = None
 args = None
-model_initialized = False

-# ----- Utils -----
 def get_conv_log_filename():
     t = datetime.datetime.now()
-

 def get_conv_vote_filename():
     t = datetime.datetime.now()
@@ -103,7 +112,7 @@ def vote_last_response(state, vote_type, model_selector):
     if api and repo_name:
         try:
             with open(get_conv_vote_filename(), "a") as fout:
-
             api.upload_file(
                 path_or_fileobj=get_conv_vote_filename(),
                 path_in_repo=get_conv_vote_filename().replace("./votes/", ""),
@@ -114,48 +129,93 @@ def vote_last_response(state, vote_type, model_selector):

 def is_valid_video_filename(name):
     if not CV2_AVAILABLE:
-        return False
-

 def is_valid_image_filename(name):
-

 def load_image(image_file):
-    if image_file.startswith("http"):
-
-        if
-
-
-

 def process_base64_image(base64_string):
-
-
-
-

 def process_image_input(image_input):
     if isinstance(image_input, str):
         if image_input.startswith("http"):
             return load_image(image_input)
         elif os.path.exists(image_input):
             return load_image(image_input)
         else:
             return process_base64_image(image_input)
     elif isinstance(image_input, dict) and "image" in image_input:
         return process_base64_image(image_input["image"])
     else:
         raise ValueError("Unsupported image input format")

-# ----- Chat session -----
 class InferenceDemo(object):
     def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
         if not LLAVA_AVAILABLE:
             raise ImportError("LLaVA modules not available")
         disable_torch_init()
         self.tokenizer, self.model, self.image_processor, self.context_len = (
-            tokenizer,
         )
         model_name = get_model_name_from_path(model_path)
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"
@@ -167,22 +227,30 @@ class InferenceDemo(object):
             conv_mode = "qwen_1_5"
         else:
             conv_mode = "llava_v0"
         if args.conv_mode is not None and conv_mode != args.conv_mode:
-            print(
         else:
             args.conv_mode = conv_mode
-        self.conv_mode =
-        self.conversation = conv_templates[
         self.num_frames = args.num_frames

 class ChatSessionManager:
     def __init__(self):
         self.chatbot_instance = None
     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
     def reset_chatbot(self):
         self.chatbot_instance = None
     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         if self.chatbot_instance is None:
             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
@@ -191,295 +259,339 @@ class ChatSessionManager:
 chat_manager = ChatSessionManager()

 def clear_history():
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
     try:
-
         return {"status": "success", "message": "Conversation history cleared"}
     except Exception as e:
         return {"error": f"Failed to clear history: {str(e)}"}

-
-        p_norm = " ".join(prefix.split())
-        if t_norm.startswith(p_norm):
-            idx = text.find(prefix.splitlines()[0]) if prefix.splitlines() else -1
-            if idx >= 0:
-                return text[idx + len(prefix.splitlines()[0]):]
-    except Exception:
-        pass
-    return text
-
-# ----- Core generate -----
-def generate_response(message_text,
-                      image_input,
-                      temperature=0.05,
-                      top_p=1.0,
-                      max_output_tokens=1024,
-                      repetition_penalty=1.0,
-                      conv_mode_override=None,
-                      do_sample=False,  # default greedy -> deterministic
-                      seed=None,
-                      use_stop=True):
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
-
     try:
         if not message_text or not image_input:
             return {"error": "Both message text and image are required"}
-
-        #
-        image = process_image_input(image_input)
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format='JPEG')
-
         # Save image to logs
         t = datetime.datetime.now()
-        filename = os.path.join(
-
         inp = DEFAULT_IMAGE_TOKEN + "\n" + message_text
-
-        prompt =
-
-        # Tokenize
-        input_ids = tokenizer_image_token(
-
         stopping_criteria = None
-
-        pad_id = inst.tokenizer.pad_token_id
-        eos_id = inst.tokenizer.eos_token_id or pad_id
-
-        gen_cfg = GenerationConfig(
-            do_sample=bool(do_sample),
-            temperature=float(temperature),
-            top_p=float(top_p),
-            max_new_tokens=int(max_output_tokens),
-            repetition_penalty=float(repetition_penalty),
-            pad_token_id=pad_id,
-            eos_token_id=eos_id
-        )
-
         with torch.no_grad():
-            outputs =
                 inputs=input_ids,
                 images=image_tensor,
-
             )
-
-        #
-
-        response =
-
         with open(get_conv_log_filename(), "a") as fout:
-
                 "type": "chat",
                 "model": "PULSE-7b",
-                "state":
-                "images":
-                "images_path":
-            }
-
     except Exception as e:
         return {"error": f"Generation failed: {str(e)}"}

-# ----- Votes -----
 def upvote_last_response(conversation_id):
     try:
         vote_last_response({"conversation_id": conversation_id}, "upvote", "PULSE-7B")
-        return {"status": "success", "message": "
     except Exception as e:
-        return {"error": str(e)}

 def downvote_last_response(conversation_id):
     try:
         vote_last_response({"conversation_id": conversation_id}, "downvote", "PULSE-7B")
-        return {"status": "success", "message": "
     except Exception as e:
-        return {"error": str(e)}

 def flag_response(conversation_id):
     try:
         vote_last_response({"conversation_id": conversation_id}, "flag", "PULSE-7B")
-        return {"status": "success", "message": "
     except Exception as e:
-        return {"error": str(e)}

-#
 def initialize_model():
     global tokenizer, model, image_processor, context_len, args
     if not LLAVA_AVAILABLE:
         print("LLaVA modules not available, skipping model initialization")
         return False
     try:
         class Args:
             def __init__(self):
                 self.model_path = "PULSE-ECG/PULSE-7B"
                 self.model_base = None
                 self.num_gpus = 1
                 self.conv_mode = None
-                self.temperature = 0.05
                 self.max_new_tokens = 1024
                 self.num_frames = 16
                 self.load_8bit = False
                 self.load_4bit = False
                 self.debug = False
         args = Args()
-
         model_name = get_model_name_from_path(args.model_path)
-
             args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit
         )
-
-        except Exception:
-            pass
-        if tok.pad_token_id is None:
-            if tok.eos_token is not None:
-                tok.pad_token = tok.eos_token
-            else:
-                if tok.unk_token is None:
-                    try:
-                        tok.add_special_tokens({"unk_token": "<unk>"})
-                    except Exception:
-                        pass
-                tok.pad_token = tok.unk_token or "</s>"
-
-        tokenizer, model, image_processor, context_len = tok, mdl, img_proc, ctx_len
         if torch.cuda.is_available():
             model = model.to(torch.device('cuda'))
             print("Model moved to CUDA")
         else:
             print("CUDA not available, using CPU")
         return True
     except Exception as e:
         print(f"Failed to initialize model: {e}")
         return False

-#
 def query(payload):
     global model_initialized
     if not model_initialized:
         print("Initializing model on first query...")
         model_initialized = initialize_model()
         if not model_initialized:
             return {"error": "Model initialization failed"}
-
     try:
-        # Log incoming keys
         print(f"[DEBUG] query payload keys={list(payload.keys()) if hasattr(payload,'keys') else 'N/A'}")
-
-        #
-        message_text = (payload.get("message") or
-
         repetition_penalty = float(payload.get("repetition_penalty", 1.0))
         conv_mode_override = payload.get("conv_mode", None)
-
-        use_stop = bool(payload.get("use_stop", True))  # stop criteria enabled by default
-
-        if not message_text:
-            return {"error": "Missing prompt text. Provide 'message' (or 'query'/'prompt'/'istem')."}
         if not image_input:
-            return {"error": "Missing image.
-
             message_text=message_text,
             image_input=image_input,
-            temperature=temperature,
-            top_p=top_p,
             max_output_tokens=max_output_tokens,
             repetition_penalty=repetition_penalty,
-            conv_mode_override=conv_mode_override,
-            do_sample=do_sample,
-            seed=seed,
-            use_stop=use_stop
         )
-
     except Exception as e:
         return {"error": f"Query failed: {str(e)}"}

-#
 def health_check():
     return {
         "status": "healthy",
         "model_initialized": model_initialized,
@@ -487,12 +599,18 @@ def health_check():
         "llava_available": LLAVA_AVAILABLE,
         "transformers_available": TRANSFORMERS_AVAILABLE,
         "cv2_available": CV2_AVAILABLE,
-        "lazy_loading": True
     }

 def get_model_info():
     if not model_initialized:
-        return {
     return {
         "model_path": args.model_path if args else "Unknown",
         "model_type": "PULSE-7B",
@@ -500,19 +618,37 @@ def get_model_info():
         "device": str(model.device) if model else "Unknown"
     }

-#
 class EndpointHandler:
     def __init__(self, model_dir):
         self.model_dir = model_dir
         print(f"EndpointHandler initialized with model_dir: {model_dir}")
     def __call__(self, payload):
         if "inputs" in payload:
-
     def health_check(self):
         return health_check()
     def get_model_info(self):
         return get_model_info()

 if __name__ == "__main__":
-    print("Handler loaded
+"""
+PULSE ECG Handler - Deterministic ECG Analysis Model
+
+This handler provides consistent, deterministic responses for ECG analysis.
+All generation parameters are fixed to ensure reproducible results across
+different API calls and clients.
+
+Key Features:
+- Deterministic generation (do_sample=False)
+- Fixed random seed for consistency
+- No temperature/top_p sampling parameters
+- Consistent response lengths and content
+"""

 import os
 import datetime
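A minimal client-side sketch of what the determinism claim implies, assuming a deployed Inference Endpoint (the URL and token below are placeholders, not from this PR): the same payload sent twice should return an identical "response" field.

    import requests

    API_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # hypothetical
    HEADERS = {"Authorization": "Bearer <HF_TOKEN>"}                 # hypothetical

    payload = {"inputs": {"message": "Interpret this ECG.", "image": "<base64 or URL>"}}
    r1 = requests.post(API_URL, headers=HEADERS, json=payload).json()
    r2 = requests.post(API_URL, headers=HEADERS, json=payload).json()
    # Holds if generation is truly deterministic (greedy decoding, fixed seed)
    assert r1.get("response") == r2.get("response")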
 from PIL import Image
 from io import BytesIO

+# Try to import cv2, but make it optional
 try:
     import cv2
     CV2_AVAILABLE = True

     CV2_AVAILABLE = False
     print("Warning: cv2 (OpenCV) not available. Video processing will be disabled.")

+# Try to import llava modules, but make them optional
 try:
     from llava import conversation as conversation_lib
     from llava.constants import DEFAULT_IMAGE_TOKEN

     LLAVA_AVAILABLE = False
     print(f"Warning: LLaVA modules not available: {e}")

+# Try to import transformers
 try:
+    from transformers import TextStreamer, TextIteratorStreamer
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
     TRANSFORMERS_AVAILABLE = False
     print("Warning: Transformers not available")
+# Try to import huggingface_hub
 try:
     from huggingface_hub import HfApi, login
     HF_HUB_AVAILABLE = True

     HF_HUB_AVAILABLE = False
     print("Warning: Hugging Face Hub not available")

+# Initialize Hugging Face API
 if HF_HUB_AVAILABLE and "HF_TOKEN" in os.environ:
     try:
         login(token=os.environ["HF_TOKEN"], write_permission=True)

     api = None
     repo_name = ""

 external_log_dir = "./logs"
 LOGDIR = external_log_dir
 VOTEDIR = "./votes"
+# Global variables for model and tokenizer
 tokenizer = None
 model = None
 image_processor = None
 context_len = None
 args = None

 def get_conv_log_filename():
     t = datetime.datetime.now()
+    name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-user_conv.json")
+    return name

 def get_conv_vote_filename():
     t = datetime.datetime.now()
     if api and repo_name:
         try:
             with open(get_conv_vote_filename(), "a") as fout:
+                data = {
+                    "type": vote_type,
+                    "model": model_selector,
+                    "state": state,
+                }
+                fout.write(json.dumps(data) + "\n")
+
             api.upload_file(
                 path_or_fileobj=get_conv_vote_filename(),
                 path_in_repo=get_conv_vote_filename().replace("./votes/", ""),
 def is_valid_video_filename(name):
     if not CV2_AVAILABLE:
+        return False  # Video processing disabled
+    video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
+    ext = name.split(".")[-1].lower()
+    return ext in video_extensions

 def is_valid_image_filename(name):
+    image_extensions = ["jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp", "heic", "heif", "jfif", "svg", "eps", "raw"]
+    ext = name.split(".")[-1].lower()
+    return ext in image_extensions
+
+def sample_frames(video_file, num_frames):
+    if not CV2_AVAILABLE:
+        raise ImportError("cv2 (OpenCV) not available. Video processing is disabled.")
+
+    video = cv2.VideoCapture(video_file)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    interval = total_frames // num_frames
+    frames = []
+    for i in range(total_frames):
+        ret, frame = video.read()
+        if not ret:
+            continue
+        if i % interval == 0:
+            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frames.append(pil_img)
+    video.release()
+    return frames
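A short usage sketch for sample_frames, with a hypothetical local file. Note it implicitly assumes total_frames >= num_frames: interval is an integer division, and i % interval raises ZeroDivisionError when interval is 0.

    frames = sample_frames("example.mp4", num_frames=16)  # "example.mp4" is hypothetical
    print(f"sampled {len(frames)} PIL frames")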
 def load_image(image_file):
+    if image_file.startswith("http") or image_file.startswith("https"):
+        response = requests.get(image_file)
+        if response.status_code == 200:
+            image = Image.open(BytesIO(response.content)).convert("RGB")
+        else:
+            raise ValueError("Failed to load image from URL")
+    else:
+        print("Load image from local file")
+        print(image_file)
+        image = Image.open(image_file).convert("RGB")
+    return image
 def process_base64_image(base64_string):
+    """Process base64 encoded image string"""
+    try:
+        # Remove data URL prefix if present
+        if base64_string.startswith('data:image'):
+            base64_string = base64_string.split(',')[1]
+
+        # Decode base64 to bytes
+        image_data = base64.b64decode(base64_string)
+
+        # Convert to PIL Image
+        image = Image.open(BytesIO(image_data)).convert("RGB")
+        return image
+    except Exception as e:
+        raise ValueError(f"Failed to process base64 image: {e}")
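Since process_base64_image strips an optional data-URL prefix, a client can submit either raw base64 or a full data URL; a sketch using only the standard library (the input file name is hypothetical):

    import base64

    with open("ecg.jpg", "rb") as f:  # hypothetical input file
        raw_b64 = base64.b64encode(f.read()).decode("utf-8")

    payload_image = raw_b64                                      # accepted as-is
    payload_image_data_url = "data:image/jpeg;base64," + raw_b64  # prefix is stripped before decoding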
 def process_image_input(image_input):
+    """Process different types of image input (file path, URL, or base64)"""
     if isinstance(image_input, str):
         if image_input.startswith("http"):
             return load_image(image_input)
         elif os.path.exists(image_input):
             return load_image(image_input)
         else:
+            # Try to process as base64
             return process_base64_image(image_input)
     elif isinstance(image_input, dict) and "image" in image_input:
+        # Handle base64 image from dict
         return process_base64_image(image_input["image"])
     else:
         raise ValueError("Unsupported image input format")
 class InferenceDemo(object):
     def __init__(self, args, model_path, tokenizer, model, image_processor, context_len) -> None:
         if not LLAVA_AVAILABLE:
             raise ImportError("LLaVA modules not available")
+
         disable_torch_init()
+
         self.tokenizer, self.model, self.image_processor, self.context_len = (
+            tokenizer,
+            model,
+            image_processor,
+            context_len,
         )
+
         model_name = get_model_name_from_path(model_path)
         if "llama-2" in model_name.lower():
             conv_mode = "llava_llama_2"

             conv_mode = "qwen_1_5"
         else:
             conv_mode = "llava_v0"
+
         if args.conv_mode is not None and conv_mode != args.conv_mode:
+            print(
+                "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
+                    conv_mode, args.conv_mode, args.conv_mode
+                )
+            )
         else:
             args.conv_mode = conv_mode
+        self.conv_mode = conv_mode
+        self.conversation = conv_templates[args.conv_mode].copy()
         self.num_frames = args.num_frames
 class ChatSessionManager:
     def __init__(self):
         self.chatbot_instance = None
+
     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
+
     def reset_chatbot(self):
         self.chatbot_instance = None
+
     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
         if self.chatbot_instance is None:
             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)

 chat_manager = ChatSessionManager()
 def clear_history():
+    """Clear conversation history"""
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
+
     try:
+        chatbot_instance = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
+        try:
+            if hasattr(chatbot_instance, 'conv_mode') and chatbot_instance.conv_mode and LLAVA_AVAILABLE:
+                chatbot_instance.conversation = conv_templates[chatbot_instance.conv_mode].copy()
+            else:
+                # Use default conversation template
+                chatbot_instance.conversation = chatbot_instance.conversation.__class__()
+        except Exception as e:
+            print(f"[DEBUG] Failed to reset conversation in clear_history: {e}")
         return {"status": "success", "message": "Conversation history cleared"}
     except Exception as e:
         return {"error": f"Failed to clear history: {str(e)}"}

+def add_message(message_text, image_input=None):
+    """Add a message to the conversation"""
+    return {"status": "success", "message": "Message added"}
+
+def generate_response(message_text, image_input, max_output_tokens=4096, repetition_penalty=1.0, conv_mode_override=None):
+    """Generate response for the given message and image using deterministic generation for consistency"""
     if not LLAVA_AVAILABLE:
         return {"error": "LLaVA modules not available"}
+
     try:
         if not message_text or not image_input:
             return {"error": "Both message text and image are required"}
+
+        our_chatbot = chat_manager.get_chatbot(args, args.model_path if args else "PULSE-ECG/PULSE-7B", tokenizer, model, image_processor, context_len)
+
+        # Process image input
+        try:
+            image = process_image_input(image_input)
+        except Exception as e:
+            return {"error": f"Failed to process image: {str(e)}"}
+
+        # Save image for logging
+        all_image_hash = []
+        all_image_path = []
+
+        # Generate hash for the image
         img_byte_arr = BytesIO()
         image.save(img_byte_arr, format='JPEG')
+        img_byte_arr = img_byte_arr.getvalue()
+        image_hash = hashlib.md5(img_byte_arr).hexdigest()
+        all_image_hash.append(image_hash)
+
         # Save image to logs
         t = datetime.datetime.now()
+        filename = os.path.join(
+            LOGDIR,
+            "serve_images",
+            f"{t.year}-{t.month:02d}-{t.day:02d}",
+            f"{image_hash}.jpg",
+        )
+        all_image_path.append(filename)
+        if not os.path.isfile(filename):
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+            print("image save to", filename)
+            image.save(filename)
+
+        # Process image for model
+        try:
+            print(f"[DEBUG] Processing image for model...")
+            processed_images = process_images([image], our_chatbot.image_processor, our_chatbot.model.config)
+            print(f"[DEBUG] Processed images length: {len(processed_images)}")
+
+            if len(processed_images) == 0:
+                return {"error": "Image processing returned empty list"}
+
+            image_tensor = processed_images[0]
+            image_tensor = image_tensor.half().to(our_chatbot.model.device)
+            image_tensor = image_tensor.unsqueeze(0)
+            print(f"[DEBUG] Image tensor shape: {image_tensor.shape}")
+        except Exception as e:
+            print(f"[DEBUG] Image processing error: {str(e)}")
+            return {"error": f"Image processing failed: {str(e)}"}
+
+        # Prepare conversation - reset for each request to avoid history issues
+        try:
+            if hasattr(our_chatbot, 'conv_mode') and our_chatbot.conv_mode and LLAVA_AVAILABLE:
+                our_chatbot.conversation = conv_templates[our_chatbot.conv_mode].copy()
+            else:
+                # Use default conversation template
+                our_chatbot.conversation = our_chatbot.conversation.__class__()
+        except Exception as e:
+            print(f"[DEBUG] Failed to reset conversation: {e}")
+            # Continue with existing conversation
+
         inp = DEFAULT_IMAGE_TOKEN + "\n" + message_text
+        our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
+        our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
+        prompt = our_chatbot.conversation.get_prompt()
+
+        # Tokenize input
+        input_ids = tokenizer_image_token(
+            prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+        ).unsqueeze(0).to(our_chatbot.model.device)
+
+        # No stopping criteria - let model generate freely up to max_new_tokens
+        print(f"[DEBUG] No stopping criteria - free generation up to {max_output_tokens} tokens")
         stopping_criteria = None
+
+        # Set seed for deterministic generation
+        # This ensures the same input always produces the same output
+        torch.manual_seed(42)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed(42)
+            torch.cuda.manual_seed_all(42)
+
+        # Generate response using deterministic greedy decoding
+        # This eliminates randomness and ensures consistent responses
         with torch.no_grad():
+            outputs = our_chatbot.model.generate(
                 inputs=input_ids,
                 images=image_tensor,
+                do_sample=False,  # Deterministic generation for consistency
+                max_new_tokens=max_output_tokens,
+                repetition_penalty=repetition_penalty,
+                use_cache=False,
+                pad_token_id=our_chatbot.tokenizer.eos_token_id,
+                eos_token_id=our_chatbot.tokenizer.eos_token_id,
+                length_penalty=1.0,  # Don't penalize longer sequences
             )
+
+        # Decode response
+        try:
+            print(f"[DEBUG] Outputs shape: {outputs.shape if hasattr(outputs, 'shape') else 'No shape attr'}")
+            print(f"[DEBUG] Outputs length: {len(outputs) if hasattr(outputs, '__len__') else 'No length'}")
+            print(f"[DEBUG] Input IDs shape: {input_ids.shape}")
+
+            if len(outputs) == 0:
+                return {"error": "Model generated empty output"}
+
+            response = our_chatbot.tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
+
+            print(f"[DEBUG] Conversation messages length: {len(our_chatbot.conversation.messages)}")
+            if len(our_chatbot.conversation.messages) > 0:
+                last_message = our_chatbot.conversation.messages[-1]
+                print(f"[DEBUG] Last message: {last_message}")
+                if isinstance(last_message, list) and len(last_message) > 1:
+                    our_chatbot.conversation.messages[-1][-1] = response
+                    print(f"[DEBUG] Response added to conversation")
+                else:
+                    print(f"[DEBUG] Last message format unexpected: {last_message}")
+                    # Add response as new message if format is wrong
+                    our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
+            else:
+                print("[DEBUG] No conversation messages found")
+                # Add response as new message
+                our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], response)
+
+            print(f"[DEBUG] Generated response length: {len(response)}")
+        except Exception as e:
+            print(f"[DEBUG] Response decoding error: {str(e)}")
+            return {"error": f"Response decoding failed: {str(e)}"}
+
+        # Log conversation
+        history = [(message_text, response)]
         with open(get_conv_log_filename(), "a") as fout:
+            data = {
                 "type": "chat",
                 "model": "PULSE-7b",
+                "state": history,
+                "images": all_image_hash,
+                "images_path": all_image_path
+            }
+            print("#### conv log", data)
+            fout.write(json.dumps(data) + "\n")
+
+        # Upload files to Hugging Face if configured
+        if api and repo_name:
+            try:
+                for upload_img in all_image_path:
+                    api.upload_file(
+                        path_or_fileobj=upload_img,
+                        path_in_repo=upload_img.replace("./logs/", ""),
+                        repo_id=repo_name,
+                        repo_type="dataset",
+                    )
+
+                # Upload conversation log
+                api.upload_file(
+                    path_or_fileobj=get_conv_log_filename(),
+                    path_in_repo=get_conv_log_filename().replace("./logs/", ""),
+                    repo_id=repo_name,
+                    repo_type="dataset")
+            except Exception as e:
+                print(f"Failed to upload files: {e}")
+
+        return {
+            "status": "success",
+            "response": response,
+            "conversation_id": id(our_chatbot.conversation)
+        }
+
     except Exception as e:
         return {"error": f"Generation failed: {str(e)}"}
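generate_response returns either {"error": ...} or a success dict with "status", "response", and "conversation_id"; a minimal caller sketch (the message and image values are hypothetical):

    result = generate_response(
        message_text="Describe this ECG.",
        image_input="https://example.com/ecg.png",  # URL, local path, or base64 are all accepted
    )
    if "error" in result:
        print("generation failed:", result["error"])
    else:
        print(result["conversation_id"], result["response"])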
 def upvote_last_response(conversation_id):
+    """Upvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "upvote", "PULSE-7B")
+        return {"status": "success", "message": "Thank you for your voting!"}
     except Exception as e:
+        return {"error": f"Failed to upvote: {str(e)}"}

 def downvote_last_response(conversation_id):
+    """Downvote the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "downvote", "PULSE-7B")
+        return {"status": "success", "message": "Thank you for your voting!"}
     except Exception as e:
+        return {"error": f"Failed to downvote: {str(e)}"}

 def flag_response(conversation_id):
+    """Flag the last response"""
     try:
         vote_last_response({"conversation_id": conversation_id}, "flag", "PULSE-7B")
+        return {"status": "success", "message": "Response flagged successfully"}
     except Exception as e:
+        return {"error": f"Failed to flag response: {str(e)}"}
+# Initialize model when module is imported
 def initialize_model():
+    """Initialize the model and tokenizer"""
     global tokenizer, model, image_processor, context_len, args
+
     if not LLAVA_AVAILABLE:
         print("LLaVA modules not available, skipping model initialization")
         return False
+
     try:
+        # Set default arguments
         class Args:
             def __init__(self):
                 self.model_path = "PULSE-ECG/PULSE-7B"
                 self.model_base = None
                 self.num_gpus = 1
                 self.conv_mode = None
                 self.max_new_tokens = 1024
                 self.num_frames = 16
                 self.load_8bit = False
                 self.load_4bit = False
                 self.debug = False
+
         args = Args()
+
+        # Load model
+        model_path = args.model_path
         model_name = get_model_name_from_path(args.model_path)
+        tokenizer, model, image_processor, context_len = load_pretrained_model(
             args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit
         )
+
+        print("### image_processor", image_processor)
+        print("### tokenizer", tokenizer)
+
+        # Move model to GPU if available
         if torch.cuda.is_available():
             model = model.to(torch.device('cuda'))
             print("Model moved to CUDA")
         else:
             print("CUDA not available, using CPU")
+
         return True
+
     except Exception as e:
         print(f"Failed to initialize model: {e}")
         return False
+# Don't initialize model on import - do it lazily
+model_initialized = False
+
+# Main endpoint function for Hugging Face
 def query(payload):
+    """Main endpoint function for Hugging Face inference API"""
     global model_initialized
+
+    # Lazy initialization - initialize model on first call
     if not model_initialized:
         print("Initializing model on first query...")
         model_initialized = initialize_model()
         if not model_initialized:
             return {"error": "Model initialization failed"}
+
     try:
         print(f"[DEBUG] query payload keys={list(payload.keys()) if hasattr(payload,'keys') else 'N/A'}")
+
+        # Extract prompt with multiple possible keys
+        message_text = (payload.get("message") or
+                        payload.get("query") or
+                        payload.get("prompt") or
+                        payload.get("istem") or "")
+
+        # Extract image with multiple possible keys
+        image_input = (payload.get("image") or
+                       payload.get("image_url") or
+                       payload.get("img") or None)
+
+        # Extract generation parameters with fallbacks
+        max_output_tokens = int(payload.get("max_output_tokens",
+                                payload.get("max_new_tokens",
+                                payload.get("max_tokens", 8192))))
         repetition_penalty = float(payload.get("repetition_penalty", 1.0))
         conv_mode_override = payload.get("conv_mode", None)
+
+        if not message_text or not message_text.strip():
+            return {"error": "Missing prompt text. Use 'message', 'query', 'prompt', or 'istem' key"}
+
         if not image_input:
+            return {"error": "Missing image. Use 'image', 'image_url', or 'img' key"}
+
+        # Generate response with deterministic parameters
+        result = generate_response(
             message_text=message_text,
             image_input=image_input,
             max_output_tokens=max_output_tokens,
             repetition_penalty=repetition_penalty,
+            conv_mode_override=conv_mode_override
         )
+
+        return result
+
     except Exception as e:
         return {"error": f"Query failed: {str(e)}"}
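query accepts several alias keys for the prompt, the image, and the token budget; an illustrative payload (values hypothetical):

    payload = {
        "prompt": "Is this rhythm sinus?",           # or "message" / "query" / "istem"
        "image_url": "https://example.com/ecg.png",  # or "image" / "img"
        "max_new_tokens": 1024,                      # or "max_output_tokens" / "max_tokens" (default 8192)
        "repetition_penalty": 1.0,
    }
    result = query(payload)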
+# Additional utility endpoints
 def health_check():
+    """Health check endpoint"""
     return {
         "status": "healthy",
         "model_initialized": model_initialized,

         "llava_available": LLAVA_AVAILABLE,
         "transformers_available": TRANSFORMERS_AVAILABLE,
         "cv2_available": CV2_AVAILABLE,
+        "lazy_loading": True  # Model will be loaded on first query
     }
 def get_model_info():
+    """Get model information"""
     if not model_initialized:
+        return {
+            "error": "Model not initialized yet",
+            "lazy_loading": True,
+            "note": "Model will be loaded on first query"
+        }
+
     return {
         "model_path": args.model_path if args else "Unknown",
         "model_type": "PULSE-7B",

         "device": str(model.device) if model else "Unknown"
     }
+# Hugging Face EndpointHandler class
 class EndpointHandler:
+    """Hugging Face endpoint handler class"""
+
     def __init__(self, model_dir):
+        """Initialize the endpoint handler"""
         self.model_dir = model_dir
         print(f"EndpointHandler initialized with model_dir: {model_dir}")
+
     def __call__(self, payload):
+        """Main endpoint function - handles Hugging Face payload format"""
+        # Hugging Face sends payload in "inputs" wrapper
         if "inputs" in payload:
+            # Extract the actual payload from inputs wrapper
+            actual_payload = payload["inputs"]
+            return query(actual_payload)
+        else:
+            # Direct payload (for backward compatibility)
+            return query(payload)
+
     def health_check(self):
+        """Health check endpoint"""
         return health_check()
+
     def get_model_info(self):
+        """Get model information"""
         return get_model_info()

+# For backward compatibility and testing
 if __name__ == "__main__":
+    print("Handler module loaded successfully!")
+    print("This handler is now ready for Hugging Face endpoints.")
+    print("Use the 'query' function as the main endpoint.")
+    print("Or use EndpointHandler class for Hugging Face compatibility.")