Brain-LLM
/

phi4-mini-raw

Safetensors

phi3

custom_code

Model card Files Files and versions

xet

Community

Yong Liu committed on Apr 17, 2025

Commit

1b1b06a

1 Parent(s): fd19926

update handler py

Browse files

Files changed (1) hide show

handler.py +96 -9

handler.py CHANGED Viewed

@@ -76,6 +76,33 @@ class EndpointHandler:
             logger.error(f"Error during model initialization: {e}")
             raise
     def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, str], Generator]:
         """
         Process the input data and generate a response using the Phi-4 model.
@@ -91,8 +118,45 @@ class EndpointHandler:
             if "inputs" not in data:
                 logger.warning("No 'inputs' field in request data")
                 return {"error": "Missing 'inputs' field in request"}
-            prompt = data.get("inputs", "")
             parameters = data.get("parameters", {})
             logger.info(f"Processing input with {len(prompt)} characters")
@@ -117,19 +181,19 @@ class EndpointHandler:
             attention_mask = torch.ones_like(input_ids)
             # Perform safe generation with error handling for out-of-vocabulary issues
-            return self._safe_generate(input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample)
         except Exception as e:
             logger.error(f"Error during generation: {e}")
             return {"error": str(e)}
-    def _safe_generate(self, input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample):
         """Safely generate text handling potential token index errors"""
         try:
             with torch.no_grad():
                 # Get the input text to exclude from final output
-                input_text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
-                logger.info(f"Input decoded text: '{input_text}'")
                 # Generate one token at a time to avoid index errors
                 max_steps = min(max_new_tokens, 100)  # Limit to 100 tokens for testing
@@ -181,10 +245,13 @@ class EndpointHandler:
                 # Decode the generated sequence
                 generated_text = self.tokenizer.decode(current_ids[0], skip_special_tokens=True)
-                # Return only the newly generated text (without the prompt)
-                if generated_text.startswith(input_text):
-                    response_text = generated_text[len(input_text):]
                 else:
                     response_text = generated_text
                 logger.info(f"Generated {len(response_text)} characters")
@@ -283,5 +350,25 @@ class EndpointHandler:
 if __name__ == "__main__":
     # Example usage
     handler = EndpointHandler()
-    result = handler({"inputs": "What are the major features of Phi-4?"})
     print(result)

             logger.error(f"Error during model initialization: {e}")
             raise
+    def format_prompt_with_system(self, user_message, system_message=None):
+        """
+        Build a single-turn chat prompt from a user message and an optional system message.
+        Args:
+            user_message (str): The user's message; inserted into the prompt verbatim
+            system_message (str, optional): The system instruction; None or an empty string omits the system section
+        Returns:
+            str: Prompt text that ends with the <|assistant|> tag (no trailing newline)
+        """
+        # Layout produced below (the author's rendering of the Phi-4 chat format; no end-of-turn markers are added):
+        # <|system|>
+        # {system_message}
+        # <|user|>
+        # {user_message}
+        # <|assistant|>
+        if system_message:
+            prompt = f"<|system|>\n{system_message}\n<|user|>\n{user_message}\n<|assistant|>"
+        else:
+            # No (or empty) system message: emit only the user turn followed by the assistant tag
+            prompt = f"<|user|>\n{user_message}\n<|assistant|>"
+        logger.info(f"Formatted prompt with {'system message and ' if system_message else ''}user message")
+        return prompt
     def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, str], Generator]:
         """
         Process the input data and generate a response using the Phi-4 model.
             if "inputs" not in data:
                 logger.warning("No 'inputs' field in request data")
                 return {"error": "Missing 'inputs' field in request"}
+            # Handle different input formats
+            # 1. Direct string input
+            if isinstance(data["inputs"], str):
+                user_message = data["inputs"]
+                system_message = data.get("parameters", {}).get("system_message", None)
+            # 2. Dict with messages format
+            elif isinstance(data["inputs"], dict) and "messages" in data["inputs"]:
+                messages = data["inputs"]["messages"]
+                # Extract system and user messages
+                system_message = None
+                user_message = ""
+                # Process messages in order, using the last user message
+                for msg in messages:
+                    if msg.get("role") == "system":
+                        system_message = msg.get("content", "")
+                    elif msg.get("role") == "user":
+                        user_message = msg.get("content", "")
+            # 3. Direct messages list format
+            elif isinstance(data["inputs"], list):
+                messages = data["inputs"]
+                # Extract system and user messages
+                system_message = None
+                user_message = ""
+                # Process messages in order, using the last user message
+                for msg in messages:
+                    if msg.get("role") == "system":
+                        system_message = msg.get("content", "")
+                    elif msg.get("role") == "user":
+                        user_message = msg.get("content", "")
+            else:
+                logger.warning("Unsupported input format")
+                return {"error": "Unsupported input format. Expected string or messages object."}
+            # Format the prompt with system and user messages
+            prompt = self.format_prompt_with_system(user_message, system_message)
             parameters = data.get("parameters", {})
             logger.info(f"Processing input with {len(prompt)} characters")
             attention_mask = torch.ones_like(input_ids)
             # Perform safe generation with error handling for out-of-vocabulary issues
+            return self._safe_generate(input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample, prompt)
         except Exception as e:
             logger.error(f"Error during generation: {e}")
             return {"error": str(e)}
+    def _safe_generate(self, input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample, prompt):
         """Safely generate text handling potential token index errors"""
         try:
             with torch.no_grad():
                 # Get the input text to exclude from final output
+                input_text = prompt
+                logger.info(f"Input prompt length: {len(input_text)} characters")
                 # Generate one token at a time to avoid index errors
                 max_steps = min(max_new_tokens, 100)  # Limit to 100 tokens for testing
                 # Decode the generated sequence
                 generated_text = self.tokenizer.decode(current_ids[0], skip_special_tokens=True)
+                # Return only the newly generated text (after the assistant tag)
+                split_text = generated_text.split("<|assistant|>")
+                if len(split_text) > 1:
+                    response_text = split_text[1].strip()
                 else:
+                    # Fallback if the expected format is not found
+                    logger.warning("Could not find assistant tag in generated text")
                     response_text = generated_text
                 logger.info(f"Generated {len(response_text)} characters")
 if __name__ == "__main__":
     # Example usage
     handler = EndpointHandler()
+    # Request 1: plain-string input, with the system message passed via "parameters"
+    test_with_system = {
+        "inputs": "What are the major features of Phi-4?",
+        "parameters": {
+            "system_message": "You are an AI assistant that provides helpful, accurate, and concise information about AI models."
+        }
+    }
+    # Request 2: chat-style input under "inputs" -> "messages"; built here but not executed below
+    test_with_messages = {
+        "inputs": {
+            "messages": [
+                {"role": "system", "content": "You are an AI assistant that provides helpful, accurate, and concise information about AI models."},
+                {"role": "user", "content": "What are the major features of Phi-4?"}
+            ]
+        }
+    }
+    # Only test_with_system is run; pass test_with_messages instead to exercise the messages path
+    result = handler(test_with_system)
     print(result)