Brain-LLM
/

phi4-mini-raw

Safetensors

phi3

custom_code

Model card Files Files and versions

xet

Community

Yong Liu committed on Apr 17, 2025

Commit

cec06c5

1 Parent(s): 1b1b06a

Update handler.py

Browse files

Files changed (1) hide show

handler.py +50 -23

handler.py CHANGED Viewed

@@ -21,7 +21,7 @@ class EndpointHandler:
             path (str): Path to the model directory
         """
         # Set default parameters for inference
-        self.max_new_tokens = 1024  # Reduced from 4096 to avoid memory issues
         self.temperature = 0.7
         self.top_p = 0.9
         self.do_sample = True
@@ -103,7 +103,7 @@ class EndpointHandler:
         logger.info(f"Formatted prompt with {'system message and ' if system_message else ''}user message")
         return prompt
-    def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, str], Generator]:
         """
         Process the input data and generate a response using the Phi-4 model.
@@ -111,7 +111,7 @@ class EndpointHandler:
             data (Dict[str, Any]): Input data containing the prompt and generation parameters
         Returns:
-            Dict[str, str] or Generator: Model response or stream
         """
         try:
             # Extract input parameters with defaults
@@ -124,27 +124,34 @@ class EndpointHandler:
             if isinstance(data["inputs"], str):
                 user_message = data["inputs"]
                 system_message = data.get("parameters", {}).get("system_message", None)
             # 2. Dict with messages format
             elif isinstance(data["inputs"], dict) and "messages" in data["inputs"]:
                 messages = data["inputs"]["messages"]
-                # Extract system and user messages
                 system_message = None
                 user_message = ""
-                # Process messages in order, using the last user message
                 for msg in messages:
                     if msg.get("role") == "system":
                         system_message = msg.get("content", "")
                     elif msg.get("role") == "user":
                         user_message = msg.get("content", "")
             # 3. Direct messages list format
             elif isinstance(data["inputs"], list):
                 messages = data["inputs"]
-                # Extract system and user messages
                 system_message = None
                 user_message = ""
-                # Process messages in order, using the last user message
                 for msg in messages:
                     if msg.get("role") == "system":
                         system_message = msg.get("content", "")
@@ -153,7 +160,7 @@ class EndpointHandler:
             else:
                 logger.warning("Unsupported input format")
                 return {"error": "Unsupported input format. Expected string or messages object."}
             # Format the prompt with system and user messages
             prompt = self.format_prompt_with_system(user_message, system_message)
@@ -162,7 +169,7 @@ class EndpointHandler:
             logger.info(f"Processing input with {len(prompt)} characters")
             # Get generation parameters with fallbacks to defaults
-            max_new_tokens = min(parameters.get("max_new_tokens", self.max_new_tokens), 2048)
             temperature = parameters.get("temperature", self.temperature)
             top_p = parameters.get("top_p", self.top_p)
             do_sample = parameters.get("do_sample", self.do_sample)
@@ -181,7 +188,35 @@ class EndpointHandler:
             attention_mask = torch.ones_like(input_ids)
             # Perform safe generation with error handling for out-of-vocabulary issues
-            return self._safe_generate(input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample, prompt)
         except Exception as e:
             logger.error(f"Error during generation: {e}")
@@ -196,7 +231,7 @@ class EndpointHandler:
                 logger.info(f"Input prompt length: {len(input_text)} characters")
                 # Generate one token at a time to avoid index errors
-                max_steps = min(max_new_tokens, 100)  # Limit to 100 tokens for testing
                 current_ids = input_ids.clone()
                 for _ in range(max_steps):
@@ -255,7 +290,7 @@ class EndpointHandler:
                     response_text = generated_text
                 logger.info(f"Generated {len(response_text)} characters")
-                return {"generated_text": response_text}
         except Exception as e:
             logger.error(f"Error in _safe_generate: {str(e)}")
@@ -295,7 +330,7 @@ class EndpointHandler:
                 response_text = generated_text
             logger.info(f"Generated {len(response_text)} characters")
-            return {"generated_text": response_text}
         except Exception as e:
             logger.error(f"Error in _generate: {e}")
@@ -351,14 +386,6 @@ if __name__ == "__main__":
     # Example usage
     handler = EndpointHandler()
-    # Test with system message
-    test_with_system = {
-        "inputs": "What are the major features of Phi-4?",
-        "parameters": {
-            "system_message": "You are an AI assistant that provides helpful, accurate, and concise information about AI models."
-        }
-    }
     # Test with messages format
     test_with_messages = {
         "inputs": {
@@ -369,6 +396,6 @@ if __name__ == "__main__":
         }
     }
-    # Choose which test to run
-    result = handler(test_with_system)
     print(result)

             path (str): Path to the model directory
         """
         # Set default parameters for inference
+        self.max_new_tokens = 2048  # Increased from 1024 to handle longer outputs
         self.temperature = 0.7
         self.top_p = 0.9
         self.do_sample = True
         logger.info(f"Formatted prompt with {'system message and ' if system_message else ''}user message")
         return prompt
+    def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], Generator]:
         """
         Process the input data and generate a response using the Phi-4 model.
             data (Dict[str, Any]): Input data containing the prompt and generation parameters
         Returns:
+            Dict[str, Any] or Generator: Model response
         """
         try:
             # Extract input parameters with defaults
             if isinstance(data["inputs"], str):
                 user_message = data["inputs"]
                 system_message = data.get("parameters", {}).get("system_message", None)
+                messages = [
+                    {"role": "system", "content": system_message} if system_message else None,
+                    {"role": "user", "content": user_message}
+                ]
+                messages = [m for m in messages if m is not None]  # Remove None values
             # 2. Dict with messages format
             elif isinstance(data["inputs"], dict) and "messages" in data["inputs"]:
                 messages = data["inputs"]["messages"]
+                # Extract system and user messages for prompt formatting
                 system_message = None
                 user_message = ""
                 for msg in messages:
                     if msg.get("role") == "system":
                         system_message = msg.get("content", "")
                     elif msg.get("role") == "user":
                         user_message = msg.get("content", "")
             # 3. Direct messages list format
             elif isinstance(data["inputs"], list):
                 messages = data["inputs"]
+                # Extract system and user messages for prompt formatting
                 system_message = None
                 user_message = ""
                 for msg in messages:
                     if msg.get("role") == "system":
                         system_message = msg.get("content", "")
             else:
                 logger.warning("Unsupported input format")
                 return {"error": "Unsupported input format. Expected string or messages object."}
             # Format the prompt with system and user messages
             prompt = self.format_prompt_with_system(user_message, system_message)
             logger.info(f"Processing input with {len(prompt)} characters")
             # Get generation parameters with fallbacks to defaults
+            max_new_tokens = min(parameters.get("max_new_tokens", self.max_new_tokens), 4096)  # Increased to 4096
             temperature = parameters.get("temperature", self.temperature)
             top_p = parameters.get("top_p", self.top_p)
             do_sample = parameters.get("do_sample", self.do_sample)
             attention_mask = torch.ones_like(input_ids)
             # Perform safe generation with error handling for out-of-vocabulary issues
+            response_text = self._safe_generate(input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample, prompt)
+            # Format response in OpenAI-style format
+            if isinstance(response_text, dict) and "error" in response_text:
+                return response_text
+            else:
+                # OpenAI-style response format
+                openai_response = {
+                    "id": f"phi4-{int(torch.randint(10000, 99999, (1,)).item())}",
+                    "object": "chat.completion",
+                    "created": int(torch.cuda.current_stream().cuda_stream if torch.cuda.is_available() else 0),
+                    "model": "phi-4-mini",
+                    "choices": [
+                        {
+                            "index": 0,
+                            "message": {
+                                "role": "assistant",
+                                "content": response_text
+                            },
+                            "finish_reason": "stop"
+                        }
+                    ],
+                    "usage": {
+                        "prompt_tokens": len(input_ids[0]),
+                        "completion_tokens": len(self.tokenizer.encode(response_text)) if isinstance(response_text, str) else 0,
+                        "total_tokens": len(input_ids[0]) + (len(self.tokenizer.encode(response_text)) if isinstance(response_text, str) else 0)
+                    }
+                }
+                return openai_response
         except Exception as e:
             logger.error(f"Error during generation: {e}")
                 logger.info(f"Input prompt length: {len(input_text)} characters")
                 # Generate one token at a time to avoid index errors
+                max_steps = max_new_tokens  # Allow for full generation length
                 current_ids = input_ids.clone()
                 for _ in range(max_steps):
                     response_text = generated_text
                 logger.info(f"Generated {len(response_text)} characters")
+                return response_text
         except Exception as e:
             logger.error(f"Error in _safe_generate: {str(e)}")
                 response_text = generated_text
             logger.info(f"Generated {len(response_text)} characters")
+            return response_text
         except Exception as e:
             logger.error(f"Error in _generate: {e}")
     # Example usage
     handler = EndpointHandler()
     # Test with messages format
     test_with_messages = {
         "inputs": {
         }
     }
+    # Run the test
+    result = handler(test_with_messages)
     print(result)