Yong Liu committed on
Commit
02d7d65
·
1 Parent(s): 4aa4d08

update handler

Browse files
Files changed (2) hide show
  1. README.md +0 -81
  2. handler.py +436 -229
README.md DELETED
@@ -1,81 +0,0 @@
1
- # Phi-4 Mini Inference Endpoint Handler
2
-
3
- This repository contains code for deploying the Phi-4 Mini model to a HuggingFace Inference Endpoint with an OpenAI-compatible API format.
4
-
5
- ## Setup
6
-
7
- 1. Install the required dependencies:
8
- ```
9
- pip install -r requirements.txt
10
- ```
11
-
12
- 2. Set the environment variable to your model path (optional if model is in the same directory):
13
- ```
14
- export MODEL_PATH=/path/to/your/model
15
- ```
16
-
17
- ## Usage
18
-
19
- When deploying to a HuggingFace Inference Endpoint, the `handler.py` file will be used to process requests. The endpoint accepts requests in an OpenAI-compatible format:
20
-
21
- ```json
22
- {
23
- "messages": [
24
- {"role": "system", "content": "You are a helpful assistant."},
25
- {"role": "user", "content": "Tell me about language models."}
26
- ],
27
- "max_tokens": 256,
28
- "temperature": 0.7,
29
- "top_p": 1.0,
30
- "n": 1,
31
- "stop": ["\n", "User:"]
32
- }
33
- ```
34
-
35
- The endpoint returns responses in an OpenAI-compatible format:
36
-
37
- ```json
38
- {
39
- "id": "cmpl-12345",
40
- "object": "chat.completion",
41
- "created": 0,
42
- "model": "phi4-mini-raw",
43
- "choices": [
44
- {
45
- "index": 0,
46
- "message": {
47
- "role": "assistant",
48
- "content": "Language models are computational systems designed to understand and generate human language..."
49
- },
50
- "finish_reason": "stop"
51
- }
52
- ],
53
- "usage": {
54
- "prompt_tokens": 42,
55
- "completion_tokens": 156,
56
- "total_tokens": 198
57
- }
58
- }
59
- ```
60
-
61
- ## Local Testing
62
-
63
- To test the handler locally before deployment:
64
-
65
- ```python
66
- from handler import EndpointHandler
67
-
68
- # Initialize the handler with your model path
69
- handler = EndpointHandler("./phi4-mini-raw")
70
-
71
- # Test with a sample request
72
- request = {
73
- "messages": [
74
- {"role": "system", "content": "You are a helpful assistant."},
75
- {"role": "user", "content": "Hello, how are you?"}
76
- ]
77
- }
78
-
79
- response = handler(request)
80
- print(response)
81
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
handler.py CHANGED
@@ -1,265 +1,472 @@
1
  import os
2
- import json
3
  import torch
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
- from typing import Dict, List, Any
6
-
7
- # Fix for the rope_scaling validation issue
8
- import transformers.models.phi3.configuration_phi3
9
- # Store original method
10
- original_validation = transformers.models.phi3.configuration_phi3.Phi3Config._rope_scaling_validation
11
-
12
- # Replace with a no-op function
13
- def no_validation(self):
14
- pass
15
 
16
- # Apply the patch
17
- transformers.models.phi3.configuration_phi3.Phi3Config._rope_scaling_validation = no_validation
 
 
 
 
18
 
19
  class EndpointHandler:
20
  def __init__(self, path=""):
21
- # Initialize model and tokenizer
22
- self.model_path = path if path else os.environ.get("MODEL_PATH", "")
23
- print(f"Loading model from: {self.model_path}")
24
-
25
- # Load tokenizer
26
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
27
 
28
- # Determine the device to use
29
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
- print(f"Using device: {self.device}")
 
 
 
 
 
31
 
32
- # Load model directly without pipeline
33
- self.model = AutoModelForCausalLM.from_pretrained(
34
- self.model_path,
35
- torch_dtype=torch.float16,
36
- device_map="auto"
37
- )
38
- # Ensure model is on the correct device
39
- if torch.cuda.is_available():
40
- self.model = self.model.cuda()
41
 
42
- print("Model loaded successfully")
43
 
44
- # For Phi3 models, monkey patch the RotaryEmbedding
45
  try:
46
- from transformers.models.phi3.modeling_phi3 import PhiRotaryEmbedding
47
- original_forward = PhiRotaryEmbedding.forward
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- def patched_forward(self, position_ids, query, key, value=None):
50
- # Ensure position_ids is on the same device as query
51
- position_ids = position_ids.to(query.device)
52
- return original_forward(self, position_ids, query, key, value)
53
-
54
- PhiRotaryEmbedding.forward = patched_forward
55
- print("Successfully patched PhiRotaryEmbedding.forward")
56
  except Exception as e:
57
- print(f"Could not patch PhiRotaryEmbedding: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
 
 
 
 
 
 
 
59
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
60
- """Handle inference request in OpenAI-like format or HuggingFace Inference API format"""
 
 
 
 
 
 
 
 
 
 
 
61
  try:
62
- # Debugging: Print the received data
63
- print(f"Received data: {json.dumps(data, indent=2)}")
 
 
 
64
 
65
- # Handle HuggingFace Inference API format
66
- if "inputs" in data:
67
- # Extract data from inputs key
68
- if isinstance(data["inputs"], dict):
69
- # If inputs contains a dictionary, extract it
70
- input_data = data["inputs"]
71
- elif isinstance(data["inputs"], str):
72
- # If inputs is a string, create a simple message
73
- input_data = {
74
- "messages": [
75
- {"role": "user", "content": data["inputs"]}
76
- ]
77
- }
78
- else:
79
- print(f"Unexpected inputs format: {type(data['inputs'])}")
80
- # Try to convert to string if possible
81
- try:
82
- input_data = {
83
- "messages": [
84
- {"role": "user", "content": str(data["inputs"])}
85
- ]
86
- }
87
- except:
88
- raise ValueError(f"Unsupported inputs format: {type(data['inputs'])}")
 
 
 
 
 
 
 
89
  else:
90
- # Assume direct OpenAI format
91
- input_data = data
 
92
 
93
- # Debugging: Print the parsed input data
94
- print(f"Parsed input data: {json.dumps(input_data, indent=2)}")
 
95
 
96
- # Parse input data
97
- inputs = self._parse_input(input_data)
98
 
99
- # Generate response
100
- outputs = self._generate(inputs)
101
 
102
- # Format response in OpenAI-like format
103
- return self._format_response(outputs, inputs)
104
- except Exception as e:
105
- print(f"Error during processing: {str(e)}")
106
- import traceback
107
- traceback.print_exc()
108
- return {
109
- "error": {
110
- "message": str(e),
111
- "type": "invalid_request_error",
112
- "code": 400
113
- }
114
- }
115
-
116
- def _parse_input(self, data: Dict[str, Any]) -> Dict[str, Any]:
117
- """Parse input data to extract generation parameters"""
118
- # Extract messages
119
- messages = data.get("messages", [])
120
- if not messages:
121
- print(f"No messages found in data: {json.dumps(data, indent=2)}")
122
- raise ValueError("No messages provided")
123
-
124
- # Convert messages to prompt
125
- prompt = self._convert_messages_to_prompt(messages)
126
-
127
- # Extract generation parameters with defaults
128
- generation_params = {
129
- "max_tokens": data.get("max_tokens", 256),
130
- "temperature": data.get("temperature", 0.7),
131
- "top_p": data.get("top_p", 1.0),
132
- "n": data.get("n", 1),
133
- "stream": data.get("stream", False),
134
- "stop": data.get("stop", None),
135
- "presence_penalty": data.get("presence_penalty", 0.0),
136
- "frequency_penalty": data.get("frequency_penalty", 0.0),
137
- }
138
-
139
- return {
140
- "prompt": prompt,
141
- "messages": messages,
142
- "generation_params": generation_params
143
- }
144
-
145
- def _convert_messages_to_prompt(self, messages: List[Dict[str, str]]) -> str:
146
- """Convert list of messages to a prompt string"""
147
- prompt = ""
148
- for message in messages:
149
- role = message.get("role", "")
150
- content = message.get("content", "")
151
 
152
- if role == "system":
153
- prompt += f"System: {content}\n\n"
154
- elif role == "user":
155
- prompt += f"User: {content}\n\n"
156
- elif role == "assistant":
157
- prompt += f"Assistant: {content}\n\n"
158
 
159
- # Add final assistant prompt
160
- prompt += "Assistant: "
161
- return prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
- def _generate(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
164
- """Generate response using the model directly"""
165
- prompt = inputs["prompt"]
166
- params = inputs["generation_params"]
167
-
168
- # Get the model's device
169
- device = next(self.model.parameters()).device
170
- print(f"Model is on device: {device}")
171
-
172
- # Tokenize input and ensure it's on the correct device
173
- input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(device)
174
- print(f"Input tensor device: {input_ids.device}")
175
 
176
- # Count input tokens
177
- input_tokens = input_ids.shape[1]
 
 
 
178
 
179
- # Convert OpenAI-like parameters to HF parameters
180
- generation_kwargs = {
181
- "max_new_tokens": params["max_tokens"],
182
- "temperature": params["temperature"],
183
- "top_p": params["top_p"],
184
- "num_return_sequences": params["n"],
185
- "do_sample": params["temperature"] > 0,
186
- "pad_token_id": self.tokenizer.eos_token_id,
187
- }
188
 
189
- # Generate output
 
 
 
190
  try:
191
  with torch.no_grad():
192
- outputs = self.model.generate(
193
- input_ids,
194
- **generation_kwargs
195
- )
196
- print(f"Output tensor device: {outputs.device}")
197
- except RuntimeError as e:
198
- if "Expected all tensors to be on the same device" in str(e):
199
- print("Caught device mismatch error, trying to fix...")
200
- # A more drastic approach: move the model completely to CPU if there's a device issue
201
- if torch.cuda.is_available():
202
- print("Moving everything to CPU as a fallback")
203
- self.model = self.model.cpu()
204
- input_ids = input_ids.cpu()
205
- with torch.no_grad():
206
- outputs = self.model.generate(
207
- input_ids,
208
- **generation_kwargs
209
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  else:
211
- raise
212
- else:
213
- raise
214
-
215
- # Decode output
216
- generated_texts = []
217
- for i in range(params["n"]):
218
- gen_text = self.tokenizer.decode(outputs[i][input_tokens:], skip_special_tokens=True)
219
 
220
- # Apply stop sequences if provided
221
- if params["stop"]:
222
- for stop in params["stop"]:
223
- if stop in gen_text:
224
- gen_text = gen_text[:gen_text.find(stop)]
 
 
 
 
 
225
 
226
- generated_texts.append(gen_text)
227
-
228
- # Count completion tokens
229
- completion_tokens = [len(self.tokenizer.encode(text)) for text in generated_texts]
230
-
231
- return {
232
- "generated_texts": generated_texts,
233
- "prompt_tokens": input_tokens,
234
- "completion_tokens": completion_tokens,
235
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- def _format_response(self, outputs: Dict[str, Any], inputs: Dict[str, Any]) -> Dict[str, Any]:
238
- """Format response in OpenAI-like format"""
239
- generated_texts = outputs["generated_texts"]
240
- prompt_tokens = outputs["prompt_tokens"]
241
- completion_tokens = outputs["completion_tokens"]
242
-
243
- choices = []
244
- for i, text in enumerate(generated_texts):
245
- choices.append({
246
- "index": i,
247
- "message": {
248
- "role": "assistant",
249
- "content": text
 
 
 
 
 
 
 
 
 
250
  },
251
- "finish_reason": "stop"
252
- })
253
-
254
- return {
255
- "id": f"cmpl-{hash(inputs['prompt']) % 10000}",
256
- "object": "chat.completion",
257
- "created": int(torch.cuda.current_device()) if torch.cuda.is_available() else 0,
258
- "model": os.path.basename(self.model_path),
259
- "choices": choices,
260
- "usage": {
261
- "prompt_tokens": prompt_tokens,
262
- "completion_tokens": sum(completion_tokens),
263
- "total_tokens": prompt_tokens + sum(completion_tokens)
264
  }
265
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import torch
3
+ import logging
4
+ import time
5
+ import traceback
6
+ import json
7
+ import re
8
+ from typing import Dict, List, Any, Union, Generator
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
10
+ from threading import Thread
 
 
 
11
 
12
+ # Set up logging
13
+ logging.basicConfig(
14
+ level=logging.INFO,
15
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
16
+ )
17
+ logger = logging.getLogger(__name__)
18
 
19
  class EndpointHandler:
20
  def __init__(self, path=""):
21
+ """
22
+ Initialize the model and tokenizer for Phi-4 inference.
 
 
 
 
23
 
24
+ Args:
25
+ path (str): Path to the model directory
26
+ """
27
+ # Set default parameters for inference
28
+ self.max_new_tokens = 1024 # Keep at 1024 to avoid timeouts
29
+ self.temperature = 0.7
30
+ self.top_p = 0.9
31
+ self.do_sample = True
32
 
33
+ # Determine if CUDA is available
34
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
35
+ self.dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
 
 
 
 
 
 
36
 
37
+ logger.info(f"Initializing model from {path} on {self.device}")
38
 
 
39
  try:
40
+ # Load tokenizer - use original model ID as fallback
41
+ # This helps with common tokenizer mismatch issues
42
+ try:
43
+ self.tokenizer = AutoTokenizer.from_pretrained(path)
44
+ logger.info(f"Loaded tokenizer from local path")
45
+ except Exception as e:
46
+ logger.warning(f"Failed to load tokenizer from local path: {e}")
47
+ self.tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")
48
+ logger.info("Loaded tokenizer from microsoft/Phi-4-mini-instruct")
49
+
50
+ # Ensure tokenizer has EOS token set
51
+ if self.tokenizer.eos_token_id is None:
52
+ logger.warning("EOS token not set in tokenizer, using default")
53
+ self.tokenizer.eos_token_id = 199999 # Phi-4's default EOS token
54
+
55
+ # Load model with appropriate settings
56
+ self.model = AutoModelForCausalLM.from_pretrained(
57
+ path,
58
+ torch_dtype=self.dtype,
59
+ device_map="auto" if self.device == "cuda" else None,
60
+ trust_remote_code=True
61
+ )
62
+
63
+ # Move model to device if CPU
64
+ if self.device == "cpu":
65
+ self.model = self.model.to(self.device)
66
+
67
+ # Set model to evaluation mode
68
+ self.model.eval()
69
+
70
+ # Print diagnostic information
71
+ logger.info(f"Model loaded on {self.device} using {self.dtype}")
72
+ logger.info(f"Tokenizer vocabulary size: {len(self.tokenizer)}")
73
+ logger.info(f"Model vocabulary size: {self.model.config.vocab_size}")
74
+ logger.info(f"Model embedding size: {self.model.get_input_embeddings().weight.shape}")
75
+
76
+ if len(self.tokenizer) != self.model.config.vocab_size:
77
+ logger.warning(f"Tokenizer vocab size ({len(self.tokenizer)}) doesn't match model vocab size ({self.model.config.vocab_size})")
78
 
 
 
 
 
 
 
 
79
  except Exception as e:
80
+ logger.error(f"Error during model initialization: {str(e)}")
81
+ logger.error(traceback.format_exc())
82
+ raise
83
+
84
+ def format_prompt_with_system(self, user_message, system_message=None):
85
+ """
86
+ Format the prompt with system and user messages according to Phi-4 format.
87
+
88
+ Args:
89
+ user_message (str): The user's message
90
+ system_message (str, optional): The system message/instruction
91
+
92
+ Returns:
93
+ str: Formatted prompt ready for the model
94
+ """
95
+ # Format using Phi-4's expected chat template:
96
+ # <|system|>
97
+ # {system_message}
98
+ # <|user|>
99
+ # {user_message}
100
+ # <|assistant|>
101
 
102
+ if system_message:
103
+ prompt = f"<|system|>\n{system_message}\n<|user|>\n{user_message}\n<|assistant|>"
104
+ else:
105
+ # If no system message, just use user message with assistant tag
106
+ prompt = f"<|user|>\n{user_message}\n<|assistant|>"
107
+
108
+ logger.info(f"Formatted prompt with {'system message and ' if system_message else ''}user message")
109
+ return prompt
110
+
111
  def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
112
+ """
113
+ Process the input data and generate a response using the Phi-4 model.
114
+
115
+ Args:
116
+ data (Dict[str, Any]): Input data containing the prompt and generation parameters
117
+
118
+ Returns:
119
+ Dict[str, Any]: Model response
120
+ """
121
+ start_time = time.time()
122
+ logger.info(f"Starting request processing")
123
+
124
  try:
125
+ # Extract input parameters with defaults
126
+ if "inputs" not in data:
127
+ logger.warning("No 'inputs' field in request data")
128
+ error_msg = "Missing 'inputs' field in request"
129
+ return self._format_error_response(error_msg)
130
 
131
+ # Track user and system messages
132
+ user_message = ""
133
+ system_message = None
134
+
135
+ # Handle different input formats
136
+ # 1. Direct string input
137
+ if isinstance(data["inputs"], str):
138
+ user_message = data["inputs"]
139
+ system_message = data.get("parameters", {}).get("system_message", None)
140
+
141
+ # 2. Dict with messages format
142
+ elif isinstance(data["inputs"], dict) and "messages" in data["inputs"]:
143
+ messages = data["inputs"]["messages"]
144
+
145
+ # Extract system and user messages for prompt formatting
146
+ for msg in messages:
147
+ if msg.get("role") == "system":
148
+ system_message = msg.get("content", "")
149
+ elif msg.get("role") == "user":
150
+ user_message = msg.get("content", "")
151
+
152
+ # 3. Direct messages list format
153
+ elif isinstance(data["inputs"], list):
154
+ messages = data["inputs"]
155
+
156
+ # Extract system and user messages for prompt formatting
157
+ for msg in messages:
158
+ if msg.get("role") == "system":
159
+ system_message = msg.get("content", "")
160
+ elif msg.get("role") == "user":
161
+ user_message = msg.get("content", "")
162
  else:
163
+ logger.warning(f"Unsupported input format: {type(data['inputs'])}")
164
+ error_msg = "Unsupported input format. Expected string or messages object."
165
+ return self._format_error_response(error_msg)
166
 
167
+ logger.info(f"Extracted user message length: {len(user_message)} characters")
168
+ if system_message:
169
+ logger.info(f"Extracted system message length: {len(system_message)} characters")
170
 
171
+ # Format the prompt with system and user messages
172
+ prompt = self.format_prompt_with_system(user_message, system_message)
173
 
174
+ parameters = data.get("parameters", {})
 
175
 
176
+ logger.info(f"Processing input with {len(prompt)} characters")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
+ # Get generation parameters with fallbacks to defaults
179
+ max_new_tokens = min(parameters.get("max_new_tokens", self.max_new_tokens), 1024)
180
+ temperature = parameters.get("temperature", self.temperature)
181
+ top_p = parameters.get("top_p", self.top_p)
182
+ do_sample = parameters.get("do_sample", self.do_sample)
 
183
 
184
+ logger.info(f"Generation parameters: max_new_tokens={max_new_tokens}, temperature={temperature}, top_p={top_p}, do_sample={do_sample}")
185
+
186
+ # Manually implement generation to avoid token index errors
187
+ try:
188
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)
189
+ logger.info(f"Input tokens shape: {input_ids.shape}")
190
+
191
+ # Create attention mask
192
+ attention_mask = torch.ones_like(input_ids)
193
+
194
+ # Perform safe generation with error handling for out-of-vocabulary issues
195
+ response_text = self._safe_generate(
196
+ input_ids,
197
+ attention_mask,
198
+ max_new_tokens,
199
+ temperature,
200
+ top_p,
201
+ do_sample,
202
+ prompt
203
+ )
204
+
205
+ logger.info(f"Response generation completed, text length: {len(response_text) if isinstance(response_text, str) else 'N/A'}")
206
+
207
+ # Format and return response in OpenAI format
208
+ if isinstance(response_text, str):
209
+ response_tokens = len(self.tokenizer.encode(response_text)) if response_text else 0
210
+ logger.info(f"Response token count: {response_tokens}")
211
+
212
+ return self._format_openai_response(
213
+ response_text,
214
+ input_ids.shape[1],
215
+ response_tokens
216
+ )
217
+ else:
218
+ return self._format_error_response(f"Error during generation: {response_text}")
219
+
220
+ except RuntimeError as e:
221
+ logger.error(f"Runtime Error during generation: {str(e)}")
222
+ logger.error(traceback.format_exc())
223
+ return self._format_error_response(f"Error during generation: {str(e)}")
224
+
225
+ except Exception as e:
226
+ logger.error(f"Unexpected error during request processing: {str(e)}")
227
+ logger.error(traceback.format_exc())
228
+ return self._format_error_response(f"Unexpected error: {str(e)}")
229
+ finally:
230
+ duration = time.time() - start_time
231
+ logger.info(f"Request processing completed in {duration:.2f} seconds")
232
 
233
+ def _complete_sentence(self, text):
234
+ """Ensure the text ends with a complete sentence"""
235
+ # If text is already a complete sentence, return it
236
+ if text.strip().endswith(('.', '!', '?')):
237
+ return text
 
 
 
 
 
 
 
238
 
239
+ # Find the last complete sentence end
240
+ sentences = re.split(r'([.!?])\s+', text)
241
+ if len(sentences) <= 1:
242
+ # No complete sentences found, return as is with ellipsis
243
+ return text + "..."
244
 
245
+ # Reconstruct text up to the last complete sentence
246
+ result = ""
247
+ for i in range(len(sentences) - 1):
248
+ if i % 2 == 0: # Content before punctuation
249
+ result += sentences[i]
250
+ else: # Punctuation
251
+ result += sentences[i] + " "
 
 
252
 
253
+ return result.strip()
254
+
255
+ def _safe_generate(self, input_ids, attention_mask, max_new_tokens, temperature, top_p, do_sample, prompt):
256
+ """Safely generate text handling potential token index errors"""
257
  try:
258
  with torch.no_grad():
259
+ logger.info("Starting safe generation")
260
+
261
+ # Get the input text to exclude from final output
262
+ input_text = prompt
263
+ logger.info(f"Input prompt length: {len(input_text)} characters")
264
+
265
+ # Generate one token at a time to avoid index errors
266
+ # Use a lower absolute maximum to ensure completion
267
+ max_steps = min(max_new_tokens, 450) # Adjusted down from 500
268
+ current_ids = input_ids.clone()
269
+
270
+ logger.info(f"Generating up to {max_steps} tokens")
271
+
272
+ # Keep track of last 5 tokens to detect repetition
273
+ last_tokens = []
274
+ repetition_detected = False
275
+
276
+ for i in range(max_steps):
277
+ if i % 50 == 0:
278
+ logger.info(f"Generated {i} tokens so far")
279
+
280
+ # Early termination if we're getting close to the limit to allow for post-processing
281
+ if i >= max_steps - 50:
282
+ # Temporarily decode to check if we have a complete response already
283
+ temp_text = self.tokenizer.decode(current_ids[0], skip_special_tokens=True)
284
+
285
+ if "<|assistant|>" in temp_text:
286
+ temp_response = temp_text.split("<|assistant|>")[1].strip()
287
+
288
+ # If we have a reasonably complete response, stop early
289
+ if len(temp_response) > 100 and temp_response.count('.') >= 3:
290
+ logger.info(f"Early termination at {i} tokens with complete response detected")
291
+ break
292
+
293
+ # Get logits for next token
294
+ outputs = self.model(
295
+ input_ids=current_ids,
296
+ attention_mask=attention_mask,
297
+ return_dict=True
298
+ )
299
+
300
+ next_token_logits = outputs.logits[:, -1, :]
301
+
302
+ # Apply temperature and sampling
303
+ if temperature > 0:
304
+ next_token_logits = next_token_logits / temperature
305
+
306
+ if do_sample:
307
+ # Apply top_p sampling
308
+ sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
309
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
310
+
311
+ # Remove tokens with cumulative probability above the threshold
312
+ sorted_indices_to_remove = cumulative_probs > top_p
313
+ # Shift the indices to the right to keep also the first token above the threshold
314
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
315
+ sorted_indices_to_remove[..., 0] = 0
316
+
317
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
318
+ next_token_logits[indices_to_remove] = -float('Inf')
319
+
320
+ # Sample from the filtered distribution
321
+ probs = torch.softmax(next_token_logits, dim=-1)
322
+ next_token = torch.multinomial(probs, num_samples=1)
323
+ else:
324
+ # Take the token with highest probability
325
+ next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
326
+
327
+ # Add the predicted token to the sequence
328
+ current_ids = torch.cat([current_ids, next_token], dim=-1)
329
+ attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
330
+
331
+ # Add to last tokens list for repetition detection
332
+ last_tokens.append(next_token.item())
333
+ if len(last_tokens) > 5:
334
+ last_tokens.pop(0)
335
+
336
+ # Check for repetition (if we have at least 5 tokens)
337
+ if len(last_tokens) >= 5:
338
+ # Check if all last 5 tokens are the same
339
+ if len(set(last_tokens)) == 1:
340
+ logger.warning(f"Repetition detected after {i+1} tokens, stopping generation")
341
+ repetition_detected = True
342
+ break
343
+
344
+ # Check if we've generated an EOS token
345
+ if next_token[0, 0].item() == self.tokenizer.eos_token_id:
346
+ logger.info(f"EOS token generated after {i+1} tokens")
347
+ break
348
+
349
+ # Decode the generated sequence
350
+ generated_text = self.tokenizer.decode(current_ids[0], skip_special_tokens=True)
351
+ logger.info(f"Decoded generated text: {len(generated_text)} characters")
352
+
353
+ # Return only the newly generated text (after the assistant tag)
354
+ split_text = generated_text.split("<|assistant|>")
355
+ if len(split_text) > 1:
356
+ assistant_response = split_text[1].strip()
357
+ logger.info(f"Raw assistant response: {len(assistant_response)} characters")
358
+
359
+ # Process the response to ensure complete sentences
360
+ response_text = self._complete_sentence(assistant_response)
361
+ logger.info(f"Processed assistant response: {len(response_text)} characters")
362
  else:
363
+ # Fallback if the expected format is not found
364
+ logger.warning("Could not find assistant tag in generated text")
365
+ response_text = generated_text
366
+
367
+ return response_text
 
 
 
368
 
369
+ except Exception as e:
370
+ logger.error(f"Error in _safe_generate: {str(e)}")
371
+ logger.error(traceback.format_exc())
372
+ return f"Generation error: {str(e)}. Please try a simpler input."
373
+
374
+ def _format_openai_response(self, response_text, prompt_tokens, completion_tokens):
375
+ """Format the response in OpenAI-style format"""
376
+ try:
377
+ # Create a response ID
378
+ response_id = f"phi4-{int(time.time())}"
379
 
380
+ # Build OpenAI-compatible response
381
+ openai_response = {
382
+ "id": response_id,
383
+ "object": "chat.completion",
384
+ "created": int(time.time()),
385
+ "model": "phi-4-mini",
386
+ "choices": [
387
+ {
388
+ "index": 0,
389
+ "message": {
390
+ "role": "assistant",
391
+ "content": response_text
392
+ },
393
+ "finish_reason": "stop"
394
+ }
395
+ ],
396
+ "usage": {
397
+ "prompt_tokens": prompt_tokens,
398
+ "completion_tokens": completion_tokens,
399
+ "total_tokens": prompt_tokens + completion_tokens
400
+ }
401
+ }
402
+
403
+ # For compatibility with Hugging Face UI, include the generated_text field
404
+ openai_response["generated_text"] = response_text
405
+
406
+ logger.info(f"Formatted OpenAI-style response: {len(json.dumps(openai_response))} bytes")
407
+ return openai_response
408
+
409
+ except Exception as e:
410
+ logger.error(f"Error formatting OpenAI response: {str(e)}")
411
+ # Fall back to simple response
412
+ return {"generated_text": response_text}
413
 
414
+ def _format_error_response(self, error_message):
415
+ """Format an error response in OpenAI-style format"""
416
+ try:
417
+ error_response = {
418
+ "id": f"phi4-error-{int(time.time())}",
419
+ "object": "chat.completion",
420
+ "created": int(time.time()),
421
+ "model": "phi-4-mini",
422
+ "choices": [
423
+ {
424
+ "index": 0,
425
+ "message": {
426
+ "role": "assistant",
427
+ "content": f"Error: {error_message}"
428
+ },
429
+ "finish_reason": "error"
430
+ }
431
+ ],
432
+ "usage": {
433
+ "prompt_tokens": 0,
434
+ "completion_tokens": 0,
435
+ "total_tokens": 0
436
  },
437
+ "error": {
438
+ "message": error_message,
439
+ "type": "invalid_request_error",
440
+ "code": "error"
441
+ }
 
 
 
 
 
 
 
 
442
  }
443
+
444
+ # For compatibility with Hugging Face UI, include the generated_text field
445
+ error_response["generated_text"] = f"Error: {error_message}"
446
+
447
+ logger.info(f"Formatted error response: {len(json.dumps(error_response))} bytes")
448
+ return error_response
449
+
450
+ except Exception as e:
451
+ logger.error(f"Error formatting error response: {str(e)}")
452
+ # Fall back to simple error response
453
+ return {"generated_text": f"Error: {error_message}"}
454
+
455
+ # For local testing
456
+ if __name__ == "__main__":
457
+ # Example usage
458
+ handler = EndpointHandler()
459
+
460
+ # Test with messages format
461
+ test_with_messages = {
462
+ "inputs": {
463
+ "messages": [
464
+ {"role": "system", "content": "You are an AI assistant that provides helpful, accurate, and concise information about AI models."},
465
+ {"role": "user", "content": "What are the major features of Phi-4?"}
466
+ ]
467
+ }
468
+ }
469
+
470
+ # Run the test
471
+ result = handler(test_with_messages)
472
+ print(json.dumps(result, indent=2))