handraise-dev
/

gguf-inference

Text Generation

multilingual

nlp

code

Model card Files Files and versions

xet

Community

syberWolf committed on Jun 19, 2024

Commit

b3aebd1

1 Parent(s): c145e37

Add handler change

Browse files

Files changed (1) hide show

handler.py +88 -74

handler.py CHANGED Viewed

@@ -1,96 +1,110 @@
 from llama_cpp import Llama
-from typing import Dict, List, Any
 import os
 class EndpointHandler:
     """Callable wrapper that loads a GGUF model with llama_cpp and answers chat requests.

     The model is loaded once in ``__init__``; each call of the instance runs
     one chat completion against the already-loaded model.
     """

     def __init__(self, model_path=""):
         """Load the model that sits next to this handler file.

         Args:
             model_path: Accepted for interface compatibility; the model
                 location is derived from this file's directory instead.

         Raises:
             ValueError: If the model file does not exist.
         """
         # Construct the model path assuming the model is in the same directory as the handler file
         script_dir = os.path.dirname(os.path.abspath(__file__))
         model_filename = "Phi-3-medium-128k-instruct-IQ2_XS.gguf"
         self.model_path = os.path.join(script_dir, model_filename)
         # Fail early with a readable message instead of letting the loader crash
         if not os.path.exists(self.model_path):
             raise ValueError(f"Model path does not exist: {self.model_path}")
         # Load the GGUF model using llama_cpp
         self.llm = Llama(
             model_path=self.model_path,
             n_ctx=5000,  # Set context length to 5000 tokens
             # n_threads=12,  # Adjust the number of CPU threads as per your machine
             n_gpu_layers=-1  # Adjust based on GPU availability
         )
         # Keyword arguments forwarded to every chat completion call
         self.generation_kwargs = {
             "max_tokens": 400,  # Respond with up to 400 tokens
             "stop": ["<|end|>", "<|user|>", "<|assistant|>"],
             "top_k": 1  # Greedy decoding
         }

     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """Run one chat completion for the given request payload.

         Args:
             data: Request dict. ``data["inputs"]`` is either a dict with an
                 optional ``"system"`` entry and a required ``"message"``
                 entry, or a plain string that is used as the user message.

         Returns:
             A one-element list ``[{"generated_text": <str>}]``.

         Raises:
             ValueError: If no user message is supplied, if ``inputs`` is
                 neither a dict nor a string, or if the completion response
                 lacks the expected ``choices[0]["message"]`` structure.
         """
         inputs = data.get("inputs", {})
         if isinstance(inputs, str):
             # A bare string carries only the user message, no system prompt
             user_message = inputs
             messages = []
         elif isinstance(inputs, dict):
             user_message = inputs.get("message", "")
             messages = [{"role": "system", "content": inputs.get("system", "")}]
         else:
             raise ValueError(
                 "Invalid 'inputs' type. Expected dict or str, got {}".format(type(inputs))
             )
         if not user_message:
             raise ValueError("No user message provided for the model.")
         messages.append({"role": "user", "content": user_message})

         # Run inference with llama_cpp
         response = self.llm.create_chat_completion(
             messages=messages,
             **self.generation_kwargs
         )
         # Access generated text based on the response structure
         try:
             generated_text = response["choices"][0]["message"].get("content", "")
         except (KeyError, IndexError) as err:
             raise ValueError(
                 "Unexpected response structure: missing 'content' in 'choices[0]['message']'"
             ) from err
         return [{"generated_text": generated_text}]
-# Example usage:
 if __name__ == "__main__":
     # Build the handler a single time; the loaded model stays in memory
     # and is reused by every subsequent call.
     endpoint = EndpointHandler()
     sample_requests = (
         {
             "inputs": {
                 "system": "You are a helpful assistant.",
                 "message": "What is the meaning of life?"
             }
         },
         {
             "inputs": {
                 "system": "You are a knowledgeable assistant.",
                 "message": "Tell me about the history of the internet."
             }
         },
     )
     # Send each sample request through the same handler and show the reply
     for request in sample_requests:
         print(endpoint(request))

 from llama_cpp import Llama
+from typing import Dict, List, Any, Union
 import os
 class EndpointHandler:
     """Singleton callable that loads a GGUF model with llama_cpp and answers chat requests.

     The first instantiation creates the shared instance and loads the model;
     every later ``EndpointHandler(...)`` returns that same instance and skips
     loading, so the model is held in memory only once per process.
     """

     _instance = None  # Shared instance, created lazily in __new__
     _model_loaded = False  # Set to True on the instance once __init__ has loaded the model
     _MODEL_FILENAME = "Phi-3-medium-128k-instruct-IQ2_XS.gguf"

     def __new__(cls, *args, **kwargs):
         """Return the shared instance, creating it on first use.

         Constructor arguments are accepted but deliberately not forwarded:
         ``object.__new__`` raises ``TypeError`` when it is handed extra
         arguments by a class that overrides ``__new__``, which would make
         ``EndpointHandler(some_path)`` impossible to construct.
         """
         if cls._instance is None:
             cls._instance = super().__new__(cls)
             cls._instance._model_loaded = False
         return cls._instance

     def __init__(self, model_path=""):
         """Load the model on first initialisation; later calls are no-ops.

         Args:
             model_path: Optional directory to search for the model file
                 first. When it is empty, is not a directory, or does not
                 contain the file, the directory of this handler file is used.

         Raises:
             ValueError: If the model file cannot be found.
         """
         if self._model_loaded:
             return
         script_dir = os.path.dirname(os.path.abspath(__file__))
         search_dirs = [script_dir]
         if model_path and os.path.isdir(model_path):
             search_dirs.insert(0, model_path)
         candidates = [os.path.join(folder, self._MODEL_FILENAME) for folder in search_dirs]
         # Default to the handler-directory candidate so a failure names that location
         self.model_path = next(
             (path for path in candidates if os.path.exists(path)), candidates[-1]
         )
         if not os.path.exists(self.model_path):
             raise ValueError(f"Model path does not exist: {self.model_path}")
         # Load the GGUF model using llama_cpp
         self.llm = Llama(
             model_path=self.model_path,
             n_ctx=5000,  # Set context length to 5000 tokens
             # n_threads=12,  # Adjust the number of CPU threads as per your machine
             n_gpu_layers=-1  # Adjust based on GPU availability
         )
         # Keyword arguments forwarded to every chat completion call
         self.generation_kwargs = {
             "max_tokens": 400,  # Respond with up to 400 tokens
             "stop": ["<|end|>", "<|user|>", "<|assistant|>"],
             "top_k": 1  # Greedy decoding
         }
         self._model_loaded = True

     @staticmethod
     def _build_messages(data):
         """Translate a request payload into the chat ``messages`` list."""
         if isinstance(data, str):
             # A bare string is sent as a single user turn without a system prompt
             return [{"role": "user", "content": data}]
         if not isinstance(data, dict):
             raise ValueError("Invalid input type. Expected dict or str, got {}".format(type(data)))

         inputs = data.get("inputs", {})
         if isinstance(inputs, str):
             # {"inputs": "text"}: treat the string as the user message
             user_message = inputs
             messages = []
         elif isinstance(inputs, dict):
             user_message = inputs.get("message", "")
             messages = [{"role": "system", "content": inputs.get("system", "")}]
         else:
             raise ValueError(
                 "Invalid 'inputs' type. Expected dict or str, got {}".format(type(inputs))
             )
         if not user_message:
             raise ValueError("No user message provided for the model.")
         messages.append({"role": "user", "content": user_message})
         return messages

     def __call__(self, data: Union[Dict[str, Any], str]) -> List[Dict[str, Any]]:
         """Run one chat completion for the given request payload.

         Args:
             data: Either a plain string used as the user message, or a dict
                 whose ``"inputs"`` entry is a string (user message) or a dict
                 with an optional ``"system"`` and a required ``"message"``.

         Returns:
             A one-element list ``[{"generated_text": <str>}]``.

         Raises:
             ValueError: If the payload has an unsupported type, if no user
                 message is supplied in a dict payload, or if the completion
                 response lacks the expected ``choices[0]["message"]``
                 structure.
         """
         # Validate and convert the payload before touching the model
         messages = self._build_messages(data)
         response = self.llm.create_chat_completion(
             messages=messages,
             **self.generation_kwargs
         )
         # Access generated text based on the response structure
         try:
             generated_text = response["choices"][0]["message"].get("content", "")
         except (KeyError, IndexError) as err:
             raise ValueError(
                 "Unexpected response structure: missing 'content' in 'choices[0]['message']'"
             ) from err
         return [{"generated_text": generated_text}]
def main():
    """Smoke-test the handler with one dict payload and one plain-string payload."""
    endpoint = EndpointHandler()

    # Case 1: structured payload carrying a system prompt and a user message
    dict_payload = {
        "inputs": {
            "system": "System instructions",
            "message": "Hello, how are you?",
        }
    }
    print("Dictionary input result:", endpoint(dict_payload))

    # Case 2: bare string payload
    text_payload = "Hello, how are you?"
    print("String input result:", endpoint(text_payload))


if __name__ == "__main__":
    main()