import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


class EndpointHandler:
    """Custom handler for Hugging Face Inference Endpoints.

    Loads a causal language model and its tokenizer once at start-up and
    serves text-generation requests through a ``transformers`` pipeline.
    """

    # Hub repository used when neither a usable local ``path`` nor the
    # ``HF_MODEL_ID`` environment variable is available.
    DEFAULT_MODEL_ID = "Pragmanic0/Nomadic-ICDU-v8"

    def __init__(self, path=""):
        """Load the tokenizer, the model and the generation pipeline.

        Args:
            path (str, optional): Local model directory. It is used only
                when it contains a ``config.json``; otherwise the model is
                resolved from the ``HF_MODEL_ID`` environment variable, and
                finally from ``DEFAULT_MODEL_ID``.
        """
        model_source = self._resolve_model_source(path)
        print(f"Loading model: {model_source}...")

        self.tokenizer = AutoTokenizer.from_pretrained(model_source)

        # bfloat16 halves the memory footprint relative to float32 on hardware
        # that supports it; device_map="auto" lets the library decide where to
        # place the weights on the available devices.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_source,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )

        # The pipeline wraps tokenization, generation and decoding.
        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )
        print("Model and pipeline loaded successfully.")

    @classmethod
    def _resolve_model_source(cls, path):
        """Return the local directory or Hub ID the model is loaded from."""
        # Prefer the local directory only when it really looks like a model
        # checkpoint, so a handler-only directory still falls back to the Hub.
        if path and os.path.isfile(os.path.join(path, "config.json")):
            return path
        return os.environ.get("HF_MODEL_ID", cls.DEFAULT_MODEL_ID)

    @staticmethod
    def _build_generation_kwargs(parameters: dict) -> dict:
        """Translate request parameters into pipeline keyword arguments."""
        do_sample = parameters.get("do_sample", True)
        generation_kwargs = {
            "max_new_tokens": parameters.get("max_new_tokens", 512),
            "do_sample": do_sample,
            # Only return the generated part, not the prompt.
            "return_full_text": False,
        }
        # temperature / top_p only influence sampling. Forwarding them for
        # greedy decoding has no effect and makes transformers emit
        # generation-config warnings, so they are added only when sampling.
        if do_sample:
            generation_kwargs["temperature"] = parameters.get("temperature", 0.7)
            generation_kwargs["top_p"] = parameters.get("top_p", 0.95)
        return generation_kwargs

    def __call__(self, data: dict) -> list:
        """Handle one inference request.

        Args:
            data (dict): Request payload. ``data["inputs"]`` is the prompt and
                the optional ``data["parameters"]`` object may override
                ``max_new_tokens`` (512), ``temperature`` (0.7),
                ``top_p`` (0.95) and ``do_sample`` (True).

        Returns:
            list: A single-element list holding a dictionary with a
            ``generated_text`` key. On failure that text is an
            ``"Error: ..."`` message instead of model output.
        """
        prompt = data.get("inputs")
        if prompt is None:
            # A JSON null would otherwise be rendered as the literal "None".
            prompt = ""

        # An explicit JSON null for "parameters" is treated like an absent key.
        parameters = data.get("parameters") or {}
        if not isinstance(parameters, dict):
            message = (
                "Error: 'parameters' must be a JSON object, "
                f"got {type(parameters).__name__}"
            )
            print(message)
            return [{"generated_text": message}]

        generation_kwargs = self._build_generation_kwargs(parameters)

        # Instruction template applied by this handler around the raw prompt.
        formatted_prompt = f"[INST] {prompt} [/INST]"
        print(f"Generating text for prompt: '{prompt}'")

        try:
            generated = self.pipeline(formatted_prompt, **generation_kwargs)
            # The pipeline returns a list of dictionaries; keep the first one.
            result = generated[0]
        except Exception as e:
            # Request boundary: report the failure in the response format
            # rather than letting the exception escape.
            print(f"An error occurred during generation: {e}")
            result = {"generated_text": f"Error: {e}"}

        print(f"Generated text: {result['generated_text']}")

        # The result is wrapped in a list, matching the handler's return contract.
        return [result]