from huggingface_hub import hf_hub_download
from llama_cpp import Llama


class EndpointHandler:
    """Route chat-completion requests to one of two GGUF models.

    Both models are downloaded from the Hugging Face Hub and loaded with
    llama-cpp-python when the handler is constructed. Each request picks a
    model through its ``target_model`` field:

    * ``"test_case"``   -> the Qwen3 4B instruct model (``self.model_4b``)
    * ``"test_script"`` -> the Qwen2.5 Coder 7B instruct model (``self.model_7b``)
    """

    def __init__(self, path: str = "") -> None:
        """Download both GGUF files and load them.

        Args:
            path: Accepted for compatibility with the handler constructor
                signature; it is not used here because the model files are
                fetched from the Hub instead.
        """
        # 1. Download the GGUF files from the Hub.
        self.qwen_4b_path = hf_hub_download(
            repo_id="AtomP/NewQwenTestCase",
            filename="qwen3-4b-instruct-2507.Q8_0.gguf",
        )
        self.qwen_7b_path = hf_hub_download(
            repo_id="unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
        )

        # 2. Load the models. n_gpu_layers=-1 offloads all layers to the GPU.
        #    n_ctx limits the context window to save VRAM; increase it if the
        #    GPU has capacity.
        self.model_4b = Llama(
            model_path=self.qwen_4b_path,
            n_gpu_layers=-1,
            n_ctx=8192,
            verbose=False,
        )
        self.model_7b = Llama(
            model_path=self.qwen_7b_path,
            n_gpu_layers=-1,
            n_ctx=8192,
            verbose=False,
        )

    def __call__(self, data: dict) -> dict:
        """Run one chat completion on the model selected by the request.

        Args:
            data: Request body. The parameters are read from ``data["inputs"]``
                when that key is present, otherwise from ``data`` itself (so
                the handler also works when called directly in local tests).
                Recognised keys, all optional:

                * ``messages`` - chat messages; defaults to a single user
                  message ``"Hello"``.
                * ``target_model`` - ``"test_case"`` (default) or
                  ``"test_script"``.
                * ``max_tokens`` - default ``512``.
                * ``temperature`` - default ``0.7``.
                * ``response_format`` - default ``None``.
                * ``repeat_penalty`` - default ``1.05``.
                * ``stop`` - default ``["<|im_end|>"]``.

        Returns:
            The value returned by ``create_chat_completion`` on the selected
            model, or ``{"error": "..."}`` when the payload is not a JSON
            object or ``target_model`` is not recognised.
        """
        # 1. The request JSON is expected under the "inputs" key; fall back to
        #    the top-level object so direct local calls still work.
        payload = data.get("inputs", data) if isinstance(data, dict) else data

        # A non-object payload (e.g. "inputs": "some text") has no fields to
        # read; report it the same way as an invalid target_model instead of
        # failing with AttributeError on .get below.
        if not isinstance(payload, dict):
            return {
                "error": (
                    "Invalid payload: expected a JSON object, got "
                    f"{type(payload).__name__}."
                )
            }

        # 2. Extract parameters. .get is used rather than .pop so the caller's
        #    dictionary is left unmodified.
        messages = payload.get("messages", [{"role": "user", "content": "Hello"}])
        target_model = payload.get("target_model", "test_case")
        max_tokens = payload.get("max_tokens", 512)
        temperature = payload.get("temperature", 0.7)
        response_format = payload.get("response_format", None)
        repeat_penalty = payload.get("repeat_penalty", 1.05)
        stop = payload.get("stop", ["<|im_end|>"])

        # 3. Route the request to the matching model.
        if target_model == "test_case":
            active_model = self.model_4b
        elif target_model == "test_script":
            active_model = self.model_7b
        else:
            return {
                "error": f"Invalid target_model: '{target_model}'. Use 'test_case' or 'test_script'."
            }

        # 4. Generate and return the response. response_format is forwarded
        #    explicitly so a caller-supplied output format is honoured.
        return active_model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            response_format=response_format,
            repeat_penalty=repeat_penalty,
            stop=stop,
        )