# QwenCombined / handler.py
# AtomP's picture
# Upload handler.py
# 0e0974d verified
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
class EndpointHandler:
    """Endpoint handler that routes chat requests to one of two GGUF models.

    On construction, two GGUF files are fetched with ``hf_hub_download`` and
    loaded through ``llama_cpp.Llama``. Each call selects one of the loaded
    models via the request's ``target_model`` field and forwards the chat
    parameters to that model's ``create_chat_completion``.
    """

    def __init__(self, path=""):
        """Download both GGUF files and load them as ``Llama`` instances.

        Args:
            path: Accepted for interface compatibility; not used by this
                handler (model files are always fetched from the Hub).
        """
        # 1. Download GGUF files from the Hub
        self.qwen_4b_path = hf_hub_download(
            repo_id="AtomP/NewQwenTestCase",
            filename="qwen3-4b-instruct-2507.Q8_0.gguf",
        )
        self.qwen_7b_path = hf_hub_download(
            repo_id="unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
        )

        # 2. Load models into GPU memory (n_gpu_layers=-1 offloads all layers to GPU)
        # n_ctx limits the context window to save VRAM. Increase this if your GPU has capacity.
        self.model_4b = Llama(
            model_path=self.qwen_4b_path, n_gpu_layers=-1, n_ctx=8192, verbose=False
        )
        self.model_7b = Llama(
            model_path=self.qwen_7b_path, n_gpu_layers=-1, n_ctx=8192, verbose=False
        )

    def __call__(self, data):
        """Run one chat completion on the model selected by the request.

        Args:
            data: The request body. Normally a dict whose ``"inputs"`` key
                holds the payload; a dict without ``"inputs"`` is treated as
                the payload itself (convenient for local testing). The
                payload is either a dict with the optional keys
                ``messages``, ``target_model`` (``"test_case"`` or
                ``"test_script"``), ``max_tokens``, ``temperature``,
                ``response_format``, ``repeat_penalty`` and ``stop``, or a
                plain string, which is sent as a single user message with
                all other options left at their defaults.

        Returns:
            Whatever the selected model's ``create_chat_completion`` returns,
            or ``{"error": "..."}`` when the payload type or ``target_model``
            is not supported.
        """
        # 1. Hugging Face puts our JSON inside the "inputs" key.
        # Falling back to `data` keeps local tests (no "inputs" wrapper) working.
        payload = data.get("inputs", data) if isinstance(data, dict) else data

        # A bare string prompt would otherwise crash on `.get` below; treat it
        # as one user turn so plain `{"inputs": "..."}` requests work.
        if isinstance(payload, str):
            payload = {"messages": [{"role": "user", "content": payload}]}
        elif not isinstance(payload, dict):
            # Report unsupported payloads the same way as an invalid
            # target_model instead of raising AttributeError.
            return {
                "error": (
                    f"Invalid payload type: '{type(payload).__name__}'. "
                    "Expected a JSON object or a string prompt."
                )
            }

        # 2. Extract parameters from the payload (.get leaves the caller's dict intact)
        messages = payload.get("messages", [{"role": "user", "content": "Hello"}])
        target_model = payload.get("target_model", "test_case")
        max_tokens = payload.get("max_tokens", 512)
        temperature = payload.get("temperature", 0.7)
        response_format = payload.get("response_format", None)
        repeat_penalty = payload.get("repeat_penalty", 1.05)
        stop = payload.get("stop", ["<|im_end|>"])

        # 3. Route request
        if target_model == "test_case":
            active_model = self.model_4b
        elif target_model == "test_script":
            active_model = self.model_7b
        else:
            return {
                "error": f"Invalid target_model: '{target_model}'. Use 'test_case' or 'test_script'."
            }

        # 4. Generate and return response
        return active_model.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            response_format=response_format,
            repeat_penalty=repeat_penalty,
            stop=stop,
        )