| from huggingface_hub import hf_hub_download
|
| from llama_cpp import Llama
|
|
|
|
|
class EndpointHandler:
    """Serve chat completions from one of two locally loaded GGUF models.

    Both models are downloaded and loaded once, at construction time; each
    call then picks one of them based on the request's ``target_model`` field.

    NOTE(review): the ``__init__(path)`` / ``__call__(data)`` shape looks like
    the Hugging Face Inference Endpoints custom-handler convention -- confirm
    against the deployment.
    """

    # Request ``target_model`` value -> name of the attribute holding the model.
    _MODEL_ATTR_BY_TARGET = {
        "test_case": "model_4b",
        "test_script": "model_7b",
    }

    def __init__(self, path=""):
        """Download both GGUF files and load them with llama.cpp.

        Args:
            path: Accepted for interface compatibility; not used here because
                the model files are fetched from the Hub by repo id instead.
        """
        self.qwen_4b_path = hf_hub_download(
            repo_id="AtomP/NewQwenTestCase",
            filename="qwen3-4b-instruct-2507.Q8_0.gguf",
        )
        self.qwen_7b_path = hf_hub_download(
            repo_id="unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF",
            filename="Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
        )

        # n_gpu_layers=-1 asks llama.cpp to offload every layer to the GPU.
        self.model_4b = Llama(
            model_path=self.qwen_4b_path,
            n_gpu_layers=-1,
            n_ctx=8192,
            verbose=False,
        )
        self.model_7b = Llama(
            model_path=self.qwen_7b_path,
            n_gpu_layers=-1,
            n_ctx=8192,
            verbose=False,
        )

    def __call__(self, data):
        """Run one chat completion and return the model's response.

        Args:
            data: Request body. The generation options are read from
                ``data["inputs"]`` when that key exists, otherwise from
                ``data`` itself. Recognised options: ``messages``,
                ``target_model`` (``"test_case"`` or ``"test_script"``),
                ``max_tokens``, ``temperature``, ``response_format``,
                ``repeat_penalty`` and ``stop``.

        Returns:
            Whatever ``create_chat_completion`` returns for the selected
            model, or ``{"error": <message>}`` when the request is malformed
            or names an unknown ``target_model``.
        """
        payload = data.get("inputs", data) if isinstance(data, dict) else data

        # A bare string or list under "inputs" has no ``.get``; report it in
        # the same error-dict style as an unknown target instead of crashing.
        if not isinstance(payload, dict):
            return {
                "error": (
                    "Invalid request: expected a JSON object of generation "
                    f"options, got {type(payload).__name__}."
                )
            }

        target_model = payload.get("target_model", "test_case")

        # Only strings can name a model; the isinstance guard keeps unhashable
        # values (e.g. a list) on the error path rather than raising TypeError.
        model_attr = (
            self._MODEL_ATTR_BY_TARGET.get(target_model)
            if isinstance(target_model, str)
            else None
        )
        if model_attr is None:
            return {
                "error": f"Invalid target_model: '{target_model}'. Use 'test_case' or 'test_script'."
            }
        active_model = getattr(self, model_attr)

        return active_model.create_chat_completion(
            messages=payload.get("messages", [{"role": "user", "content": "Hello"}]),
            max_tokens=payload.get("max_tokens", 512),
            temperature=payload.get("temperature", 0.7),
            response_format=payload.get("response_format", None),
            repeat_penalty=payload.get("repeat_penalty", 1.05),
            stop=payload.get("stop", ["<|im_end|>"]),
        )
|
|
|