AtomP
/

QwenCombined

Model card Files Files and versions

QwenCombined / handler.py

AtomP's picture

Upload handler.py

0e0974d verified 3 months ago

history blame contribute delete

2.47 kB

	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama


	class EndpointHandler:
	def __init__(self, path=""):
	# 1. Download GGUF files from the Hub
	self.qwen_4b_path = hf_hub_download(
	repo_id="AtomP/NewQwenTestCase", filename="qwen3-4b-instruct-2507.Q8_0.gguf"
	)
	self.qwen_7b_path = hf_hub_download(
	repo_id="unsloth/Qwen2.5-Coder-7B-Instruct-128K-GGUF",
	filename="Qwen2.5-Coder-7B-Instruct-Q8_0.gguf",
	)

	# 2. Load models into GPU memory (n_gpu_layers=-1 offloads all layers to GPU)
	# n_ctx limits the context window to save VRAM. Increase this if your GPU has capacity.
	self.model_4b = Llama(
	model_path=self.qwen_4b_path, n_gpu_layers=-1, n_ctx=8192, verbose=False
	)

	self.model_7b = Llama(
	model_path=self.qwen_7b_path, n_gpu_layers=-1, n_ctx=8192, verbose=False
	)

	def __call__(self, data):
	# 1. Hugging Face puts our JSON inside the "inputs" key.
	# We use .get("inputs", data) so it still works gracefully if tested locally.
	payload = data.get("inputs", data)

	# 2. Extract parameters from the payload (using .get instead of .pop is safer here)
	messages = payload.get("messages", [{"role": "user", "content": "Hello"}])
	target_model = payload.get("target_model", "test_case")
	max_tokens = payload.get("max_tokens", 512)
	temperature = payload.get("temperature", 0.7)
	response_format = payload.get("response_format", None)
	repeat_penalty = payload.get("repeat_penalty", 1.05)
	stop = payload.get("stop", ["<\|im_end\|>"])

	# 3. Route request
	if target_model == "test_case":
	active_model = self.model_4b
	elif target_model == "test_script":
	active_model = self.model_7b
	else:
	return {
	"error": f"Invalid target_model: '{target_model}'. Use 'test_case' or 'test_script'."
	}

	# 4. Generate and return response
	response = active_model.create_chat_completion(
	messages=messages,
	max_tokens=max_tokens,
	temperature=temperature,
	response_format=response_format, # Don't forget to pass this!
	repeat_penalty=repeat_penalty,
	stop=stop
	)

	return response