Spaces:

cngchis
/

Support-ticket-router

Running

App Files Files Community

Support-ticket-router / src /api /classifier.py

cngchis

Update src/api/classifier.py

375efe1 verified 12 days ago

raw

history blame contribute delete

2.86 kB

	import time
	import re
	from llama_cpp import Llama
	from api.config import LABELS
	import logging

	# Configure logging at import time and create this module's logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Value passed as `repo_id` to Llama.from_pretrained below; presumably a
	# Hugging Face Hub repository id rather than a local path -- TODO confirm.
	MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

	# Load the GGUF model once, when this module is imported. Any exception is
	# logged as a warning and swallowed: `gguf_available` becomes False and `llm`
	# is left undefined, so code using `llm` must check `gguf_available` first.
	try:
	llm = Llama.from_pretrained(
	repo_id=MODEL_PATH,
	# NOTE(review): llama-cpp-python documents `filename` as a file name or
	# glob matched against the files in the repo. Confirm the repo really has
	# a file named exactly "q4_k_m"; otherwise a pattern is needed.
	filename="q4_k_m",
	# NOTE(review): a 128-token context has to hold the prompt template, the
	# user message and the generated tokens. Longer messages will presumably
	# be rejected by the model call -- confirm this limit is intentional.
	n_ctx=128,
	n_threads=2,
	verbose=False,
	)
	gguf_available = True
	logger.info(f"GGUF model loaded from {MODEL_PATH}")

	except Exception as e:
	logger.warning(f"GGUF load failed: {e}")
	gguf_available = False

	# Build the classification prompt for a single user message.
	#
	# The six candidate labels are hard-coded in the template text and `text` is
	# interpolated verbatim under "Message:". The template ends with "Label:",
	# presumably so the model's completion is the label itself.
	#
	# NOTE(review): every character inside the triple-quoted literal, including
	# leading whitespace and blank lines, is part of the prompt sent to the model.
	# NOTE(review): the label list here is repeated in extract_label's regex;
	# `LABELS` imported from api.config is not used in the visible code.
	def format_prompt(text: str) -> str:
	return f"""
	Classify intent into one label:
	api, billing, cancellation, complaint, technical, upgrade

	Message:
	{text}

	Label:
	"""

	def extract_label(raw: str) -> str:
	    """Return the intent label found in raw model output.

	    Complete ``<thought>...</thought>`` and ``<think>...</think>`` blocks are
	    removed first, then the remaining text is stripped and lower-cased before
	    searching for one of the six known labels as a whole word.

	    Args:
	        raw: Text produced by the model.

	    Returns:
	        The last label mentioned in the cleaned text, or ``"unknown"`` when
	        none of the labels appears.
	    """
	    # Drop reasoning blocks so labels mentioned while "thinking" are ignored.
	    for tag in ("thought", "think"):
	        raw = re.sub(rf"<{tag}>.*?</{tag}>", "", raw, flags=re.DOTALL)
	    cleaned = raw.strip().lower()

	    # The alternation bars must be unescaped: an escaped bar matches a literal
	    # "|" character, which made the previous pattern match only the whole
	    # pipe-joined string and therefore never a single label.
	    matches = re.findall(
	        r"\b(api|billing|cancellation|complaint|technical|upgrade)\b", cleaned
	    )

	    # When several labels appear, the last one mentioned wins.
	    return matches[-1] if matches else "unknown"

	def infer_phi4(text: str) -> dict:
	    """Classify ``text`` with the GGUF model and time the call.

	    Args:
	        text: User message to classify.

	    Returns:
	        A dict with the keys ``intent``, ``latency_ms``, ``confidence`` and
	        ``model``. When the model is unavailable or the call raises, the
	        result is ``intent="unknown"`` with zero latency and confidence.
	        Exceptions are logged, never propagated.
	    """
	    fallback = {
	        "intent": "unknown",
	        "latency_ms": 0,
	        "confidence": 0.0,
	        "model": "phi4_gguf",
	    }
	    if not gguf_available:
	        return fallback

	    try:
	        # perf_counter is monotonic, so the measured duration cannot be skewed
	        # by wall-clock adjustments the way time.time() differences can.
	        start = time.perf_counter()
	        output = llm(
	            format_prompt(text),
	            max_tokens=5,
	            temperature=0.1,
	            stop=["</s>", "[/INST]"],
	        )
	        raw = output["choices"][0]["text"]
	        predicted = extract_label(raw)
	        latency = (time.perf_counter() - start) * 1000
	    except Exception as e:
	        # Best-effort boundary: keep the service answering, but record the
	        # traceback so the failure can be diagnosed.
	        logger.exception("GGUF ERROR: %s", e)
	        return fallback

	    return {
	        "intent": predicted,
	        "latency_ms": round(latency, 2),
	        # 0.93 is a fixed placeholder, not a score derived from the model.
	        # An unrecognised answer gets 0.0, matching the other unknown paths.
	        "confidence": 0.93 if predicted != "unknown" else 0.0,
	        "model": "phi4_gguf",
	    }

	def infer(text: str) -> dict:
	    """Run every configured model on ``text`` and combine their answers.

	    Only ``infer_phi4`` is run at present. Results whose intent is
	    ``"unknown"`` are excluded from voting.

	    Args:
	        text: User message to classify.

	    Returns:
	        A dict with:
	        ``intent`` -- the most frequent intent among valid results, or
	        ``"unknown"`` when there are none;
	        ``confidence`` -- mean confidence of the valid results, rounded to
	        three decimals (0.0 when there are none);
	        ``model_results`` -- the raw per-model result dicts;
	        ``latency_ms`` -- total elapsed time for all models, in milliseconds;
	        ``individual_latencies`` -- per-model latency keyed by model name.
	    """
	    # perf_counter is monotonic, so the elapsed time is immune to wall-clock
	    # adjustments that can distort time.time() differences.
	    start = time.perf_counter()
	    results = [infer_phi4(text)]
	    total_latency = (time.perf_counter() - start) * 1000

	    valid_results = [r for r in results if r["intent"] != "unknown"]

	    if valid_results:
	        # Majority vote; on a tie the intent seen first wins.
	        intent_votes = {}
	        for r in valid_results:
	            intent_votes[r["intent"]] = intent_votes.get(r["intent"], 0) + 1
	        final_intent = max(intent_votes, key=intent_votes.get)
	        confidence = sum(r["confidence"] for r in valid_results) / len(valid_results)
	    else:
	        final_intent = "unknown"
	        confidence = 0.0

	    return {
	        "intent": final_intent,
	        "confidence": round(confidence, 3),
	        "model_results": results,
	        "latency_ms": round(total_latency, 2),
	        "individual_latencies": {r["model"]: r["latency_ms"] for r in results},
	    }