# Support-ticket-router/src/api/classifier.py
# cngchis: "Update src/api/classifier.py" (commit 375efe1, verified)
import logging
import re
import time
from collections import Counter

from llama_cpp import Llama

from api.config import LABELS
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Repository id passed to Llama.from_pretrained to locate the GGUF weights.
MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

# Load the GGUF model once at import time.
# Both names are bound up front so they always exist, even when loading fails;
# callers must check `gguf_available` before invoking `llm`.
llm = None
gguf_available = False
try:
    # NOTE(review): n_ctx=128 bounds prompt + completion tokens; long ticket
    # messages may exceed it and make inference raise -- confirm this is intended.
    llm = Llama.from_pretrained(
        repo_id=MODEL_PATH,
        filename="*q4_k_m*",
        n_ctx=128,
        n_threads=2,
        verbose=False,
    )
except Exception as e:
    # Deliberate best-effort: the module stays importable without a model and
    # the inference functions report "unknown" instead.
    logger.warning("GGUF load failed: %s", e)
else:
    gguf_available = True
    logger.info("GGUF model loaded from %s", MODEL_PATH)
def format_prompt(text: str) -> str:
    """Build the classification prompt for a single message.

    The prompt starts with a blank line, lists the six allowed labels, embeds
    *text* under ``Message:`` and ends with ``Label:`` followed by a newline.
    """
    header = (
        "\n"
        "Classify intent into one label:\n"
        "api, billing, cancellation, complaint, technical, upgrade\n"
        "Message:\n"
    )
    return f"{header}{text}\nLabel:\n"
# Matches a <thought>/<think> reasoning span, case-insensitively. The span ends
# at its matching closing tag or, when generation was cut off before the tag
# was closed, at the end of the text.
_REASONING_RE = re.compile(
    r"<(thought|think)>.*?(?:</\1>|\Z)",
    flags=re.DOTALL | re.IGNORECASE,
)
# Whole-word match for any of the known intent labels (input is lowercased first).
_LABEL_RE = re.compile(
    r"\b(api|billing|cancellation|complaint|technical|upgrade)\b"
)


def extract_label(raw: str) -> str:
    """Extract the intent label from raw model output.

    Reasoning spans (``<thought>...</thought>`` / ``<think>...</think>``,
    including unterminated ones) are discarded so that labels merely mentioned
    while "thinking" are not mistaken for the answer. The remaining text is
    lowercased and searched for known labels as whole words.

    Args:
        raw: Text generated by the model; may be empty.

    Returns:
        The last known label found in the visible text, or ``"unknown"`` when
        the input is empty or contains no label.
    """
    if not raw:
        return "unknown"
    visible = _REASONING_RE.sub("", raw).strip().lower()
    matches = _LABEL_RE.findall(visible)
    # The last mention wins: it is taken as the model's final answer.
    return matches[-1] if matches else "unknown"
def _unknown_phi4_result() -> dict:
    """Return the fallback payload used when the GGUF model cannot answer."""
    return {
        "intent": "unknown",
        "latency_ms": 0,
        "confidence": 0.0,
        "model": "phi4_gguf",
    }


def infer_phi4(text: str) -> dict:
    """Classify *text* with the GGUF model.

    Args:
        text: The message to classify.

    Returns:
        A dict with keys ``intent`` (a known label or ``"unknown"``),
        ``latency_ms``, ``confidence`` and ``model``. When the model is not
        loaded or inference raises, the fallback payload (``"unknown"``,
        zero latency, zero confidence) is returned instead of propagating
        the error.
    """
    if not gguf_available:
        return _unknown_phi4_result()

    try:
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        started = time.perf_counter()
        output = llm(
            format_prompt(text),
            max_tokens=5,
            temperature=0.1,
            stop=["</s>", "[/INST]"],
        )
        raw = output["choices"][0]["text"]
        predicted = extract_label(raw)
        latency = (time.perf_counter() - started) * 1000
    except Exception as e:
        # Best-effort boundary: log with traceback and degrade to "unknown".
        logger.exception("GGUF ERROR: %s", e)
        return _unknown_phi4_result()

    return {
        "intent": predicted,
        "latency_ms": round(latency, 2),
        # Fixed constant; it is not derived from the model output.
        "confidence": 0.93,
        "model": "phi4_gguf",
    }
def infer(text: str) -> dict:
    """Run every classifier on *text* and combine their answers by vote.

    Results whose intent is ``"unknown"`` are ignored when voting.

    Args:
        text: The message to classify.

    Returns:
        A dict with:
          - ``intent``: the most-voted label (first seen wins ties), or
            ``"unknown"`` when no classifier produced a label;
          - ``confidence``: mean confidence of the non-unknown results,
            rounded to 3 decimals (``0.0`` when there are none);
          - ``model_results``: the raw per-model result dicts;
          - ``latency_ms``: total elapsed time, rounded to 2 decimals;
          - ``individual_latencies``: ``{model name: latency_ms}``.
    """
    # perf_counter is monotonic, unlike time.time(), so the measured interval
    # is unaffected by system clock adjustments.
    start = time.perf_counter()
    results = [infer_phi4(text)]
    total_latency = (time.perf_counter() - start) * 1000

    valid_results = [r for r in results if r["intent"] != "unknown"]
    if valid_results:
        votes = Counter(r["intent"] for r in valid_results)
        # max() over the Counter keeps insertion order for ties.
        final_intent = max(votes, key=votes.get)
        confidence = sum(r["confidence"] for r in valid_results) / len(valid_results)
    else:
        final_intent = "unknown"
        confidence = 0.0

    return {
        "intent": final_intent,
        "confidence": round(confidence, 3),
        "model_results": results,
        "latency_ms": round(total_latency, 2),
        "individual_latencies": {r["model"]: r["latency_ms"] for r in results},
    }