"""Intent classification served by a GGUF model through llama-cpp-python.

The model is loaded once at import time. If loading fails, the module
stays importable and every inference call returns an "unknown" result.
"""

import logging
import re
import time
from collections import Counter

from llama_cpp import Llama

# LABELS is not referenced in this module; the import is kept as-is.
from api.config import LABELS

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# DEFINE GGUF MODEL PATH
MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

# Identifier reported in the "model" field of every per-model result.
_MODEL_NAME = "phi4_gguf"

# Labels the prompt offers and the parser accepts. Keeping a single tuple
# guarantees the prompt text and the extraction regex cannot drift apart.
_INTENT_LABELS = (
    "api",
    "billing",
    "cancellation",
    "complaint",
    "technical",
    "upgrade",
)

# Compiled once: matches any known label as a whole word.
_LABEL_PATTERN = re.compile(
    r"\b(" + "|".join(re.escape(label) for label in _INTENT_LABELS) + r")\b"
)

# Stop sequences passed to the model. The list previously also held an empty
# string; an empty stop sequence is a substring of any generated text, so it
# ends generation before any label is produced.
# NOTE(review): the empty entry may have been a lost end-of-sequence marker;
# restore the real marker here if the model needs one.
_STOP_SEQUENCES = ["[/INST]"]

# Fixed score reported for any recognised label; it is not derived from the
# model's token probabilities.
_DEFAULT_CONFIDENCE = 0.93

# LOAD GGUF MODEL
try:
    llm = Llama.from_pretrained(
        repo_id=MODEL_PATH,
        filename="*q4_k_m*",
        n_ctx=128,
        n_threads=2,
        verbose=False,
    )
    gguf_available = True
    logger.info("GGUF model loaded from %s", MODEL_PATH)
except Exception as e:  # best effort: keep the module importable
    logger.warning("GGUF load failed: %s", e)
    llm = None
    gguf_available = False


def _unknown_result() -> dict:
    """Return the per-model result used when no prediction is available."""
    return {
        "intent": "unknown",
        "latency_ms": 0,
        "confidence": 0.0,
        "model": _MODEL_NAME,
    }


def format_prompt(text: str) -> str:
    """Build the classification prompt for a single user message.

    Args:
        text: The raw message to classify.

    Returns:
        The prompt string, ending with a "Label:" cue for the model.
    """
    label_list = ", ".join(_INTENT_LABELS)
    # NOTE(review): the line breaks in this prompt were reconstructed from a
    # whitespace-collapsed source; confirm against the layout the model was
    # tuned on.
    return f"""
Classify intent into one label:
{label_list}

Message: {text}
Label:
"""


def extract_label(raw: str) -> str:
    """Pick the intent label out of raw model output.

    Args:
        raw: Text generated by the model.

    Returns:
        The last known label that appears as a whole word in ``raw``
        (case-insensitive), or "unknown" if none is present.
    """
    # Two earlier re.sub(r'.*?', '', ...) calls were removed: that pattern
    # only ever matches the empty string, so they never changed the text.
    matches = _LABEL_PATTERN.findall(raw.strip().lower())
    return matches[-1] if matches else "unknown"


def infer_phi4(text: str) -> dict:
    """Classify ``text`` with the GGUF model.

    Args:
        text: The message to classify.

    Returns:
        A dict with keys "intent", "latency_ms", "confidence" and "model".
        When the model is unavailable or the call raises, "intent" is
        "unknown" with zero latency and zero confidence.
    """
    if not gguf_available:
        return _unknown_result()

    try:
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments.
        started = time.perf_counter()
        output = llm(
            format_prompt(text),
            max_tokens=5,
            temperature=0.1,
            stop=_STOP_SEQUENCES,
        )
        predicted = extract_label(output["choices"][0]["text"])
        latency = (time.perf_counter() - started) * 1000
    except Exception as e:
        logger.exception("GGUF ERROR: %s", e)
        return _unknown_result()

    return {
        "intent": predicted,
        "latency_ms": round(latency, 2),
        # Do not report a high score when no label could be extracted.
        "confidence": _DEFAULT_CONFIDENCE if predicted != "unknown" else 0.0,
        "model": _MODEL_NAME,
    }


def infer(text: str) -> dict:
    """Run every configured model on ``text`` and combine their answers.

    Only one model (``infer_phi4``) is queried at present. Results whose
    intent is "unknown" are excluded from the vote.

    Args:
        text: The message to classify.

    Returns:
        A dict with the winning "intent", the mean "confidence" of the valid
        results, the raw "model_results", the total "latency_ms", and
        "individual_latencies" keyed by model name.
    """
    started = time.perf_counter()
    results = [infer_phi4(text)]
    total_latency = (time.perf_counter() - started) * 1000

    valid_results = [r for r in results if r["intent"] != "unknown"]

    if valid_results:
        votes = Counter(r["intent"] for r in valid_results)
        # max() returns the first maximal key, so ties resolve to the intent
        # that was seen first.
        final_intent = max(votes, key=votes.get)
        confidence = sum(r["confidence"] for r in valid_results) / len(valid_results)
    else:
        final_intent = "unknown"
        confidence = 0.0

    return {
        "intent": final_intent,
        "confidence": round(confidence, 3),
        "model_results": results,
        "latency_ms": round(total_latency, 2),
        "individual_latencies": {r["model"]: r["latency_ms"] for r in results},
    }