Spaces:
Running
Running
| import time | |
| import re | |
| from llama_cpp import Llama | |
| from api.config import LABELS | |
| import logging | |
# Configure logging at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Repo id handed to ``Llama.from_pretrained`` to locate the GGUF weights.
MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

# Bind both names up front so they always exist, even when loading fails.
# Callers must still check ``gguf_available`` before invoking ``llm``.
llm = None
gguf_available = False

# Best-effort load: any failure is logged as a warning and leaves the module
# importable with ``gguf_available`` set to False.
try:
    llm = Llama.from_pretrained(
        repo_id=MODEL_PATH,
        filename="*q4_k_m*",
        # NOTE(review): a 128-token context must hold the whole prompt plus
        # the generated tokens; long messages presumably fail at inference
        # time -- confirm this limit is intentional.
        n_ctx=128,
        n_threads=2,
        verbose=False,
    )
except Exception as e:
    logger.warning("GGUF load failed: %s", e)
else:
    gguf_available = True
    logger.info("GGUF model loaded from %s", MODEL_PATH)
def format_prompt(text: str) -> str:
    """Build the intent-classification prompt for one message.

    The prompt starts with a blank line, lists the six candidate labels,
    embeds *text* verbatim under ``Message:`` and ends with ``Label:``
    followed by a newline.
    """
    prompt_lines = (
        "",  # leading newline
        "Classify intent into one label:",
        "api, billing, cancellation, complaint, technical, upgrade",
        "Message:",
        f"{text}",
        "Label:",
        "",  # trailing newline
    )
    return "\n".join(prompt_lines)
def extract_label(raw: str) -> str:
    """Pull the predicted intent label out of raw model output.

    Closed ``<thought>...</thought>`` and ``<think>...</think>`` sections are
    removed first (tag matching is case-sensitive), then the remainder is
    trimmed and lower-cased.  The last whole-word occurrence of a known label
    is returned; ``"unknown"`` is returned when no label is present.
    """
    cleaned = raw
    # Strip reasoning sections one tag family at a time, in this order.
    for reasoning_block in (r'<thought>.*?</thought>', r'<think>.*?</think>'):
        cleaned = re.sub(reasoning_block, '', cleaned, flags=re.DOTALL)
    cleaned = cleaned.strip().lower()

    label_pattern = r"\b(api|billing|cancellation|complaint|technical|upgrade)\b"
    found = None
    for hit in re.finditer(label_pattern, cleaned):
        found = hit.group(1)  # keep overwriting so the final match wins
    return "unknown" if found is None else found
def _phi4_failure() -> dict:
    """Return a fresh placeholder result for when no prediction is available."""
    return {"intent": "unknown", "latency_ms": 0, "confidence": 0.0, "model": "phi4_gguf"}


def infer_phi4(text: str) -> dict:
    """Classify *text* with the GGUF model and report the outcome.

    Args:
        text: The message to classify.

    Returns:
        A dict with keys ``intent``, ``latency_ms``, ``confidence`` and
        ``model``.  When the model is not loaded or inference raises, the
        placeholder result (``"unknown"``, latency 0, confidence 0.0) is
        returned instead; exceptions are logged, never propagated.
    """
    if not gguf_available:
        return _phi4_failure()

    try:
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments.
        start = time.perf_counter()
        output = llm(
            format_prompt(text),
            max_tokens=5,
            temperature=0.1,
            stop=["</s>", "[/INST]"],
        )
        raw = output["choices"][0]["text"]
        predicted = extract_label(raw)
        latency = (time.perf_counter() - start) * 1000
    except Exception as e:
        # Deliberate best-effort boundary: log with traceback and degrade to
        # the placeholder result rather than crash the caller.
        logger.exception("GGUF ERROR: %s", e)
        return _phi4_failure()

    # 0.93 is a fixed placeholder score, not a probability from the model.
    # It is only reported when a label was actually recognised, so an
    # "unknown" result never carries a non-zero confidence.
    confidence = 0.93 if predicted != "unknown" else 0.0
    return {
        "intent": predicted,
        "latency_ms": round(latency, 2),
        "confidence": confidence,
        "model": "phi4_gguf",
    }
def infer(text: str) -> dict:
    """Run every configured model on *text* and combine their predictions.

    Args:
        text: The message to classify.

    Returns:
        A dict with:
          * ``intent`` -- the label with the most votes among results whose
            intent is not ``"unknown"`` (the first such label seen wins a
            tie), or ``"unknown"`` when no model produced a label;
          * ``confidence`` -- mean confidence of those non-unknown results,
            rounded to 3 places (0.0 when there are none);
          * ``model_results`` -- the raw per-model result dicts;
          * ``latency_ms`` -- total elapsed time in milliseconds;
          * ``individual_latencies`` -- mapping of model name to its latency.
    """
    # perf_counter is monotonic, unlike time.time(), so the elapsed interval
    # cannot go negative or jump when the system clock is adjusted.
    start = time.perf_counter()
    results = [infer_phi4(text)]
    total_latency = (time.perf_counter() - start) * 1000

    valid_results = [r for r in results if r["intent"] != "unknown"]
    if valid_results:
        # Tally one vote per model for the intent it predicted.
        intent_votes = {}
        for r in valid_results:
            intent_votes[r["intent"]] = intent_votes.get(r["intent"], 0) + 1
        final_intent = max(intent_votes, key=intent_votes.get)
        confidence = sum(r["confidence"] for r in valid_results) / len(valid_results)
    else:
        final_intent = "unknown"
        confidence = 0.0

    return {
        "intent": final_intent,
        "confidence": round(confidence, 3),
        "model_results": results,
        "latency_ms": round(total_latency, 2),
        "individual_latencies": {r["model"]: r["latency_ms"] for r in results},
    }