Spaces:
Running
Running
File size: 2,861 Bytes
2e03471 c71f038 2e03471 c71f038 375efe1 beba557 c71f038 2e03471 375efe1 2e03471 375efe1 2e03471 375efe1 2e03471 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | import time
import re
from llama_cpp import Llama
from api.config import LABELS
import logging
# Configure logging at import time and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Repository id passed to Llama.from_pretrained as `repo_id`.
MODEL_PATH = "cngchis/phi4-mini-intent-GGUF"

# Try to load the model once at import time. Any failure is logged and
# recorded in `gguf_available` instead of aborting the import, so callers
# must check that flag before touching `llm` (it is left undefined on failure).
# NOTE(review): n_ctx=128 looks small if it bounds prompt + completion
# tokens -- confirm against the llama-cpp-python documentation.
try:
    llm = Llama.from_pretrained(
        verbose=False,
        n_threads=2,
        n_ctx=128,
        filename="*q4_k_m*",
        repo_id=MODEL_PATH,
    )
    gguf_available = True
    logger.info(f"GGUF model loaded from {MODEL_PATH}")
except Exception as load_error:
    logger.warning(f"GGUF load failed: {load_error}")
    gguf_available = False
def format_prompt(text: str) -> str:
    """Build the classification prompt for a single user message.

    The prompt starts with a newline, lists the six candidate labels,
    embeds ``text`` under a ``Message:`` heading and ends with a
    ``Label:`` line followed by a newline.
    """
    instruction = "\nClassify intent into one label:\n"
    label_menu = "api, billing, cancellation, complaint, technical, upgrade\n"
    message_part = f"Message:\n{text}\n"
    answer_cue = "Label:\n"
    return instruction + label_menu + message_part + answer_cue
def extract_label(raw: str) -> str:
    """Pull the intent label out of raw model output.

    Complete ``<thought>...</thought>`` and ``<think>...</think>`` spans are
    removed first, then the remaining text is stripped and lower-cased.
    The last whole-word occurrence of a known label is returned, or
    ``"unknown"`` when none of the labels appears.
    """
    # Remove each kind of reasoning span in turn (thought first, then think).
    for span_pattern in (r'<thought>.*?</thought>', r'<think>.*?</think>'):
        raw = re.sub(span_pattern, '', raw, flags=re.DOTALL)

    normalized = raw.strip().lower()

    # Walk every match and keep overwriting, so the final value is the last one.
    label = "unknown"
    for hit in re.finditer(
        r"\b(api|billing|cancellation|complaint|technical|upgrade)\b", normalized
    ):
        label = hit.group(1)
    return label
def infer_phi4(text: str) -> dict:
    """Classify ``text`` with the GGUF model and report the outcome.

    Returns a dict with the keys ``intent``, ``latency_ms``, ``confidence``
    and ``model``. When the model was not loaded, or the call raises, the
    result is ``intent="unknown"`` with zero latency and zero confidence;
    errors are logged rather than propagated.
    """
    failure = {"intent": "unknown", "latency_ms": 0, "confidence": 0.0, "model": "phi4_gguf"}
    if not gguf_available:
        return failure

    try:
        # perf_counter is monotonic, so the measured interval cannot go
        # negative or jump if the system clock is adjusted mid-call.
        start = time.perf_counter()
        output = llm(
            format_prompt(text),
            max_tokens=5,
            temperature=0.1,
            stop=["</s>", "[/INST]"],
        )
        raw = output["choices"][0]["text"]
        predicted = extract_label(raw)
        latency = (time.perf_counter() - start) * 1000
    except Exception as e:
        logger.error(f"GGUF ERROR: {e}")
        return failure

    # 0.93 is a fixed placeholder, not a score derived from the model output.
    # It is only reported when a label was actually extracted, so an
    # "unknown" prediction carries 0.0 like the other "unknown" paths.
    confidence = 0.93 if predicted != "unknown" else 0.0
    return {
        "intent": predicted,
        "latency_ms": round(latency, 2),
        "confidence": confidence,
        "model": "phi4_gguf",
    }
def infer(text: str) -> dict:
    """Run every configured model on ``text`` and merge the predictions.

    Results whose intent is ``"unknown"`` are ignored for voting. The final
    intent is the label with the most votes (the first one seen wins a tie)
    and the confidence is the mean confidence of the results that voted for
    it. If no result is usable, the intent is ``"unknown"`` with confidence
    0.0.

    Returns a dict with ``intent``, ``confidence`` (rounded to 3 places),
    ``model_results`` (the per-model dicts), ``latency_ms`` (total, rounded
    to 2 places) and ``individual_latencies`` keyed by model name.
    """
    # perf_counter is monotonic, so the interval is immune to clock changes.
    start = time.perf_counter()
    results = [infer_phi4(text)]
    total_latency = (time.perf_counter() - start) * 1000

    valid_results = [r for r in results if r["intent"] != "unknown"]
    if not valid_results:
        final_intent = "unknown"
        confidence = 0.0
    else:
        intent_votes = {}
        for r in valid_results:
            intent_votes[r["intent"]] = intent_votes.get(r["intent"], 0) + 1
        final_intent = max(intent_votes, key=intent_votes.get)
        # Average only over results that agree with the winner; a result
        # that voted for a different label says nothing about this one.
        agreeing = [r["confidence"] for r in valid_results if r["intent"] == final_intent]
        confidence = sum(agreeing) / len(agreeing)

    return {
        "intent": final_intent,
        "confidence": round(confidence, 3),
        "model_results": results,
        "latency_ms": round(total_latency, 2),
        "individual_latencies": {r["model"]: r["latency_ms"] for r in results},
    }