Spaces:

harshvisualz
/

vgecbot

Sleeping

App Files Files Community

vgecbot / app /services /classifier_service.py

harsh-dev

docker deployment

4225666 about 2 months ago

raw

history blame contribute delete

11.5 kB

	import re
	import numpy as np
	import pickle
	from sentence_transformers import SentenceTransformer
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split, cross_val_score
	from app.core.config import settings
	from typing import List, Optional

	def load_pipeline(path):
	    """Deserialize and return the pickled pipeline object stored at ``path``.

	    The file is opened in binary mode and closed again before returning.

	    NOTE(review): ``pickle.load`` can execute arbitrary code while
	    deserializing, so ``path`` should only ever point at a trusted artefact.
	    """
	    with open(path, "rb") as handle:
	        return pickle.load(handle)

	class Classifier:
	    """Multi-head query classifier that also builds retrieval metadata filters.

	    Every query is lower-cased, has its abbreviations expanded, is embedded
	    with a SentenceTransformer model and concatenated with TF-IDF features.
	    Four independent models, stored in ``self.models`` under the keys
	    ``"type"``, ``"category"``, ``"topic"`` and ``"intent"``, score the
	    resulting feature row.
	    """

	    # Fields in the order they are thresholded; each has a model in self.models.
	    _FIELDS = ("type", "category", "topic", "intent")

	    # (field, C) pairs used by train_models, in training order.
	    _TRAINING_PLAN = (
	        ("type", 0.01),
	        ("category", 0.005),
	        ("topic", 0.005),
	        ("intent", 0.005),
	    )

	    def __init__(
	        self,
	        tfidf,
	        abbreviations,
	        master_index,
	        le_type,
	        le_category,
	        le_topic,
	        le_intent,
	        models=None,
	        df=None,
	    ):
	        """Store the fitted artefacts, load the embedding model, obtain models.

	        Args:
	            tfidf: Vectorizer exposing ``transform`` / ``fit_transform`` whose
	                results provide ``toarray()``.
	            abbreviations: Mapping of abbreviation -> expansion text.
	            master_index: Mapping ``type label -> {"categories": [...],
	                "topics": [...]}`` used to constrain category/topic picks.
	            le_type: Label encoder (``inverse_transform``) for the type head.
	            le_category: Label encoder for the category head.
	            le_topic: Label encoder for the topic head.
	            le_intent: Label encoder for the intent head.
	            models: Optional mapping of already trained models keyed by field.
	            df: Optional training frame with ``question``, ``type``,
	                ``category``, ``topic`` and ``intent`` columns. Only used
	                when ``models`` is None.

	        Raises:
	            ValueError: If neither ``models`` nor ``df`` is provided.
	        """
	        # Fail fast, before the (potentially slow) embedding model is loaded.
	        if models is None and df is None:
	            raise ValueError("Either provide trained models or provide df to train.")

	        self.tfidf = tfidf
	        self.abbreviations = abbreviations
	        self.master_index = master_index

	        self.le_type = le_type
	        self.le_category = le_category
	        self.le_topic = le_topic
	        self.le_intent = le_intent

	        # Prefer a local copy of the embedding model; otherwise load by hub id.
	        model_path = settings.embeddings_path / "mdbr-leaf-mt"
	        if model_path.exists():
	            self.embedding_model = SentenceTransformer(str(model_path))
	        else:
	            self.embedding_model = SentenceTransformer("MongoDB/mdbr-leaf-mt")

	        # Prediction thresholds: below these, the field is set to None entirely
	        self.threshold = {
	            "type": 0.4,
	            "category": 0.4,
	            "topic": 0.5,
	            "intent": 0.6,
	        }

	        # Filter thresholds: above these, the field is used as a hard ChromaDB filter
	        # Kept separate so "when to predict" and "when to filter" tune independently
	        self.filter_threshold = {
	            "type": 0.65,
	            "category": 0.65,
	            "topic": 0.70,
	        }

	        if models is not None:
	            self.models = models
	        else:
	            # train_models fills this dict in place, so it must exist first.
	            self.models = {}
	            self.models = self.train_models(df)

	    def _build_filter(self, result):
	        """Turn one prediction dict into a metadata ``where`` filter.

	        Args:
	            result: One element of the list returned by :meth:`predict`.

	        Returns:
	            None when the type confidence is below ``filter_threshold["type"]``
	            (so the caller can search unfiltered); otherwise
	            ``{"$and": [type, intent, {"$or": [category, topic]}]}``.
	        """
	        # If type confidence doesn't clear the filter bar, the entire filter
	        # is unreliable, so return None and let retrieval do a full scan.
	        if result.get("type_conf", 0) < self.filter_threshold["type"]:
	            return None

	        # --- Hard AND anchors ---
	        hard_conditions = [{"type": result["type"]}]

	        # A missing intent defaults to "detail"; "count" also admits "detail".
	        intent = result.get("intent") or "detail"
	        if intent == "count":
	            hard_conditions.append({"$or": [{"intent": "count"}, {"intent": "detail"}]})
	        else:
	            hard_conditions.append({"intent": intent})

	        # --- Soft OR hints (category / topic) ---
	        # A document only needs to match ONE of these. A field that is missing
	        # or not confident enough falls back to the "general" tag, so there are
	        # always exactly two soft conditions and the result shape is fixed.
	        soft_conditions = []
	        for field in ("category", "topic"):
	            confident = (
	                result.get(field)
	                and result.get(f"{field}_conf", 0) >= self.filter_threshold[field]
	            )
	            soft_conditions.append({field: result[field] if confident else "general"})

	        # Final shape: type AND intent AND (category OR topic)
	        return {"$and": hard_conditions + [{"$or": soft_conditions}]}

	    def predict_with_filter(self, queries):
	        """Classify ``queries`` and build the filter for the FIRST query only.

	        Returns:
	            The filter from :meth:`_build_filter`, or None when that method
	            returns None or when ``queries`` is empty.
	        """
	        results = self.predict(queries)
	        if not results:
	            return None
	        return self._build_filter(results[0])

	    def expand_abbreviations(self, text):
	        """Lower-case and strip ``text``, then expand whole-word abbreviations."""
	        text = text.lower().strip()
	        for abbr, full in self.abbreviations.items():
	            # Keys are lower-cased too so they can match the lower-cased text.
	            pattern = r'\b' + re.escape(abbr.lower()) + r'\b'
	            # Callable replacement: the expansion is inserted literally, so a
	            # backslash in it is never misread as a regex group reference.
	            text = re.sub(pattern, lambda _match, _full=full: _full, text)
	        return text

	    def get_features(self, queries):
	        """Return the feature matrix ``[embedding | tf-idf]`` for ``queries``.

	        If the vectorizer has no ``vocabulary_`` attribute yet it is fitted on
	        the given queries; otherwise it is only applied.
	        """
	        queries_clean = [self.expand_abbreviations(q) for q in queries]

	        embeddings = self.embedding_model.encode(
	            queries_clean, show_progress_bar=False
	        )

	        if hasattr(self.tfidf, "vocabulary_"):
	            tfidf_features = self.tfidf.transform(queries_clean).toarray()
	        else:
	            tfidf_features = self.tfidf.fit_transform(queries_clean).toarray()

	        return np.hstack([embeddings, tfidf_features])

	    def train_single(self, X, y, field, C=0.01):
	        """Fit one logistic-regression head and print its scores.

	        The model is fitted on a stratified 80% split. Train accuracy, held-out
	        accuracy and the mean of a 5-fold cross-validation over all of ``X``
	        are printed.

	        Args:
	            X: Feature matrix.
	            y: Encoded target labels for ``field``.
	            field: Field name, used only in the printed report.
	            C: Inverse regularisation strength passed to LogisticRegression.

	        Returns:
	            The classifier fitted on the training split.
	        """
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y,
	            test_size=0.2,
	            random_state=42,
	            stratify=y,
	        )

	        clf = LogisticRegression(
	            C=C,
	            penalty="l2",
	            solver="lbfgs",
	            max_iter=2000,
	            class_weight="balanced",
	            random_state=42,
	        )

	        clf.fit(X_train, y_train)

	        train_acc = clf.score(X_train, y_train)
	        test_acc = clf.score(X_test, y_test)

	        cv_scores = cross_val_score(clf, X, y, cv=5)

	        print(f"\n{field.upper()}:")
	        print(f"Train: {train_acc:.3f} | Test: {test_acc:.3f} | CV: {cv_scores.mean():.3f}")

	        return clf

	    def train_models(self, df):
	        """Train all four heads from ``df`` and return the models mapping.

	        Args:
	            df: Frame with a ``question`` column plus one encoded label column
	                per field (``type``, ``category``, ``topic``, ``intent``).

	        Returns:
	            ``self.models``, updated in place with one fitted model per field.
	        """
	        X = self.get_features(df["question"].tolist())

	        # The mapping may not exist yet when training happens during __init__.
	        if getattr(self, "models", None) is None:
	            self.models = {}

	        for field, strength in self._TRAINING_PLAN:
	            self.models[field] = self.train_single(
	                X, df[field].values, field, C=strength
	            )

	        return self.models

	    def _allowed_labels(self, type_label, key):
	        """Return the labels ``master_index`` allows for ``type_label`` (may be empty)."""
	        try:
	            return set(self.master_index[type_label][key])
	        except KeyError:
	            # Unknown type or missing key: no constraint can be applied.
	            return set()

	    def _best_label(self, field, row, encoder, allowed=None):
	        """Pick the most probable decoded label for ``field``.

	        When ``allowed`` is a non-empty set, only labels in it compete. If none
	        of the model's labels is allowed, or ``allowed`` is None or empty, the
	        plain arg-max is used.

	        Returns:
	            ``(label, probability)`` with the probability as a Python float.
	        """
	        model = self.models[field]
	        proba = model.predict_proba([row])[0]
	        classes = model.classes_

	        if allowed:
	            labels = encoder.inverse_transform(classes)
	            candidates = [
	                (label, prob)
	                for label, prob in zip(labels, proba)
	                if label in allowed
	            ]
	            if candidates:
	                label, prob = max(candidates, key=lambda pair: pair[1])
	                return label, float(prob)

	        idx = int(np.argmax(proba))
	        return encoder.inverse_transform([classes[idx]])[0], float(proba[idx])

	    def predict(self, queries: List[str], enforce_constraints=True):
	        """Classify each query on all four fields.

	        Args:
	            queries: Query strings. A single string is treated as one query.
	            enforce_constraints: When true, category and topic are restricted
	                to the labels ``master_index`` lists for the predicted type.

	        Returns:
	            One dict per query with ``question`` plus ``<field>`` and
	            ``<field>_conf`` for every field. A field whose confidence is
	            below ``self.threshold[field]`` is reported as ``None`` with
	            confidence ``0``. An empty input yields an empty list.
	        """
	        if isinstance(queries, str):
	            # Otherwise the string would be iterated character by character.
	            queries = [queries]
	        if not queries:
	            return []

	        X = self.get_features(queries)
	        results = []

	        for i, query in enumerate(queries):
	            row = X[i]
	            res = {"question": query}

	            # ---------- TYPE ----------
	            res["type"], res["type_conf"] = self._best_label(
	                "type", row, self.le_type
	            )

	            # ---------- CATEGORY / TOPIC ----------
	            # Constraints use the raw type prediction, before thresholding.
	            for field, encoder, index_key in (
	                ("category", self.le_category, "categories"),
	                ("topic", self.le_topic, "topics"),
	            ):
	                allowed = (
	                    self._allowed_labels(res["type"], index_key)
	                    if enforce_constraints
	                    else None
	                )
	                res[field], res[f"{field}_conf"] = self._best_label(
	                    field, row, encoder, allowed
	                )

	            # ---------- INTENT ----------
	            res["intent"], res["intent_conf"] = self._best_label(
	                "intent", row, self.le_intent
	            )

	            # Blank out every field that does not clear its prediction threshold.
	            for field in self._FIELDS:
	                if res[f"{field}_conf"] < self.threshold[field]:
	                    res[field] = None
	                    res[f"{field}_conf"] = 0

	            print("=" * 50)
	            print(query)
	            print(f"Type: {res['type']}, {res['type_conf']}")
	            print(f"Category: {res['category']}, {res['category_conf']}")
	            print(f"Topic: {res['topic']}, {res['topic_conf']}")
	            print(f"Intent: {res['intent']}, {res['intent_conf']}")
	            print("=" * 50)

	            results.append(res)

	        return results


	# Module-level bootstrap: load the pickled pipeline once at import time and
	# expose its pieces, plus a ready-to-use classifier, as module attributes.
	classifier_path = settings.classifier_path / "chatbot_classifier.pkl"
	pipeline = load_pipeline(classifier_path)

	# Trained per-field models and the TF-IDF vectorizer.
	models, tfidf = (pipeline[key] for key in ("models", "tfidf"))

	# Label encoders, one per predicted field.
	le_type, le_category, le_topic, le_intent = (
	    pipeline[key] for key in ("le_type", "le_category", "le_topic", "le_intent")
	)

	# Lookup tables stored alongside the models in the pickle.
	MASTER_INDEX, ABBREVIATIONS = (
	    pipeline[key] for key in ("MASTER_INDEX", "ABBREVIATIONS")
	)

	# Models are supplied, so no training happens here.
	clf = Classifier(
	    tfidf=tfidf,
	    abbreviations=ABBREVIATIONS,
	    master_index=MASTER_INDEX,
	    le_type=le_type,
	    le_category=le_category,
	    le_topic=le_topic,
	    le_intent=le_intent,
	    models=models,
	)