Spaces:

LeonardoMdSA
/

Context-aware-NLP-classification-platform-with-MCP

Running

App Files Files Community

Context-aware-NLP-classification-platform-with-MCP / app /classification /sklearn_model.py

LeonardoMdSA's picture

changed sklearn model and added data

b45fa4d about 2 months ago

history blame contribute delete

4.02 kB

	from pathlib import Path
	import joblib
	import json
	import re
	from sklearn.pipeline import Pipeline
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.calibration import CalibratedClassifierCV
	from typing import Dict
	import os

	# -------------------------
	# Preprocessing
	# -------------------------
	try:
	    # Prefer the project's shared preprocessing when the package is importable.
	    from app.classification.preprocess import clean_text as external_clean_text
	    clean_text = external_clean_text
	except ImportError:
	    def clean_text(text: str) -> str:
	        """Normalize *text* for the classifier (local fallback implementation).

	        Steps, in order: lowercase, replace every run of digits with the
	        placeholder ``NUM``, delete non-whitespace control characters,
	        collapse every whitespace run to a single space, and trim both ends.

	        Args:
	            text: Raw input string.

	        Returns:
	            The normalized string (possibly empty).
	        """
	        text = text.lower()
	        # The placeholder is inserted after lowercasing, so it stays uppercase.
	        text = re.sub(r"\d+", "NUM", text)
	        # Delete control characters BEFORE collapsing whitespace; doing it
	        # afterwards can leave a double space behind (e.g. "a \x00 b").
	        # \x09-\x0d and \x1c-\x1f are excluded here because `\s` treats them
	        # as whitespace, so the next step turns them into a single space.
	        text = re.sub(r"[\x00-\x08\x0e-\x1b]+", "", text)
	        text = re.sub(r"\s+", " ", text)
	        return text.strip()


	class SklearnClassifier:
	    """
	    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
	    now with probability calibration.

	    On construction the classifier loads a previously saved pipeline from
	    ``MODEL_PATH`` when that file exists; otherwise it trains from the JSON
	    dataset when that exists; otherwise it stays untrained and ``predict``
	    answers with a keyword heuristic.
	    """

	    # Make MODEL_PATH absolute relative to project root
	    # (the directory two levels above the one containing this file).
	    PROJECT_ROOT = Path(__file__).resolve().parents[2]
	    MODEL_PATH = PROJECT_ROOT / "models" / "trained_pipeline.joblib"

	    # Confidence reported for heuristic answers and for failed predictions.
	    _LOW_CONFIDENCE = 0.3
	    # Placeholder confidence used when a label was predicted but the
	    # pipeline could not produce class probabilities.
	    _NO_PROBA_CONFIDENCE = 0.8

	    # Quarter marker such as "Q3" after digits were replaced by the NUM
	    # placeholder; applied to lowercased text so "qNUM" and "qnum" both match.
	    # NOTE(review): assumes the active clean_text uses "NUM"/"num" as its
	    # digit placeholder -- confirm for the external implementation.
	    _QUARTER_RE = re.compile(r"\bq\s*num\b")
	    # "hr" as a whole word only, so "three"/"through" do not count.
	    _HR_RE = re.compile(r"\bhr\b")

	    def __init__(self, dataset_path: str = None):
	        """Build the pipeline, then load a saved model or train a new one.

	        Args:
	            dataset_path: Location of the JSON training data. When ``None``,
	                ``PROJECT_ROOT / "data" / "samples" / "training_data.json"``
	                is used.
	        """
	        if dataset_path is None:
	            dataset_path = self.PROJECT_ROOT / "data" / "samples" / "training_data.json"
	        else:
	            dataset_path = Path(dataset_path)

	        # Base logistic regression
	        base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
	        # Wrap with probability calibration
	        calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')

	        self.pipeline = Pipeline([
	            ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
	            ("clf", calibrated_clf),
	        ])
	        self.is_trained = False

	        # -------------------------
	        # Load trained model if exists
	        # -------------------------
	        if self.MODEL_PATH.exists():
	            # NOTE(security): joblib.load unpickles the file and can execute
	            # arbitrary code; MODEL_PATH must only ever hold trusted artifacts.
	            self.pipeline = joblib.load(self.MODEL_PATH)
	            self.is_trained = True
	        elif dataset_path.exists():
	            self.train_from_json(dataset_path)
	        else:
	            print("[Warning] No trained model or dataset found. Using fallback logic.")

	    @classmethod
	    def _fallback_label(cls, text_clean: str) -> str:
	        """Pick a label from keywords when no trained model is available.

	        Matching is case-insensitive so that the uppercase ``NUM`` digit
	        placeholder is recognized, and ``hr`` must be a standalone word.
	        """
	        lowered = text_clean.lower()
	        if "invoice" in lowered or cls._QUARTER_RE.search(lowered):
	            return "finance.invoice"
	        if "policy" in lowered or cls._HR_RE.search(lowered):
	            return "hr.policy"
	        return "legal.contract"

	    def train_from_json(self, dataset_path: Path):
	        """Fit the pipeline on a JSON dataset and persist it to ``MODEL_PATH``.

	        Args:
	            dataset_path: Path to a UTF-8 JSON file holding a sequence of
	                objects, each with a ``"text"`` and a ``"label"`` entry.
	        """
	        # Accept plain strings as well as Path objects.
	        dataset_path = Path(dataset_path)
	        data = json.loads(dataset_path.read_text(encoding="utf-8"))
	        texts = [clean_text(d["text"]) for d in data]
	        labels = [d["label"] for d in data]

	        self.pipeline.fit(texts, labels)
	        self.is_trained = True

	        # Save model
	        self.MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
	        joblib.dump(self.pipeline, self.MODEL_PATH)

	    def predict(self, text: str) -> Dict[str, object]:
	        """Classify *text*.

	        Args:
	            text: Raw input text; it is normalized with ``clean_text`` first.

	        Returns:
	            A dict with ``"label"`` (the predicted class, or ``"unknown"`` if
	            the trained pipeline raised) and ``"confidence"`` (a float).
	        """
	        text_clean = clean_text(text)

	        if not self.is_trained:
	            # fallback heuristic
	            return {
	                "label": self._fallback_label(text_clean),
	                "confidence": self._LOW_CONFIDENCE,
	            }

	        try:
	            label = self.pipeline.predict([text_clean])[0]
	        except Exception as e:
	            print("[Error] Sklearn prediction failed:", e)
	            return {"label": "unknown", "confidence": self._LOW_CONFIDENCE}

	        # calibrated probabilities
	        try:
	            confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
	        except Exception:
	            # Best effort: keep the predicted label and report a fixed
	            # placeholder confidence when probabilities are unavailable.
	            confidence = self._NO_PROBA_CONFIDENCE

	        return {"label": label, "confidence": confidence}


	# -------------------------
	# Quick sanity check when run directly
	# -------------------------
	if __name__ == "__main__":
	    # Smoke test: build the classifier and print its answer for a few
	    # hand-written example sentences.
	    classifier = SklearnClassifier()
	    print("Is trained?", classifier.is_trained)
	    for sample_text in (
	        "Invoice for Q3 2025 amount 23923 $",
	        "HR policy update for employees",
	        "Signed legal contract for vendor",
	    ):
	        prediction = classifier.predict(sample_text)
	        print(sample_text, "->", prediction)