Spaces:

LeonardoMdSA
/

Context-aware-NLP-classification-platform-with-MCP

Running

LeonardoMdSA committed on Jan 7

Commit

b45fa4d

1 Parent(s): e3754ce

changed sklearn model and added data

Files changed (3) hide show

app/classification/llm_adapter.py CHANGED Viewed

@@ -18,5 +18,5 @@ class LLMAdapter:
             result["label"] = "finance.invoice"
         # optionally adjust confidence
         if context and context.get("policies_applied"):
-            result["confidence"] = min(result["confidence"] + 0.1, 0.99)
         return result

             result["label"] = "finance.invoice"
         # optionally adjust confidence
         if context and context.get("policies_applied"):
+            result["confidence"] = min(result["confidence"] + 0.05, 0.99)
         return result

app/classification/sklearn_model.py CHANGED Viewed

@@ -5,6 +5,7 @@ import re
 from sklearn.pipeline import Pipeline
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from typing import Dict
 import os
@@ -25,7 +26,8 @@ except ImportError:
 class SklearnClassifier:
     """
-    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal.
     """
     # Make MODEL_PATH absolute relative to project root
@@ -38,9 +40,14 @@ class SklearnClassifier:
         else:
             dataset_path = Path(dataset_path)
         self.pipeline = Pipeline([
             ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
-            ("clf", LogisticRegression(max_iter=500, class_weight='balanced', C=1.0))
         ])
         self.is_trained = False
@@ -72,7 +79,7 @@ class SklearnClassifier:
         if self.is_trained:
             try:
                 label = self.pipeline.predict([text_clean])[0]
-                # fallback if pipeline has no predict_proba (safety)
                 try:
                     confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
                 except Exception:

 from sklearn.pipeline import Pipeline
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
+from sklearn.calibration import CalibratedClassifierCV
 from typing import Dict
 import os
 class SklearnClassifier:
     """
+    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
+    now with probability calibration.
     """
     # Make MODEL_PATH absolute relative to project root
         else:
             dataset_path = Path(dataset_path)
+        # Base logistic regression
+        base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
+        # Wrap with probability calibration
+        calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')
         self.pipeline = Pipeline([
             ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
+            ("clf", calibrated_clf)
         ])
         self.is_trained = False
         if self.is_trained:
             try:
                 label = self.pipeline.predict([text_clean])[0]
+                # calibrated probabilities
                 try:
                     confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
                 except Exception:

models/trained_pipeline.joblib CHANGED Viewed

Binary files a/models/trained_pipeline.joblib and b/models/trained_pipeline.joblib differ