Commit
·
b45fa4d
1
Parent(s):
e3754ce
Changed sklearn model (wrapped LogisticRegression in CalibratedClassifierCV for probability calibration) and added data
Browse files
app/classification/llm_adapter.py
CHANGED
|
@@ -18,5 +18,5 @@ class LLMAdapter:
|
|
| 18 |
result["label"] = "finance.invoice"
|
| 19 |
# optionally adjust confidence
|
| 20 |
if context and context.get("policies_applied"):
|
| 21 |
-
result["confidence"] = min(result["confidence"] + 0.
|
| 22 |
return result
|
|
|
|
| 18 |
result["label"] = "finance.invoice"
|
| 19 |
# optionally adjust confidence
|
| 20 |
if context and context.get("policies_applied"):
|
| 21 |
+
result["confidence"] = min(result["confidence"] + 0.05, 0.99)
|
| 22 |
return result
|
app/classification/sklearn_model.py
CHANGED
|
@@ -5,6 +5,7 @@ import re
|
|
| 5 |
from sklearn.pipeline import Pipeline
|
| 6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 7 |
from sklearn.linear_model import LogisticRegression
|
|
|
|
| 8 |
from typing import Dict
|
| 9 |
import os
|
| 10 |
|
|
@@ -25,7 +26,8 @@ except ImportError:
|
|
| 25 |
|
| 26 |
class SklearnClassifier:
|
| 27 |
"""
|
| 28 |
-
Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal
|
|
|
|
| 29 |
"""
|
| 30 |
|
| 31 |
# Make MODEL_PATH absolute relative to project root
|
|
@@ -38,9 +40,14 @@ class SklearnClassifier:
|
|
| 38 |
else:
|
| 39 |
dataset_path = Path(dataset_path)
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
self.pipeline = Pipeline([
|
| 42 |
("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
|
| 43 |
-
("clf",
|
| 44 |
])
|
| 45 |
self.is_trained = False
|
| 46 |
|
|
@@ -72,7 +79,7 @@ class SklearnClassifier:
|
|
| 72 |
if self.is_trained:
|
| 73 |
try:
|
| 74 |
label = self.pipeline.predict([text_clean])[0]
|
| 75 |
-
#
|
| 76 |
try:
|
| 77 |
confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
|
| 78 |
except Exception:
|
|
|
|
| 5 |
from sklearn.pipeline import Pipeline
|
| 6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 7 |
from sklearn.linear_model import LogisticRegression
|
| 8 |
+
from sklearn.calibration import CalibratedClassifierCV
|
| 9 |
from typing import Dict
|
| 10 |
import os
|
| 11 |
|
|
|
|
| 26 |
|
| 27 |
class SklearnClassifier:
|
| 28 |
"""
|
| 29 |
+
Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
|
| 30 |
+
now with probability calibration.
|
| 31 |
"""
|
| 32 |
|
| 33 |
# Make MODEL_PATH absolute relative to project root
|
|
|
|
| 40 |
else:
|
| 41 |
dataset_path = Path(dataset_path)
|
| 42 |
|
| 43 |
+
# Base logistic regression
|
| 44 |
+
base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
|
| 45 |
+
# Wrap with probability calibration
|
| 46 |
+
calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')
|
| 47 |
+
|
| 48 |
self.pipeline = Pipeline([
|
| 49 |
("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
|
| 50 |
+
("clf", calibrated_clf)
|
| 51 |
])
|
| 52 |
self.is_trained = False
|
| 53 |
|
|
|
|
| 79 |
if self.is_trained:
|
| 80 |
try:
|
| 81 |
label = self.pipeline.predict([text_clean])[0]
|
| 82 |
+
# calibrated probabilities
|
| 83 |
try:
|
| 84 |
confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
|
| 85 |
except Exception:
|
models/trained_pipeline.joblib
CHANGED
|
Binary files a/models/trained_pipeline.joblib and b/models/trained_pipeline.joblib differ
|
|
|