LeonardoMdSA committed on
Commit
b45fa4d
·
1 Parent(s): e3754ce

changed sklearn model and added data

Browse files
app/classification/llm_adapter.py CHANGED
@@ -18,5 +18,5 @@ class LLMAdapter:
18
  result["label"] = "finance.invoice"
19
  # optionally adjust confidence
20
  if context and context.get("policies_applied"):
21
- result["confidence"] = min(result["confidence"] + 0.1, 0.99)
22
  return result
 
18
  result["label"] = "finance.invoice"
19
  # optionally adjust confidence
20
  if context and context.get("policies_applied"):
21
+ result["confidence"] = min(result["confidence"] + 0.05, 0.99)
22
  return result
app/classification/sklearn_model.py CHANGED
@@ -5,6 +5,7 @@ import re
5
  from sklearn.pipeline import Pipeline
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.linear_model import LogisticRegression
 
8
  from typing import Dict
9
  import os
10
 
@@ -25,7 +26,8 @@ except ImportError:
25
 
26
  class SklearnClassifier:
27
  """
28
- Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal.
 
29
  """
30
 
31
  # Make MODEL_PATH absolute relative to project root
@@ -38,9 +40,14 @@ class SklearnClassifier:
38
  else:
39
  dataset_path = Path(dataset_path)
40
 
 
 
 
 
 
41
  self.pipeline = Pipeline([
42
  ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
43
- ("clf", LogisticRegression(max_iter=500, class_weight='balanced', C=1.0))
44
  ])
45
  self.is_trained = False
46
 
@@ -72,7 +79,7 @@ class SklearnClassifier:
72
  if self.is_trained:
73
  try:
74
  label = self.pipeline.predict([text_clean])[0]
75
- # fallback if pipeline has no predict_proba (safety)
76
  try:
77
  confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
78
  except Exception:
 
5
  from sklearn.pipeline import Pipeline
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sklearn.linear_model import LogisticRegression
8
+ from sklearn.calibration import CalibratedClassifierCV
9
  from typing import Dict
10
  import os
11
 
 
26
 
27
  class SklearnClassifier:
28
  """
29
+ Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
30
+ now with probability calibration.
31
  """
32
 
33
  # Make MODEL_PATH absolute relative to project root
 
40
  else:
41
  dataset_path = Path(dataset_path)
42
 
43
+ # Base logistic regression
44
+ base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
45
+ # Wrap with probability calibration
46
+ calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')
47
+
48
  self.pipeline = Pipeline([
49
  ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
50
+ ("clf", calibrated_clf)
51
  ])
52
  self.is_trained = False
53
 
 
79
  if self.is_trained:
80
  try:
81
  label = self.pipeline.predict([text_clean])[0]
82
+ # calibrated probabilities
83
  try:
84
  confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
85
  except Exception:
models/trained_pipeline.joblib CHANGED
Binary files a/models/trained_pipeline.joblib and b/models/trained_pipeline.joblib differ