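# NOTE(review): the three lines below appear to be repository-page residue
# (avatar caption, commit message, short commit hash); kept as comments so the
# module parses as Python.
# LeonardoMdSA's picture
# changed sklearn model and added data
# b45fa4d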
from pathlib import Path
import joblib
import json
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from typing import Dict
import os
# -------------------------
# Preprocessing
# -------------------------
try:
    # Prefer the project's shared preprocessing when the package is importable.
    from app.classification.preprocess import clean_text as external_clean_text

    clean_text = external_clean_text
except ImportError:

    def clean_text(text: str) -> str:
        """Normalize raw text before vectorization (local fallback).

        Steps, in order:

        1. Lowercase the text.
        2. Replace every run of digits with the placeholder ``NUM``. The
           placeholder stays uppercase because lowercasing happens first.
        3. Collapse every run of whitespace (including tabs and newlines)
           into a single space.
        4. Delete remaining control characters (``\\x00``-``\\x1f``).
        5. Collapse spaces again, then strip leading/trailing whitespace.

        Args:
            text: Raw input string.

        Returns:
            The normalized string.
        """
        text = text.lower()
        text = re.sub(r"\d+", "NUM", text)
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[\x00-\x1f]+", "", text)
        # Deleting a control character that sat between two spaces
        # ("a \x00 b") leaves a double space behind; collapse once more so the
        # output never contains consecutive spaces.
        text = re.sub(r" {2,}", " ", text)
        return text.strip()
class SklearnClassifier:
    """
    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
    now with probability calibration.

    On construction the classifier loads a persisted pipeline from
    ``MODEL_PATH`` when that file exists; otherwise it trains from a JSON
    dataset when one is found; otherwise it stays untrained and ``predict``
    falls back to a keyword heuristic.
    """

    # Make MODEL_PATH absolute relative to project root
    PROJECT_ROOT = Path(__file__).resolve().parents[2]
    MODEL_PATH = PROJECT_ROOT / "models" / "trained_pipeline.joblib"

    # Patterns used only by the untrained keyword fallback. Both are applied
    # to lowercased text and use word boundaries so that substrings of
    # unrelated words (e.g. the "hr" inside "three") do not match.
    _QUARTER_RE = re.compile(r"\bq\s?num\b")
    _HR_RE = re.compile(r"\bhr\b")

    def __init__(self, dataset_path: str = None):
        """Build the pipeline, then load or train a model if possible.

        Args:
            dataset_path: Optional path to a JSON training file. When omitted,
                ``PROJECT_ROOT / "data" / "samples" / "training_data.json"``
                is used.
        """
        if dataset_path is None:
            dataset_path = self.PROJECT_ROOT / "data" / "samples" / "training_data.json"
        else:
            dataset_path = Path(dataset_path)

        # Base logistic regression
        base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)

        # Wrap with probability calibration
        calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
            ("clf", calibrated_clf),
        ])
        self.is_trained = False

        # -------------------------
        # Load trained model if exists
        # -------------------------
        if self.MODEL_PATH.exists():
            # NOTE(security): joblib.load unpickles the file, which can execute
            # arbitrary code. Only load model files from a trusted location.
            self.pipeline = joblib.load(self.MODEL_PATH)
            self.is_trained = True
        elif dataset_path.exists():
            self.train_from_json(dataset_path)
        else:
            print("[Warning] No trained model or dataset found. Using fallback logic.")

    def train_from_json(self, dataset_path: Path):
        """Fit the pipeline on a JSON dataset and persist it to ``MODEL_PATH``.

        The file is read as UTF-8 JSON and iterated as a sequence of records,
        each providing a ``"text"`` and a ``"label"`` key.

        Args:
            dataset_path: Location of the JSON file (``Path`` or ``str``).
        """
        # Accept plain strings as well, matching what __init__ allows.
        dataset_path = Path(dataset_path)
        data = json.loads(dataset_path.read_text(encoding="utf-8"))
        texts = [clean_text(d["text"]) for d in data]
        labels = [d["label"] for d in data]

        self.pipeline.fit(texts, labels)
        self.is_trained = True

        # Save model
        self.MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
        joblib.dump(self.pipeline, self.MODEL_PATH)

    @staticmethod
    def _fallback_label(text_clean: str) -> str:
        """Return a keyword-based label for already-cleaned text.

        Matching is done on a lowercased copy so the uppercase ``NUM``
        placeholder produced by the local ``clean_text`` fallback is
        recognised.
        """
        lowered = text_clean.lower()
        # "q" directly followed by the numeric placeholder, e.g. "Q3" -> "qNUM".
        if "invoice" in lowered or SklearnClassifier._QUARTER_RE.search(lowered):
            return "finance.invoice"
        if "policy" in lowered or SklearnClassifier._HR_RE.search(lowered):
            return "hr.policy"
        return "legal.contract"

    def predict(self, text: str) -> Dict[str, float]:
        """Classify ``text`` and report a confidence score.

        Args:
            text: Raw input text; it is passed through ``clean_text`` first.

        Returns:
            A dict with two keys: ``"label"`` (the predicted class, or
            ``"unknown"`` when the model raises) and ``"confidence"`` (a
            float). Note that ``"label"`` is not a float despite the return
            annotation. Confidence is the highest class probability when the
            model provides one, ``0.8`` when probabilities are unavailable,
            and ``0.3`` for the keyword fallback or a failed prediction.
        """
        text_clean = clean_text(text)

        if not self.is_trained:
            # fallback heuristic
            return {"label": self._fallback_label(text_clean), "confidence": 0.3}

        try:
            label = self.pipeline.predict([text_clean])[0]
        except Exception as e:
            # Best-effort API: report the failure and degrade to "unknown".
            print("[Error] Sklearn prediction failed:", e)
            return {"label": "unknown", "confidence": 0.3}

        # calibrated probabilities
        try:
            confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
        except Exception:
            # A loaded pipeline may not expose usable probabilities; keep the
            # predicted label and use a fixed confidence instead.
            confidence = 0.8

        return {"label": label, "confidence": confidence}
# -------------------------
# Quick sanity check when run directly
# -------------------------
if __name__ == "__main__":
    # Instantiate with the default dataset location and report whether a
    # model was loaded or trained.
    classifier = SklearnClassifier()
    print("Is trained?", classifier.is_trained)

    # One example sentence per document family.
    demo_inputs = (
        "Invoice for Q3 2025 amount 23923 $",
        "HR policy update for employees",
        "Signed legal contract for vendor",
    )
    for sentence in demo_inputs:
        outcome = classifier.predict(sentence)
        print(sentence, "->", outcome)