| | from pathlib import Path |
| | import joblib |
| | import json |
| | import re |
| | from sklearn.pipeline import Pipeline |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.linear_model import LogisticRegression |
| | from sklearn.calibration import CalibratedClassifierCV |
| | from typing import Dict |
| | import os |
| |
|
| | |
| | |
| | |
try:
    # Use the project's preprocessing module when it is importable.
    from app.classification.preprocess import clean_text as external_clean_text
    clean_text = external_clean_text
except ImportError:
    def clean_text(text: str) -> str:
        """Normalise raw text when the project preprocessor cannot be imported.

        Lower-cases the text, replaces every run of digits with the
        placeholder ``NUM``, removes ASCII control characters, and collapses
        whitespace runs into single spaces.

        Args:
            text: Raw input text.

        Returns:
            The normalised text without leading or trailing whitespace.
        """
        text = text.lower()
        text = re.sub(r"\d+", "NUM", text)
        # Collapse whitespace first so tabs/newlines survive as separators
        # instead of being deleted by the control-character strip below.
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[\x00-\x1f]+", "", text)
        # Deleting a control character that sat between two spaces leaves a
        # double space behind; collapse once more so output is single-spaced.
        text = re.sub(r" {2,}", " ", text)
        return text.strip()
| |
|
| |
|
class SklearnClassifier:
    """
    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
    with probability calibration.

    On construction the classifier loads a saved pipeline from ``MODEL_PATH``
    when that file exists, otherwise trains from a JSON dataset when one is
    found, and otherwise stays untrained and answers with keyword heuristics.

    Attributes:
        pipeline: The scikit-learn pipeline (TF-IDF -> calibrated classifier).
        is_trained: True once ``pipeline`` has been loaded or fitted.
    """

    # Two levels above the directory that contains this file.
    PROJECT_ROOT = Path(__file__).resolve().parents[2]
    MODEL_PATH = PROJECT_ROOT / "models" / "trained_pipeline.joblib"

    # Heuristic patterns, matched against lower-cased text.
    # Quarter marker such as "q3" (or "qnum" once digits were replaced).
    _QUARTER_RE = re.compile(r"\bq\s?(?:num|\d+)\b")
    # "hr" as a whole word, so "three" or "through" do not count.
    _HR_RE = re.compile(r"\bhr\b")

    def __init__(self, dataset_path: str = None):
        """Load a saved model, or train one, or fall back to heuristics.

        Args:
            dataset_path: Location of the JSON training data. Defaults to
                ``PROJECT_ROOT/data/samples/training_data.json``. Only used
                when no saved model exists at ``MODEL_PATH``.
        """
        if dataset_path is None:
            dataset_path = self.PROJECT_ROOT / "data" / "samples" / "training_data.json"
        else:
            dataset_path = Path(dataset_path)

        self.pipeline = self._build_pipeline()
        self.is_trained = False

        if self.MODEL_PATH.exists():
            # NOTE: joblib.load unpickles the file and can therefore execute
            # arbitrary code; MODEL_PATH must only ever hold trusted models.
            self.pipeline = joblib.load(self.MODEL_PATH)
            self.is_trained = True
        elif dataset_path.exists():
            self.train_from_json(dataset_path)
        else:
            print("[Warning] No trained model or dataset found. Using fallback logic.")

    @staticmethod
    def _build_pipeline():
        """Return a fresh, unfitted TF-IDF + calibrated logistic-regression pipeline."""
        base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
        calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')
        return Pipeline([
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
            ("clf", calibrated_clf),
        ])

    def train_from_json(self, dataset_path: Path):
        """Fit the pipeline on a JSON dataset and persist it to ``MODEL_PATH``.

        Args:
            dataset_path: Path (``Path`` or ``str``) to a UTF-8 JSON file
                holding a sequence of objects with ``"text"`` and ``"label"``
                keys.

        Raises:
            ValueError: If the dataset contains no samples.
        """
        # Accept plain strings as well as Path objects.
        dataset_path = Path(dataset_path)
        data = json.loads(dataset_path.read_text(encoding="utf-8"))
        if not data:
            raise ValueError(f"Training dataset is empty: {dataset_path}")

        texts = [clean_text(d["text"]) for d in data]
        labels = [d["label"] for d in data]

        self.pipeline.fit(texts, labels)
        self.is_trained = True

        # Save the fitted pipeline so later instances can load it directly.
        self.MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
        joblib.dump(self.pipeline, self.MODEL_PATH)

    @classmethod
    def _heuristic_label(cls, text_clean: str) -> str:
        """Pick a label from keywords when no trained model is available.

        The text is lower-cased before matching so the result does not depend
        on the casing the cleaner produces (the fallback cleaner emits an
        upper-case ``NUM`` placeholder).

        Args:
            text_clean: Already cleaned input text.

        Returns:
            ``"finance.invoice"``, ``"hr.policy"`` or ``"legal.contract"``.
        """
        lowered = text_clean.lower()
        if "invoice" in lowered or cls._QUARTER_RE.search(lowered):
            return "finance.invoice"
        if "policy" in lowered or cls._HR_RE.search(lowered):
            return "hr.policy"
        return "legal.contract"

    def predict(self, text: str) -> Dict[str, object]:
        """Classify ``text``.

        Args:
            text: Raw input text; it is cleaned with ``clean_text`` first.

        Returns:
            A dict with ``"label"`` (the predicted class) and ``"confidence"``
            (a float). Untrained instances use keyword heuristics with
            confidence 0.3. If the model's prediction fails, the label is
            ``"unknown"`` with confidence 0.3.
        """
        text_clean = clean_text(text)

        if not self.is_trained:
            return {"label": self._heuristic_label(text_clean), "confidence": 0.3}

        try:
            label = self.pipeline.predict([text_clean])[0]
        except Exception as e:
            # Best effort: report the failure and return a sentinel result
            # instead of propagating the error to the caller.
            print("[Error] Sklearn prediction failed:", e)
            return {"label": "unknown", "confidence": 0.3}

        try:
            confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
        except Exception:
            # The loaded pipeline could not supply probabilities; keep the
            # predicted label and use a fixed placeholder confidence.
            confidence = 0.8

        return {"label": label, "confidence": confidence}
| |
|
| |
|
| | |
| | |
| | |
if __name__ == "__main__":
    # Manual smoke test: build a classifier and print its verdict for a few
    # representative documents.
    classifier = SklearnClassifier()
    print("Is trained?", classifier.is_trained)

    demo_texts = (
        "Invoice for Q3 2025 amount 23923 $",
        "HR policy update for employees",
        "Signed legal contract for vendor",
    )
    for demo_text in demo_texts:
        prediction = classifier.predict(demo_text)
        print(demo_text, "->", prediction)
| |
|