from pathlib import Path
import joblib
import json
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from typing import Dict
import os

# -------------------------
# Preprocessing
# -------------------------
try:
    # Prefer the project's shared preprocessing when the package is importable.
    from app.classification.preprocess import clean_text as external_clean_text
    clean_text = external_clean_text
except ImportError:
    def clean_text(text: str) -> str:
        """Normalize raw text for the classifier (local fallback).

        Steps: lowercase, replace every run of digits with the placeholder
        ``NUM``, delete non-whitespace control characters, collapse every
        whitespace run to one space, and trim both ends.

        Args:
            text: Raw input string.

        Returns:
            The normalized string. Note that the ``NUM`` placeholder is
            inserted after lowercasing, so it stays upper-case.
        """
        text = text.lower()
        text = re.sub(r"\d+", "NUM", text)
        # Delete control characters that are NOT whitespace before collapsing
        # whitespace. Doing it afterwards would rejoin the spaces on either
        # side of a removed character ("a \x00 b" -> "a  b"). The whitespace
        # controls (\t \n \v \f \r and \x1c-\x1f) are left for the \s+ pass,
        # which turns them into a single space.
        text = re.sub(r"[\x00-\x08\x0e-\x1b]+", "", text)
        text = re.sub(r"\s+", " ", text)
        return text.strip()


class SklearnClassifier:
    """
    Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal,
    with sigmoid probability calibration.

    On construction the instance loads a previously saved pipeline from
    ``MODEL_PATH`` if one exists; otherwise it trains from a JSON dataset if
    that exists; otherwise it stays untrained and ``predict`` answers with a
    keyword heuristic.
    """

    # Make MODEL_PATH absolute relative to project root
    PROJECT_ROOT = Path(__file__).resolve().parents[2]
    MODEL_PATH = PROJECT_ROOT / "models" / "trained_pipeline.joblib"

    # Patterns for the untrained keyword heuristic. They are applied to
    # lower-cased text so they work whether or not the preprocessing step
    # emitted an upper-case "NUM" placeholder for digits.
    _QUARTER_RE = re.compile(r"\bq\s?(?:num|\d+)\b")  # "q3", "qnum", "q num"
    _HR_RE = re.compile(r"\bhr\b")  # whole word only, not "three"/"through"

    def __init__(self, dataset_path: str = None):
        """Build the pipeline, then load a saved model or train a new one.

        Args:
            dataset_path: Path (``str`` or ``Path``) to a JSON training file.
                When ``None``, ``PROJECT_ROOT/data/samples/training_data.json``
                is used.
        """
        if dataset_path is None:
            dataset_path = self.PROJECT_ROOT / "data" / "samples" / "training_data.json"
        else:
            dataset_path = Path(dataset_path)

        self.pipeline = self._build_pipeline()
        self.is_trained = False

        # A saved model takes precedence over retraining.
        if self.MODEL_PATH.exists() and self._load_saved_model():
            return

        # No usable saved model: train if data is available, else stay
        # untrained so predict() uses the keyword heuristic.
        if dataset_path.exists():
            self.train_from_json(dataset_path)
        else:
            print("[Warning] No trained model or dataset found. Using fallback logic.")

    @staticmethod
    def _build_pipeline():
        """Return a fresh, unfitted TF-IDF + calibrated logistic regression pipeline."""
        # Base logistic regression
        base_clf = LogisticRegression(max_iter=500, class_weight='balanced', C=1.0)
        # Wrap with probability calibration
        calibrated_clf = CalibratedClassifierCV(base_clf, cv=3, method='sigmoid')
        return Pipeline([
            ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
            ("clf", calibrated_clf)
        ])

    def _load_saved_model(self) -> bool:
        """Try to load the pipeline stored at ``MODEL_PATH``.

        Returns:
            ``True`` if the pipeline was loaded (``is_trained`` is then set),
            ``False`` if loading failed for any reason.
        """
        try:
            # NOTE(security): joblib.load unpickles the file and can execute
            # arbitrary code; MODEL_PATH must only ever hold trusted artifacts.
            loaded = joblib.load(self.MODEL_PATH)
        except Exception as e:
            # A corrupt or version-incompatible artifact should not make the
            # classifier unusable; report it and let the caller fall back.
            print("[Warning] Could not load saved model:", e)
            return False
        self.pipeline = loaded
        self.is_trained = True
        return True

    def train_from_json(self, dataset_path: Path):
        """Fit the pipeline on a JSON dataset and save it to ``MODEL_PATH``.

        Args:
            dataset_path: Path (``Path`` or ``str``) to a UTF-8 JSON file
                holding a list of objects with ``"text"`` and ``"label"`` keys.

        Raises:
            ValueError: If the dataset contains no samples. Errors raised by
                file reading, JSON parsing, fitting or saving propagate
                unchanged.
        """
        dataset_path = Path(dataset_path)  # accept plain strings as well
        data = json.loads(dataset_path.read_text(encoding="utf-8"))
        if not data:
            raise ValueError(f"Training dataset {dataset_path} contains no samples")

        texts = [clean_text(d["text"]) for d in data]
        labels = [d["label"] for d in data]

        self.pipeline.fit(texts, labels)
        self.is_trained = True

        # Save model
        self.MODEL_PATH.parent.mkdir(exist_ok=True, parents=True)
        joblib.dump(self.pipeline, self.MODEL_PATH)

    @classmethod
    def _fallback_label(cls, text_clean: str) -> str:
        """Pick a label from keywords when no trained model is available.

        Matching is case-insensitive. "hr" and the quarter reference must be
        whole tokens so that words merely containing those letters do not
        trigger a match.
        """
        lowered = text_clean.lower()
        if "invoice" in lowered or cls._QUARTER_RE.search(lowered):
            return "finance.invoice"
        if "policy" in lowered or cls._HR_RE.search(lowered):
            return "hr.policy"
        return "legal.contract"

    def predict(self, text: str) -> Dict[str, float]:
        """Classify a single text.

        Args:
            text: Raw input text; it is passed through ``clean_text`` first.

        Returns:
            A dict with ``"label"`` (the predicted class, a string) and
            ``"confidence"`` (a float). Confidence is the highest predicted
            class probability when the pipeline provides probabilities, 0.8
            when it does not, and 0.3 for the keyword heuristic or when
            prediction fails (the label is then ``"unknown"``).
        """
        text_clean = clean_text(text)

        if not self.is_trained:
            # fallback heuristic
            return {"label": self._fallback_label(text_clean), "confidence": 0.3}

        try:
            label = self.pipeline.predict([text_clean])[0]
        except Exception as e:
            print("[Error] Sklearn prediction failed:", e)
            return {"label": "unknown", "confidence": 0.3}

        try:
            # calibrated probabilities
            confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
        except Exception:
            # Deliberate best effort: a loaded pipeline may not expose usable
            # probabilities; keep the label and report a fixed confidence.
            confidence = 0.8

        return {"label": label, "confidence": confidence}


# -------------------------
# Quick sanity check when run directly
# -------------------------
if __name__ == "__main__":
    # Smoke test: build a classifier (loads or trains a model when possible)
    # and print its prediction for one sample per category.
    classifier = SklearnClassifier()
    print("Is trained?", classifier.is_trained)
    for sample_text in (
        "Invoice for Q3 2025 amount 23923 $",
        "HR policy update for employees",
        "Signed legal contract for vendor",
    ):
        print(sample_text, "->", classifier.predict(sample_text))