LeonardoMdSA commited on
Commit
fd7242c
·
1 Parent(s): b6e3994

Trained model

Browse files
README.md CHANGED
@@ -16,9 +16,16 @@ uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
16
  streamlit run ui/streamlit_app.py --server.port 8501 --server.address 127.0.0.1
17
 
18
  ### Tests
 
19
  pytest -v
 
20
  Or manual smoke test in test_backend.py
21
 
 
 
 
 
 
22
  ## Initial structure
23
 
24
  Context-aware NLP classification platform with MCP/
 
16
  streamlit run ui/streamlit_app.py --server.port 8501 --server.address 127.0.0.1
17
 
18
  ### Tests
19
+
20
  pytest -v
21
+
22
  Or manual smoke test in test_backend.py
23
 
24
+
25
+ ### Train model
26
+
27
+ python scripts/train_model.py
28
+
29
  ## Initial structure
30
 
31
  Context-aware NLP classification platform with MCP/
app/classification/model.py CHANGED
@@ -1,4 +1,5 @@
1
  from typing import Any, Dict, Optional
 
2
 
3
  from app.classification.sklearn_model import SklearnClassifier
4
  from app.classification.llm_adapter import LLMAdapter
@@ -10,22 +11,24 @@ settings = get_settings()
10
  class Classifier:
11
  """
12
  Abstract classifier. Can switch between:
13
- - Sklearn baseline
14
  - Optional LLM-assisted classification
15
  """
16
 
17
- def __init__(self):
18
- self.model = SklearnClassifier()
 
 
 
 
 
19
  self.llm = LLMAdapter() if settings.MCP_EMBEDDED else None
20
 
21
- def predict(
22
- self, text: str, context: Dict[str, Any]
23
- ) -> Dict[str, Any]:
24
  """
25
  Predict label using structured context.
26
  Returns dict: {label, confidence}
27
  """
28
-
29
  # Step 1: baseline model
30
  baseline_result = self.model.predict(text)
31
 
 
1
  from typing import Any, Dict, Optional
2
+ from pathlib import Path
3
 
4
  from app.classification.sklearn_model import SklearnClassifier
5
  from app.classification.llm_adapter import LLMAdapter
 
11
  class Classifier:
12
  """
13
  Abstract classifier. Can switch between:
14
+ - Sklearn baseline (trained from JSON dataset)
15
  - Optional LLM-assisted classification
16
  """
17
 
18
+ def __init__(self, dataset_path: Optional[str] = None):
19
+ # Use default training dataset if none provided
20
+ default_dataset = Path("data/samples/training_data.json")
21
+ if dataset_path is None and default_dataset.exists():
22
+ dataset_path = str(default_dataset)
23
+
24
+ self.model = SklearnClassifier(dataset_path=dataset_path)
25
  self.llm = LLMAdapter() if settings.MCP_EMBEDDED else None
26
 
27
+ def predict(self, text: str, context: Dict[str, Any]) -> Dict[str, Any]:
 
 
28
  """
29
  Predict label using structured context.
30
  Returns dict: {label, confidence}
31
  """
 
32
  # Step 1: baseline model
33
  baseline_result = self.model.predict(text)
34
 
app/classification/sklearn_model.py CHANGED
@@ -1,22 +1,89 @@
1
  from typing import Dict
2
- from app.classification.preprocess import clean_text
3
- import random
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  class SklearnClassifier:
6
  """
7
- Placeholder baseline classifier.
8
-
9
- - Replace with trained scikit-learn model or lightweight transformer.
10
- - Deterministic for testing / example purposes.
11
  """
12
 
13
- def __init__(self):
14
- # Load pre-trained model here in production
15
- self.labels = ["finance.invoice", "hr.policy", "legal.contract"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def predict(self, text: str) -> Dict[str, float]:
18
- text = clean_text(text)
19
- # deterministic mock confidence
20
- confidence = round(random.uniform(0.6, 0.95), 2)
21
- label = self.labels[hash(text) % len(self.labels)]
 
 
 
 
 
 
 
 
 
 
22
  return {"label": label, "confidence": confidence}
 
1
  from typing import Dict
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.pipeline import Pipeline
5
+ import re
6
+ import json
7
+ from pathlib import Path
8
+ import joblib # new import for saving/loading models
9
+
10
+ # Import from the module if already exists; else fallback to local definition
11
+ try:
12
+ from app.classification.preprocess import clean_text as external_clean_text
13
+ clean_text = external_clean_text
14
+ except ImportError:
15
+ # -------------------------
16
+ # Minimal preprocessing
17
+ # -------------------------
18
+ def clean_text(text: str) -> str:
19
+ # Lowercase, remove extra spaces, standardize numeric patterns
20
+ text = text.lower()
21
+ text = re.sub(r"\d+", "NUM", text) # Replace numbers with placeholder
22
+ text = re.sub(r"\s+", " ", text)
23
+ return text.strip()
24
+
25
 
26
  class SklearnClassifier:
27
  """
28
+ Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal.
29
+ Deterministic and trainable from JSON dataset.
 
 
30
  """
31
 
32
+ MODEL_PATH = Path(__file__).parent.parent / "models" / "trained_pipeline.joblib"
33
+
34
+ def __init__(self, dataset_path: str = "data/samples/training_data.json"):
35
+ """
36
+ dataset_path: optional path to JSON file with training data
37
+ format: [{"text": "...", "label": "finance.invoice"}, ...]
38
+ """
39
+ self.pipeline = Pipeline([
40
+ ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
41
+ ("clf", LogisticRegression(max_iter=500))
42
+ ])
43
+ self.is_trained = False
44
+
45
+ # -------------------------
46
+ # Load trained model if exists
47
+ # -------------------------
48
+ if self.MODEL_PATH.exists():
49
+ self.pipeline = joblib.load(self.MODEL_PATH)
50
+ self.is_trained = True
51
+ else:
52
+ file_path = Path(dataset_path)
53
+ if file_path.exists():
54
+ self.train_from_json(dataset_path)
55
+
56
+ def train_from_json(self, dataset_path: str):
57
+ file_path = Path(dataset_path)
58
+ if not file_path.exists():
59
+ raise ValueError(f"Dataset file not found: {dataset_path}")
60
+
61
+ data = json.loads(file_path.read_text(encoding="utf-8"))
62
+ texts = [clean_text(d["text"]) for d in data]
63
+ labels = [d["label"] for d in data]
64
+
65
+ self.pipeline.fit(texts, labels)
66
+ self.is_trained = True
67
+
68
+ # -------------------------
69
+ # Save trained pipeline
70
+ # -------------------------
71
+ self.MODEL_PATH.parent.mkdir(exist_ok=True)
72
+ joblib.dump(self.pipeline, self.MODEL_PATH)
73
 
74
  def predict(self, text: str) -> Dict[str, float]:
75
+ text_clean = clean_text(text)
76
+ if self.is_trained:
77
+ label = self.pipeline.predict([text_clean])[0]
78
+ confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
79
+ else:
80
+ # fallback if no training data provided
81
+ if "invoice" in text_clean or ("q" in text_clean and "num" in text_clean):
82
+ label = "finance.invoice"
83
+ elif "policy" in text_clean or "hr" in text_clean:
84
+ label = "hr.policy"
85
+ else:
86
+ label = "legal.contract"
87
+ confidence = 0.75
88
+
89
  return {"label": label, "confidence": confidence}
app/models/trained_pipeline.joblib ADDED
Binary file (5.91 kB). View file
 
data/samples/training_data.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "text": "Invoice for Q1 2025 total amount $15,200",
4
+ "label": "finance.invoice"
5
+ },
6
+ {
7
+ "text": "Invoice for Q2 2025 total amount $8,450",
8
+ "label": "finance.invoice"
9
+ },
10
+ {
11
+ "text": "Invoice for Q3 2025 total amount $23,923",
12
+ "label": "finance.invoice"
13
+ },
14
+ {
15
+ "text": "Invoice for Q4 2025 total amount $12,000",
16
+ "label": "finance.invoice"
17
+ },
18
+ {
19
+ "text": "HR policy update regarding employee leave",
20
+ "label": "hr.policy"
21
+ },
22
+ {
23
+ "text": "New guidelines for work-from-home policy",
24
+ "label": "hr.policy"
25
+ },
26
+ {
27
+ "text": "Mandatory compliance training policy for all staff",
28
+ "label": "hr.policy"
29
+ },
30
+ {
31
+ "text": "Contract agreement between Company A and Company B",
32
+ "label": "legal.contract"
33
+ },
34
+ {
35
+ "text": "Non-disclosure agreement for external partners",
36
+ "label": "legal.contract"
37
+ },
38
+ {
39
+ "text": "Service level agreement for client X",
40
+ "label": "legal.contract"
41
+ }
42
+ ]
models/trained_pipeline.joblib ADDED
Binary file (5.91 kB). View file
 
scripts/train_model.py CHANGED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #scripts\train_model.py
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ # Add project root to sys.path
6
+ sys.path.append(str(Path(__file__).resolve().parent.parent))
7
+
8
+ import joblib
9
+ from app.classification.sklearn_model import SklearnClassifier
10
+
11
+ # -------------------------
12
+ # Paths
13
+ # -------------------------
14
+ DATASET_PATH = Path(__file__).parent.parent / "data" / "samples" / "training_data.json"
15
+ MODEL_PATH = Path(__file__).parent.parent / "models" / "trained_pipeline.joblib"
16
+
17
+ # -------------------------
18
+ # Train classifier
19
+ # -------------------------
20
+ print(f"Loading training data from {DATASET_PATH}")
21
+ classifier = SklearnClassifier(dataset_path=str(DATASET_PATH))
22
+
23
+ # Save trained pipeline
24
+ joblib.dump(classifier.pipeline, MODEL_PATH)
25
+ print(f"Trained model saved to {MODEL_PATH}")