LeonardoMdSA commited on
Commit
fd7242c
·
1 Parent(s): b6e3994

Trained model

Browse files
README.md CHANGED
@@ -16,9 +16,16 @@ uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
16
  streamlit run ui/streamlit_app.py --server.port 8501 --server.address 127.0.0.1
17
 
18
  ### Tests
 
19
  pytest -v
 
20
  Or manual smoke test in test_backend.py
21
 
 
 
 
 
 
22
  ## Initial structure
23
 
24
  Context-aware NLP classification platform with MCP/
 
16
  streamlit run ui/streamlit_app.py --server.port 8501 --server.address 127.0.0.1
17
 
18
  ### Tests
19
+
20
  pytest -v
21
+
22
  Or manual smoke test in test_backend.py
23
 
24
+
25
+ ### Train model
26
+
27
+ python scripts/train_model.py
28
+
29
  ## Initial structure
30
 
31
  Context-aware NLP classification platform with MCP/
app/classification/model.py CHANGED
@@ -1,4 +1,5 @@
1
  from typing import Any, Dict, Optional
 
2
 
3
  from app.classification.sklearn_model import SklearnClassifier
4
  from app.classification.llm_adapter import LLMAdapter
@@ -10,22 +11,24 @@ settings = get_settings()
10
  class Classifier:
11
  """
12
  Abstract classifier. Can switch between:
13
- - Sklearn baseline
14
  - Optional LLM-assisted classification
15
  """
16
 
17
- def __init__(self):
18
- self.model = SklearnClassifier()
 
 
 
 
 
19
  self.llm = LLMAdapter() if settings.MCP_EMBEDDED else None
20
 
21
- def predict(
22
- self, text: str, context: Dict[str, Any]
23
- ) -> Dict[str, Any]:
24
  """
25
  Predict label using structured context.
26
  Returns dict: {label, confidence}
27
  """
28
-
29
  # Step 1: baseline model
30
  baseline_result = self.model.predict(text)
31
 
 
1
  from typing import Any, Dict, Optional
2
+ from pathlib import Path
3
 
4
  from app.classification.sklearn_model import SklearnClassifier
5
  from app.classification.llm_adapter import LLMAdapter
 
11
  class Classifier:
12
  """
13
  Abstract classifier. Can switch between:
14
+ - Sklearn baseline (trained from JSON dataset)
15
  - Optional LLM-assisted classification
16
  """
17
 
18
+ def __init__(self, dataset_path: Optional[str] = None):
19
+ # Use default training dataset if none provided
20
+ default_dataset = Path("data/samples/training_data.json")
21
+ if dataset_path is None and default_dataset.exists():
22
+ dataset_path = str(default_dataset)
23
+
24
+ self.model = SklearnClassifier(dataset_path=dataset_path)
25
  self.llm = LLMAdapter() if settings.MCP_EMBEDDED else None
26
 
27
+ def predict(self, text: str, context: Dict[str, Any]) -> Dict[str, Any]:
 
 
28
  """
29
  Predict label using structured context.
30
  Returns dict: {label, confidence}
31
  """
 
32
  # Step 1: baseline model
33
  baseline_result = self.model.predict(text)
34
 
app/classification/sklearn_model.py CHANGED
@@ -1,22 +1,89 @@
1
  from typing import Dict
2
- from app.classification.preprocess import clean_text
3
- import random
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  class SklearnClassifier:
6
  """
7
- Placeholder baseline classifier.
8
-
9
- - Replace with trained scikit-learn model or lightweight transformer.
10
- - Deterministic for testing / example purposes.
11
  """
12
 
13
- def __init__(self):
14
- # Load pre-trained model here in production
15
- self.labels = ["finance.invoice", "hr.policy", "legal.contract"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def predict(self, text: str) -> Dict[str, float]:
18
- text = clean_text(text)
19
- # deterministic mock confidence
20
- confidence = round(random.uniform(0.6, 0.95), 2)
21
- label = self.labels[hash(text) % len(self.labels)]
 
 
 
 
 
 
 
 
 
 
22
  return {"label": label, "confidence": confidence}
 
1
  from typing import Dict
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.pipeline import Pipeline
5
+ import re
6
+ import json
7
+ from pathlib import Path
8
+ import joblib # new import for saving/loading models
9
+
10
+ # Import from the module if already exists; else fallback to local definition
11
+ try:
12
+ from app.classification.preprocess import clean_text as external_clean_text
13
+ clean_text = external_clean_text
14
+ except ImportError:
15
+ # -------------------------
16
+ # Minimal preprocessing
17
+ # -------------------------
18
+ def clean_text(text: str) -> str:
19
+ # Lowercase, remove extra spaces, standardize numeric patterns
20
+ text = text.lower()
21
+ text = re.sub(r"\d+", "NUM", text) # Replace numbers with placeholder
22
+ text = re.sub(r"\s+", " ", text)
23
+ return text.strip()
24
+
25
 
26
  class SklearnClassifier:
27
  """
28
+ Lightweight TF-IDF + Logistic Regression classifier for finance/hr/legal.
29
+ Deterministic and trainable from JSON dataset.
 
 
30
  """
31
 
32
+ MODEL_PATH = Path(__file__).parent.parent / "models" / "trained_pipeline.joblib"
33
+
34
+ def __init__(self, dataset_path: str = "data/samples/training_data.json"):
35
+ """
36
+ dataset_path: optional path to JSON file with training data
37
+ format: [{"text": "...", "label": "finance.invoice"}, ...]
38
+ """
39
+ self.pipeline = Pipeline([
40
+ ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
41
+ ("clf", LogisticRegression(max_iter=500))
42
+ ])
43
+ self.is_trained = False
44
+
45
+ # -------------------------
46
+ # Load trained model if exists
47
+ # -------------------------
48
+ if self.MODEL_PATH.exists():
49
+ self.pipeline = joblib.load(self.MODEL_PATH)
50
+ self.is_trained = True
51
+ else:
52
+ file_path = Path(dataset_path)
53
+ if file_path.exists():
54
+ self.train_from_json(dataset_path)
55
+
56
+ def train_from_json(self, dataset_path: str):
57
+ file_path = Path(dataset_path)
58
+ if not file_path.exists():
59
+ raise ValueError(f"Dataset file not found: {dataset_path}")
60
+
61
+ data = json.loads(file_path.read_text(encoding="utf-8"))
62
+ texts = [clean_text(d["text"]) for d in data]
63
+ labels = [d["label"] for d in data]
64
+
65
+ self.pipeline.fit(texts, labels)
66
+ self.is_trained = True
67
+
68
+ # -------------------------
69
+ # Save trained pipeline
70
+ # -------------------------
71
+ self.MODEL_PATH.parent.mkdir(exist_ok=True)
72
+ joblib.dump(self.pipeline, self.MODEL_PATH)
73
 
74
  def predict(self, text: str) -> Dict[str, float]:
75
+ text_clean = clean_text(text)
76
+ if self.is_trained:
77
+ label = self.pipeline.predict([text_clean])[0]
78
+ confidence = float(max(self.pipeline.predict_proba([text_clean])[0]))
79
+ else:
80
+ # fallback if no training data provided
81
+ if "invoice" in text_clean or ("q" in text_clean and "num" in text_clean):
82
+ label = "finance.invoice"
83
+ elif "policy" in text_clean or "hr" in text_clean:
84
+ label = "hr.policy"
85
+ else:
86
+ label = "legal.contract"
87
+ confidence = 0.75
88
+
89
  return {"label": label, "confidence": confidence}
app/models/trained_pipeline.joblib ADDED
Binary file (5.91 kB). View file
 
data/samples/training_data.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "text": "Invoice for Q1 2025 total amount $15,200",
4
+ "label": "finance.invoice"
5
+ },
6
+ {
7
+ "text": "Invoice for Q2 2025 total amount $8,450",
8
+ "label": "finance.invoice"
9
+ },
10
+ {
11
+ "text": "Invoice for Q3 2025 total amount $23,923",
12
+ "label": "finance.invoice"
13
+ },
14
+ {
15
+ "text": "Invoice for Q4 2025 total amount $12,000",
16
+ "label": "finance.invoice"
17
+ },
18
+ {
19
+ "text": "HR policy update regarding employee leave",
20
+ "label": "hr.policy"
21
+ },
22
+ {
23
+ "text": "New guidelines for work-from-home policy",
24
+ "label": "hr.policy"
25
+ },
26
+ {
27
+ "text": "Mandatory compliance training policy for all staff",
28
+ "label": "hr.policy"
29
+ },
30
+ {
31
+ "text": "Contract agreement between Company A and Company B",
32
+ "label": "legal.contract"
33
+ },
34
+ {
35
+ "text": "Non-disclosure agreement for external partners",
36
+ "label": "legal.contract"
37
+ },
38
+ {
39
+ "text": "Service level agreement for client X",
40
+ "label": "legal.contract"
41
+ }
42
+ ]
models/trained_pipeline.joblib ADDED
Binary file (5.91 kB). View file
 
scripts/train_model.py CHANGED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #scripts\train_model.py
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ # Add project root to sys.path
6
+ sys.path.append(str(Path(__file__).resolve().parent.parent))
7
+
8
+ import joblib
9
+ from app.classification.sklearn_model import SklearnClassifier
10
+
11
+ # -------------------------
12
+ # Paths
13
+ # -------------------------
14
+ DATASET_PATH = Path(__file__).parent.parent / "data" / "samples" / "training_data.json"
15
+ MODEL_PATH = Path(__file__).parent.parent / "models" / "trained_pipeline.joblib"
16
+
17
+ # -------------------------
18
+ # Train classifier
19
+ # -------------------------
20
+ print(f"Loading training data from {DATASET_PATH}")
21
+ classifier = SklearnClassifier(dataset_path=str(DATASET_PATH))
22
+
23
+ # Save trained pipeline
24
+ joblib.dump(classifier.pipeline, MODEL_PATH)
25
+ print(f"Trained model saved to {MODEL_PATH}")