LeonardoMdSA committed
Commit 62a3be1 · 1 Parent(s): bb37020

add working scripts

README.md CHANGED
@@ -22,10 +22,14 @@ pytest -v
 Or manual smoke test in test_backend.py


-### Train model
+### Train-evaluate model
+
+python scripts/seed_data.py

 python scripts/train_model.py

+python scripts/evaluate.py
+
 ## Initial structure

 Context-aware NLP classification platform with MCP/
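The three commands are meant to run in this order, since seed_data.py writes the train/eval splits before training and evaluation happen. A minimal sketch of a one-shot runner for the whole loop (the run_all.py name is hypothetical; it assumes the three scripts live under scripts/ and exit nonzero on failure):

# run_all.py - hypothetical helper; runs the seed/train/evaluate steps
# in the order the README lists them and stops on the first failure.
import subprocess
import sys

for script in ("scripts/seed_data.py", "scripts/train_model.py", "scripts/evaluate.py"):
    subprocess.run([sys.executable, script], check=True)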
data/samples/eval.json ADDED
@@ -0,0 +1,18 @@
+[
+  {
+    "text": "Invoice for Q1 2025 total amount $15,200",
+    "label": "finance.invoice"
+  },
+  {
+    "text": "HR policy update regarding employee leave",
+    "label": "hr.policy"
+  },
+  {
+    "text": "Contract agreement between Company A and Company B",
+    "label": "legal.contract"
+  },
+  {
+    "text": "Invoice for Q4 2025 total amount $12,000",
+    "label": "finance.invoice"
+  }
+]
data/samples/train.json ADDED
@@ -0,0 +1,26 @@
+[
+  {
+    "text": "Mandatory compliance training policy for all staff",
+    "label": "hr.policy"
+  },
+  {
+    "text": "Invoice for Q2 2025 total amount $8,450",
+    "label": "finance.invoice"
+  },
+  {
+    "text": "New guidelines for work-from-home policy",
+    "label": "hr.policy"
+  },
+  {
+    "text": "Invoice for Q3 2025 total amount $23,923",
+    "label": "finance.invoice"
+  },
+  {
+    "text": "Non-disclosure agreement for external partners",
+    "label": "legal.contract"
+  },
+  {
+    "text": "Service level agreement for client X",
+    "label": "legal.contract"
+  }
+]
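Both sample files share one record shape, which is also what the scripts below validate. A typed sketch of that schema (the Sample name is hypothetical; the repo itself stores plain dicts):

from typing import TypedDict

class Sample(TypedDict):
    text: str   # raw document text
    label: str  # dotted category, e.g. "finance.invoice", "hr.policy", "legal.contract"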
scripts/evaluate.py CHANGED
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+import argparse
+import json
+from pathlib import Path
+
+import joblib
+from sklearn.metrics import (
+    accuracy_score,
+    precision_recall_fscore_support,
+    classification_report,
+)
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+MODELS_DIR = BASE_DIR / "models"
+DATA_DIR = BASE_DIR / "data"
+
+
+def load_model():
+    model_path = MODELS_DIR / "trained_pipeline.joblib"
+    if not model_path.exists():
+        raise FileNotFoundError(f"Model not found: {model_path}")
+    return joblib.load(model_path)
+
+
+def load_dataset(dataset_path: Path):
+    if not dataset_path.exists():
+        raise FileNotFoundError(f"Dataset not found: {dataset_path}")
+
+    # Hard guard: never evaluate on training data
+    if dataset_path.name in {"training_data.json", "train.json"}:
+        raise RuntimeError(
+            f"Refusing to evaluate on training dataset: {dataset_path.name}"
+        )
+
+    with dataset_path.open("r", encoding="utf-8") as f:
+        raw = json.load(f)
+
+    if isinstance(raw, list):
+        samples = raw
+    elif isinstance(raw, dict) and "samples" in raw:
+        samples = raw["samples"]
+    else:
+        raise ValueError("Unsupported JSON dataset format")
+
+    texts = []
+    labels = []
+
+    for i, item in enumerate(samples):
+        if "text" not in item or "label" not in item:
+            raise ValueError(f"Invalid sample at index {i}: {item}")
+        texts.append(item["text"])
+        labels.append(item["label"])
+
+    return texts, labels
+
+
+def evaluate(model, X, y):
+    y_pred = model.predict(X)
+
+    acc = accuracy_score(y, y_pred)
+    precision, recall, f1, _ = precision_recall_fscore_support(
+        y, y_pred, average="weighted", zero_division=0
+    )
+
+    print("====================================")
+    print("Offline Evaluation Results")
+    print("====================================")
+    print(f"Samples  : {len(y)}")
+    print(f"Accuracy : {acc:.4f}")
+    print(f"Precision: {precision:.4f}")
+    print(f"Recall   : {recall:.4f}")
+    print(f"F1-score : {f1:.4f}")
+    print()
+    print("Detailed Classification Report")
+    print("------------------------------------")
+    print(classification_report(y, y_pred, zero_division=0))
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Offline evaluation using held-out JSON dataset"
+    )
+    parser.add_argument(
+        "--data",
+        default=str(DATA_DIR / "samples" / "eval.json"),
+        help="Path to evaluation dataset (default: data/samples/eval.json)",
+    )
+
+    args = parser.parse_args()
+
+    model = load_model()
+    X, y = load_dataset(Path(args.data))
+    evaluate(model, X, y)
+
+
+if __name__ == "__main__":
+    main()
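The script's helpers can also be reused programmatically when the metrics are needed outside the CLI. A minimal sketch, assuming it runs from the repo root and a pipeline was already trained into models/trained_pipeline.joblib (scripts/ is not a package, so it is added to the import path first):

import sys
from pathlib import Path

sys.path.insert(0, "scripts")  # make scripts/evaluate.py importable
import evaluate  # noqa: E402

model = evaluate.load_model()
X, y = evaluate.load_dataset(Path("data/samples/eval.json"))
evaluate.evaluate(model, X, y)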
scripts/seed_data.py CHANGED
@@ -0,0 +1,114 @@
+"""
+Seed and split dataset for training and evaluation.
+
+- Reads: data/samples/training_data.json
+- Writes:
+  - data/samples/train.json
+  - data/samples/eval.json
+
+This script enforces:
+- Stratified split by label
+- Deterministic output (fixed random seed)
+- Basic data validation
+"""
+
+import json
+import random
+from pathlib import Path
+from collections import defaultdict
+
+# -------------------------
+# Configuration
+# -------------------------
+RANDOM_SEED = 42
+TRAIN_RATIO = 0.7
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+SAMPLES_DIR = BASE_DIR / "data" / "samples"
+
+SOURCE_FILE = SAMPLES_DIR / "training_data.json"
+TRAIN_FILE = SAMPLES_DIR / "train.json"
+EVAL_FILE = SAMPLES_DIR / "eval.json"
+
+
+def main():
+    if not SOURCE_FILE.exists():
+        raise FileNotFoundError(f"Source dataset not found: {SOURCE_FILE}")
+
+    with open(SOURCE_FILE, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    if not isinstance(data, list) or len(data) == 0:
+        raise ValueError("Dataset must be a non-empty list")
+
+    # -------------------------
+    # Basic validation
+    # -------------------------
+    for i, item in enumerate(data):
+        if "text" not in item or "label" not in item:
+            raise ValueError(f"Invalid sample at index {i}: {item}")
+
+    # -------------------------
+    # Stratified split
+    # -------------------------
+    random.seed(RANDOM_SEED)
+
+    by_label = defaultdict(list)
+    for item in data:
+        by_label[item["label"]].append(item)
+
+    train_data = []
+    eval_data = []
+
+    for label, items in by_label.items():
+        random.shuffle(items)
+
+        split_idx = max(1, int(len(items) * TRAIN_RATIO))
+
+        train_data.extend(items[:split_idx])
+        eval_data.extend(items[split_idx:])
+
+    # Final shuffle (important)
+    random.shuffle(train_data)
+    random.shuffle(eval_data)
+
+    # -------------------------
+    # Write outputs
+    # -------------------------
+    SAMPLES_DIR.mkdir(parents=True, exist_ok=True)
+
+    with open(TRAIN_FILE, "w", encoding="utf-8") as f:
+        json.dump(train_data, f, indent=2, ensure_ascii=False)
+
+    with open(EVAL_FILE, "w", encoding="utf-8") as f:
+        json.dump(eval_data, f, indent=2, ensure_ascii=False)
+
+    # -------------------------
+    # Summary
+    # -------------------------
+    print("====================================")
+    print("Dataset seeding completed")
+    print("====================================")
+    print(f"Total samples : {len(data)}")
+    print(f"Train samples : {len(train_data)}")
+    print(f"Eval samples  : {len(eval_data)}")
+    print()
+
+    print("Label distribution (train):")
+    _print_distribution(train_data)
+
+    print("\nLabel distribution (eval):")
+    _print_distribution(eval_data)
+
+
+def _print_distribution(dataset):
+    dist = defaultdict(int)
+    for item in dataset:
+        dist[item["label"]] += 1
+
+    for label, count in sorted(dist.items()):
+        print(f"  {label:<20} {count}")
+
+
+if __name__ == "__main__":
+    main()
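One consequence of the max(1, int(len(items) * TRAIN_RATIO)) split worth noting: every label keeps at least one training sample, but a label with only one sample gets no eval coverage at all. A quick arithmetic check of the per-label split sizes, using hypothetical counts:

# Per-label split sizes for TRAIN_RATIO = 0.7 (hypothetical counts).
TRAIN_RATIO = 0.7
for n in (1, 2, 3, 10):
    split_idx = max(1, int(n * TRAIN_RATIO))
    print(f"{n} samples -> {split_idx} train, {n - split_idx} eval")
# Prints: 1 -> 1 train / 0 eval, 2 -> 1/1, 3 -> 2/1, 10 -> 7/3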