EphAsad committed on
Commit
02a1f10
·
verified ·
1 Parent(s): 4811a82

Update engine/train_genus_model.py

Browse files
Files changed (1) hide show
  1. engine/train_genus_model.py +90 -126
engine/train_genus_model.py CHANGED
@@ -2,27 +2,16 @@
2
  """
3
  Train a genus-level classifier (XGBoost) from gold tests.
4
 
5
- Inputs:
6
- - training/gold_tests.json
7
- Each item should have:
8
- - "name" (e.g. "Salmonella enterica")
9
- - and a dict of expected fields:
10
- "fields" or "expected_fields" or "schema" or "expected"
11
-
12
- We:
13
- 1) Extract the genus from "name" (first token).
14
- 2) Turn expected fields into an ML feature vector via engine.features.extract_feature_vector.
15
- 3) Train an XGBoost multi-class classifier (one class per genus).
16
- 4) Save:
17
- models/genus_xgb.json (the model)
18
- models/genus_xgb_meta.json (label map + feature meta)
19
-
20
- This module exposes:
21
-
22
- train_genus_model() -> dict summary
23
-
24
- so the Gradio UI can call it and display the JSON summary, and also keeps a
25
- CLI entry via `python -m engine.train_genus_model` or direct execution.
26
  """
27
 
28
  from __future__ import annotations
@@ -37,7 +26,10 @@ import xgboost as xgb
37
 
38
  from .features import extract_feature_vector, FEATURES
39
 
 
40
  # Paths
 
 
41
  GOLD_TESTS_PATH = "training/gold_tests.json"
42
  MODEL_DIR = "models"
43
  MODEL_PATH = os.path.join(MODEL_DIR, "genus_xgb.json")
@@ -45,35 +37,44 @@ META_PATH = os.path.join(MODEL_DIR, "genus_xgb_meta.json")
45
 
46
 
47
  # ---------------------------------------------------------------------------
48
- # Helpers
49
  # ---------------------------------------------------------------------------
50
 
51
-
52
  def _load_gold_tests(path: str) -> List[Dict[str, Any]]:
 
 
 
53
  with open(path, "r", encoding="utf-8") as f:
54
  data = json.load(f)
 
55
  if not isinstance(data, list):
56
- raise ValueError("gold_tests.json should contain a list of samples.")
 
57
  return data
58
 
59
 
 
 
 
 
60
  def _extract_genus(sample: Dict[str, Any]) -> str | None:
61
  """
62
- Get genus from sample["name"] / ["Name"] / ["organism"] etc.
63
- We just take the first word.
 
64
  """
65
  for key in ("name", "Name", "organism", "Organism"):
66
  if key in sample and sample[key]:
67
- text = str(sample[key]).strip()
68
- if not text:
69
- continue
70
- return text.split()[0]
71
  return None
72
 
73
 
74
  def _extract_fields(sample: Dict[str, Any]) -> Dict[str, Any]:
75
  """
76
- Try several possible keys for the expected fields in gold_tests.json.
 
77
  """
78
  for key in ("fields", "expected_fields", "schema", "expected"):
79
  if key in sample and isinstance(sample[key], dict):
@@ -81,19 +82,19 @@ def _extract_fields(sample: Dict[str, Any]) -> Dict[str, Any]:
81
  return {}
82
 
83
 
84
- def _build_dataset(
85
- samples: List[Dict[str, Any]]
86
- ) -> Tuple[np.ndarray, np.ndarray, Dict[str, int]]:
87
- """
88
- Build X (features) and y (integer genus labels) from gold tests.
89
 
90
- Returns:
91
- X: (N, D) feature matrix
92
- y: (N,) integer labels
93
- genus_to_idx: mapping from genus string → class index
94
  """
95
- X: List[np.ndarray] = []
96
- y: List[int] = []
 
 
 
 
 
97
  genus_to_idx: Dict[str, int] = {}
98
 
99
  for sample in samples:
@@ -103,41 +104,41 @@ def _build_dataset(
103
 
104
  fields = _extract_fields(sample)
105
  if not fields:
106
- # No expected fields for this sample → skip
107
  continue
108
 
109
- # Convert expected fields to feature vector
110
  vec = extract_feature_vector(fields)
111
 
112
- # Map genus to class index
113
  if genus not in genus_to_idx:
114
  genus_to_idx[genus] = len(genus_to_idx)
115
- label = genus_to_idx[genus]
116
 
117
- X.append(vec)
118
- y.append(label)
 
 
 
119
 
120
- if not X:
121
- raise ValueError("No usable samples found in gold_tests.json.")
122
 
123
- X_arr = np.vstack(X)
124
- y_arr = np.array(y, dtype=np.int32)
125
 
126
- return X_arr, y_arr, genus_to_idx
127
 
 
 
 
128
 
129
  def _train_xgboost(
130
  X: np.ndarray,
131
  y: np.ndarray,
132
  num_classes: int,
133
- seed: int = 42,
134
  ) -> Tuple[xgb.Booster, Dict[str, float]]:
135
  """
136
- Train an XGBoost multi-class classifier with a simple train/valid split.
137
- Returns:
138
- model, metrics_dict
139
  """
140
- # Train/valid split (80/20)
141
  n = X.shape[0]
142
  indices = list(range(n))
143
  random.Random(seed).shuffle(indices)
@@ -156,10 +157,10 @@ def _train_xgboost(
156
  "objective": "multi:softprob",
157
  "num_class": num_classes,
158
  "eval_metric": "mlogloss",
159
- "max_depth": 5,
160
- "eta": 0.1,
161
- "subsample": 0.8,
162
- "colsample_bytree": 0.8,
163
  "min_child_weight": 1,
164
  "seed": seed,
165
  }
@@ -169,30 +170,29 @@ def _train_xgboost(
169
  model = xgb.train(
170
  params,
171
  dtrain,
172
- num_boost_round=200,
173
  evals=evals,
174
- early_stopping_rounds=20,
175
- verbose_eval=25,
 
176
  )
177
 
178
- # Simple accuracy on train/valid
179
- train_pred = np.argmax(model.predict(dtrain), axis=1)
180
- valid_pred = np.argmax(model.predict(dvalid), axis=1)
181
-
182
- train_acc = float((train_pred == y_train).mean())
183
- valid_acc = float((valid_pred == y_valid).mean())
 
184
 
185
- metrics = {
186
  "train_accuracy": train_acc,
187
  "valid_accuracy": valid_acc,
188
  "best_iteration": int(model.best_iteration),
189
  }
190
 
191
- return model, metrics
192
-
193
 
194
- def _ensure_model_dir() -> None:
195
- if not os.path.isdir(MODEL_DIR):
196
  os.makedirs(MODEL_DIR, exist_ok=True)
197
 
198
 
@@ -200,49 +200,29 @@ def _ensure_model_dir() -> None:
200
  # Public entry for UI
201
  # ---------------------------------------------------------------------------
202
 
203
-
204
  def train_genus_model() -> Dict[str, Any]:
205
- """
206
- Public function used by the Gradio UI.
207
-
208
- Returns a JSON-serialisable dict, e.g.:
209
-
210
- {
211
- "ok": true,
212
- "message": "...",
213
- "stats": {...},
214
- "metrics": {...},
215
- "paths": {...},
216
- "class_count": 42,
217
- "genus_examples": ["Salmonella", "Staphylococcus", ...]
218
- }
219
- """
220
  try:
221
- print("Loading gold tests from:", GOLD_TESTS_PATH)
222
  samples = _load_gold_tests(GOLD_TESTS_PATH)
223
- print(f"Loaded {len(samples)} gold samples (raw).")
224
 
225
- print("Building dataset...")
226
  X, y, genus_to_idx = _build_dataset(samples)
 
227
  num_classes = len(genus_to_idx)
228
- print(f"Usable samples: {X.shape[0]}")
229
  print(f"Feature dimension: {X.shape[1]}")
230
- print(f"Distinct genera (classes): {num_classes}")
 
231
 
232
- print("Training XGBoost genus classifier...")
233
- model, metrics = _train_xgboost(X, y, num_classes=num_classes)
234
 
235
  print("Training complete.")
236
  print(f"Train accuracy: {metrics['train_accuracy']:.3f}")
237
  print(f"Valid accuracy: {metrics['valid_accuracy']:.3f}")
238
- print(f"Best iteration: {metrics['best_iteration']}")
239
 
240
  _ensure_model_dir()
241
-
242
- print("Saving model to:", MODEL_PATH)
243
  model.save_model(MODEL_PATH)
244
 
245
- # Build index → genus map
246
  idx_to_genus = {idx: genus for genus, idx in genus_to_idx.items()}
247
 
248
  meta = {
@@ -255,18 +235,12 @@ def train_genus_model() -> Dict[str, Any]:
255
  "feature_names": [f["name"] for f in FEATURES],
256
  }
257
 
258
- print("Saving meta to:", META_PATH)
259
  with open(META_PATH, "w", encoding="utf-8") as f:
260
  json.dump(meta, f, indent=2, ensure_ascii=False)
261
 
262
- print("Done.")
263
-
264
- # Compact summary for the UI
265
- genus_examples = sorted(list(genus_to_idx.keys()))[:20]
266
-
267
  return {
268
  "ok": True,
269
- "message": "Genus XGBoost model trained and saved successfully.",
270
  "stats": {
271
  "num_raw_samples": len(samples),
272
  "num_usable_samples": int(X.shape[0]),
@@ -274,19 +248,14 @@ def train_genus_model() -> Dict[str, Any]:
274
  "num_classes": int(num_classes),
275
  },
276
  "metrics": metrics,
277
- "paths": {
278
- "model_path": MODEL_PATH,
279
- "meta_path": META_PATH,
280
- },
281
- "class_count": int(num_classes),
282
- "genus_examples": genus_examples,
283
  }
284
 
285
  except Exception as e:
286
- # If anything blows up, return a clean error for the UI JSON
287
  return {
288
  "ok": False,
289
- "message": f"Error during genus model training: {type(e).__name__}: {e}",
290
  }
291
 
292
 
@@ -294,14 +263,9 @@ def train_genus_model() -> Dict[str, Any]:
294
  # CLI entry
295
  # ---------------------------------------------------------------------------
296
 
297
-
298
- def main() -> None:
299
- """
300
- Keep a CLI entry that prints the same summary.
301
- """
302
- summary = train_genus_model()
303
- print(json.dumps(summary, indent=2, ensure_ascii=False))
304
 
305
 
306
  if __name__ == "__main__":
307
- main()
 
2
  """
3
  Train a genus-level classifier (XGBoost) from gold tests.
4
 
5
+ Pipeline:
6
+ Load gold_tests.json
7
+ Extract genus (first token of organism name)
8
+ Convert expected_fields → feature vector (via engine.features.extract_feature_vector)
9
+ Train an XGBoost multi-class classifier
10
+ Save:
11
+ models/genus_xgb.json
12
+ models/genus_xgb_meta.json
13
+
14
+ Compatible with FEATURE SCHEMA v2 (category, binary temperature flags, pigment, odor, colony pattern, TSI, etc.)
 
 
 
 
 
 
 
 
 
 
 
15
  """
16
 
17
  from __future__ import annotations
 
26
 
27
  from .features import extract_feature_vector, FEATURES
28
 
29
+ # ---------------------------------------------------------------------------
30
  # Paths
31
+ # ---------------------------------------------------------------------------
32
+
33
  GOLD_TESTS_PATH = "training/gold_tests.json"
34
  MODEL_DIR = "models"
35
  MODEL_PATH = os.path.join(MODEL_DIR, "genus_xgb.json")
 
37
 
38
 
39
  # ---------------------------------------------------------------------------
40
+ # Load gold tests
41
  # ---------------------------------------------------------------------------
42
 
 
43
  def _load_gold_tests(path: str) -> List[Dict[str, Any]]:
44
+ if not os.path.exists(path):
45
+ raise FileNotFoundError(f"Missing gold test file: {path}")
46
+
47
  with open(path, "r", encoding="utf-8") as f:
48
  data = json.load(f)
49
+
50
  if not isinstance(data, list):
51
+ raise ValueError("gold_tests.json must contain a list.")
52
+
53
  return data
54
 
55
 
56
+ # ---------------------------------------------------------------------------
57
+ # Extract genus & expected fields
58
+ # ---------------------------------------------------------------------------
59
+
60
  def _extract_genus(sample: Dict[str, Any]) -> str | None:
61
  """
62
+ Extract genus from:
63
+ name / Name / organism / Organism
64
+ (genus = first token before space)
65
  """
66
  for key in ("name", "Name", "organism", "Organism"):
67
  if key in sample and sample[key]:
68
+ val = str(sample[key]).strip()
69
+ if val:
70
+ return val.split()[0]
 
71
  return None
72
 
73
 
74
  def _extract_fields(sample: Dict[str, Any]) -> Dict[str, Any]:
75
  """
76
+ Extract expected field dict from any of:
77
+ fields / expected_fields / schema / expected
78
  """
79
  for key in ("fields", "expected_fields", "schema", "expected"):
80
  if key in sample and isinstance(sample[key], dict):
 
82
  return {}
83
 
84
 
85
+ # ---------------------------------------------------------------------------
86
+ # Dataset builder
87
+ # ---------------------------------------------------------------------------
 
 
88
 
89
+ def _build_dataset(samples: List[Dict[str, Any]]) -> Tuple[np.ndarray, np.ndarray, Dict[str, int]]:
 
 
 
90
  """
91
+ Convert gold tests into:
92
+ X → feature matrix
93
+ y → integer labels
94
+ genus_to_idx → mapping
95
+ """
96
+ X_list: List[np.ndarray] = []
97
+ y_list: List[int] = []
98
  genus_to_idx: Dict[str, int] = {}
99
 
100
  for sample in samples:
 
104
 
105
  fields = _extract_fields(sample)
106
  if not fields:
 
107
  continue
108
 
109
+ # Generate ML feature vector (schema v2)
110
  vec = extract_feature_vector(fields)
111
 
 
112
  if genus not in genus_to_idx:
113
  genus_to_idx[genus] = len(genus_to_idx)
 
114
 
115
+ X_list.append(vec)
116
+ y_list.append(genus_to_idx[genus])
117
+
118
+ if not X_list:
119
+ raise ValueError("No usable gold tests found.")
120
 
121
+ X = np.vstack(X_list)
122
+ y = np.array(y_list, dtype=np.int32)
123
 
124
+ return X, y, genus_to_idx
 
125
 
 
126
 
127
+ # ---------------------------------------------------------------------------
128
+ # Train XGBoost model
129
+ # ---------------------------------------------------------------------------
130
 
131
  def _train_xgboost(
132
  X: np.ndarray,
133
  y: np.ndarray,
134
  num_classes: int,
135
+ seed: int = 42
136
  ) -> Tuple[xgb.Booster, Dict[str, float]]:
137
  """
138
+ Train a multi-class XGBoost classifier.
139
+ 80/20 split.
 
140
  """
141
+
142
  n = X.shape[0]
143
  indices = list(range(n))
144
  random.Random(seed).shuffle(indices)
 
157
  "objective": "multi:softprob",
158
  "num_class": num_classes,
159
  "eval_metric": "mlogloss",
160
+ "max_depth": 6, # Higher depth since schema v2 more complex
161
+ "eta": 0.08, # Slightly slower learning
162
+ "subsample": 0.9,
163
+ "colsample_bytree": 0.9,
164
  "min_child_weight": 1,
165
  "seed": seed,
166
  }
 
170
  model = xgb.train(
171
  params,
172
  dtrain,
 
173
  evals=evals,
174
+ num_boost_round=500, # More rounds since more features
175
+ early_stopping_rounds=40, # Allow more patience for complex space
176
+ verbose_eval=50,
177
  )
178
 
179
+ # Accuracy evaluation
180
+ train_acc = float(
181
+ (np.argmax(model.predict(dtrain), axis=1) == y_train).mean()
182
+ )
183
+ valid_acc = float(
184
+ (np.argmax(model.predict(dvalid), axis=1) == y_valid).mean()
185
+ )
186
 
187
+ return model, {
188
  "train_accuracy": train_acc,
189
  "valid_accuracy": valid_acc,
190
  "best_iteration": int(model.best_iteration),
191
  }
192
 
 
 
193
 
194
+ def _ensure_model_dir():
195
+ if not os.path.exists(MODEL_DIR):
196
  os.makedirs(MODEL_DIR, exist_ok=True)
197
 
198
 
 
200
  # Public entry for UI
201
  # ---------------------------------------------------------------------------
202
 
 
203
  def train_genus_model() -> Dict[str, Any]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  try:
205
+ print(f"Loading gold tests {GOLD_TESTS_PATH}")
206
  samples = _load_gold_tests(GOLD_TESTS_PATH)
 
207
 
208
+ print("Building ML dataset...")
209
  X, y, genus_to_idx = _build_dataset(samples)
210
+
211
  num_classes = len(genus_to_idx)
 
212
  print(f"Feature dimension: {X.shape[1]}")
213
+ print(f"Classes (genera): {num_classes}")
214
+ print(f"Samples: {X.shape[0]}")
215
 
216
+ print("Training XGBoost (schema v2)...")
217
+ model, metrics = _train_xgboost(X, y, num_classes)
218
 
219
  print("Training complete.")
220
  print(f"Train accuracy: {metrics['train_accuracy']:.3f}")
221
  print(f"Valid accuracy: {metrics['valid_accuracy']:.3f}")
 
222
 
223
  _ensure_model_dir()
 
 
224
  model.save_model(MODEL_PATH)
225
 
 
226
  idx_to_genus = {idx: genus for genus, idx in genus_to_idx.items()}
227
 
228
  meta = {
 
235
  "feature_names": [f["name"] for f in FEATURES],
236
  }
237
 
 
238
  with open(META_PATH, "w", encoding="utf-8") as f:
239
  json.dump(meta, f, indent=2, ensure_ascii=False)
240
 
 
 
 
 
 
241
  return {
242
  "ok": True,
243
+ "message": "Genus XGBoost model (schema v2) trained successfully.",
244
  "stats": {
245
  "num_raw_samples": len(samples),
246
  "num_usable_samples": int(X.shape[0]),
 
248
  "num_classes": int(num_classes),
249
  },
250
  "metrics": metrics,
251
+ "paths": {"model_path": MODEL_PATH, "meta_path": META_PATH},
252
+ "genus_examples": sorted(genus_to_idx.keys())[:20],
 
 
 
 
253
  }
254
 
255
  except Exception as e:
 
256
  return {
257
  "ok": False,
258
+ "message": f"Training error: {type(e).__name__}: {e}",
259
  }
260
 
261
 
 
263
  # CLI entry
264
  # ---------------------------------------------------------------------------
265
 
266
+ def main():
267
+ print(json.dumps(train_genus_model(), indent=2, ensure_ascii=False))
 
 
 
 
 
268
 
269
 
270
  if __name__ == "__main__":
271
+ main()