# engine/genus_predictor.py """ Genus-level ML prediction using the XGBoost model trained in Stage 12D. This module loads: models/genus_xgb.json models/genus_xgb_meta.json And exposes: predict_genus_from_fused(fused_fields) Which returns a list of tuples: [ (genus_name, probability_float, confidence_label), ... ] Where confidence_label is one of: - "Excellent Identification" (>= 0.90) - "Good Identification" (>= 0.80) - "Acceptable Identification" (>= 0.65) - "Low Discrimination" (< 0.65) """ from __future__ import annotations import os import json from typing import Dict, Any, List, Tuple import numpy as np import xgboost as xgb from .features import extract_feature_vector # Paths _MODEL_PATH = "models/genus_xgb.json" _META_PATH = "models/genus_xgb_meta.json" # ---------------------------------------------------------------------- # Lazy load model + metadata — only loads once globally # ---------------------------------------------------------------------- _MODEL = None _META = None _IDX_TO_GENUS = None _NUM_FEATURES = None _NUM_CLASSES = None def _lazy_load(): """Load model and metadata only once.""" global _MODEL, _META, _IDX_TO_GENUS, _NUM_FEATURES, _NUM_CLASSES if _MODEL is not None: return if not os.path.exists(_MODEL_PATH): raise FileNotFoundError(f"Genus model not found at '{_MODEL_PATH}'.") if not os.path.exists(_META_PATH): raise FileNotFoundError(f"Genus meta file not found at '{_META_PATH}'.") # Load model _MODEL = xgb.Booster() _MODEL.load_model(_MODEL_PATH) # Load metadata with open(_META_PATH, "r", encoding="utf-8") as f: _META = json.load(f) _IDX_TO_GENUS = {int(k): v for k, v in _META["idx_to_genus"].items()} _NUM_FEATURES = _META["n_features"] _NUM_CLASSES = _META["num_classes"] # ---------------------------------------------------------------------- # Confidence label assignment # ---------------------------------------------------------------------- def _confidence_band(p: float) -> str: if p >= 0.90: return "Excellent Identification" if p >= 0.80: return "Good Identification" if p >= 0.65: return "Acceptable Identification" return "Low Discrimination" # ---------------------------------------------------------------------- # Public prediction function # ---------------------------------------------------------------------- def predict_genus_from_fused( fused_fields: Dict[str, Any], top_k: int = 10 ) -> List[Tuple[str, float, str]]: """ Predict genus from fused fields using the trained XGBoost model. Returns top_k results sorted by probability: [(genus_name, probability_float, confidence_label), ...] """ _lazy_load() # Build feature vector vec = extract_feature_vector(fused_fields) if vec.shape[0] != _NUM_FEATURES: # Defensive: mismatch in schema → pad or trim fixed = np.zeros(_NUM_FEATURES, dtype=float) m = min(len(vec), _NUM_FEATURES) fixed[:m] = vec[:m] vec = fixed dmat = xgb.DMatrix(vec.reshape(1, -1)) probs = _MODEL.predict(dmat)[0] # shape: (num_classes,) # Build list of (genus, prob, band) results = [] for idx, p in enumerate(probs): genus = _IDX_TO_GENUS.get(idx, f"Class_{idx}") results.append((genus, float(p), _confidence_band(float(p)))) # Sort by probability, descending results.sort(key=lambda x: x[1], reverse=True) return results[:top_k]