Spaces:

COCODEDE04
/

SF_FastAPI

Sleeping

App Files Files Community

COCODEDE04 committed on Nov 12, 2025

Commit

a46e832

verified ·

1 Parent(s): c761c99

Update app.py

Browse files

Files changed (1) hide show

app.py +281 -278

app.py CHANGED Viewed

@@ -1,336 +1,339 @@
-# app.py
-import os, json, glob
 from typing import Any, Dict, List, Optional
 import numpy as np
 import tensorflow as tf
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 # ----------------- CONFIG -----------------
-DEFAULT_MODEL_CANDIDATES = ["best_model.h5", "best_model.keras"]
-DEFAULT_IMPUTER_CANDIDATES = ["imputer.joblib", "imputer.pkl", "imputer.sav"]
-DEFAULT_SCALER_CANDIDATES  = ["scaler.joblib",  "scaler.pkl",  "scaler.sav"]
-DEFAULT_STATS_PATH = "means_std.json"
-CLASSES = ["Top", "Mid-Top", "Mid", "Mid-Low", "Low"]  # index 0=Top ... 4=Low
-APPLY_CORAL_MONOTONE = True  # nudge thresholds to be non-increasing before decode
 # ------------------------------------------
-HERE = os.path.dirname(os.path.abspath(__file__))
-# ---------- utilities: robust file resolving & logging ----------
-def resolve_first(*names: str) -> Optional[str]:
-    """Return absolute path to the first existing file among provided names
-    by checking HERE, CWD, then recursive matches."""
-    for base in (HERE, os.getcwd()):
-        for n in names:
-            p = os.path.join(base, n)
-            if os.path.isfile(p):
-                return p
-    # recursive fallback (handles subfolders)
-    patterns: List[str] = []
-    for n in names:
-        patterns += [os.path.join(HERE, "**", n),
-                     os.path.join(os.getcwd(), "**", n)]
-    for pat in patterns:
-        for p in glob.glob(pat, recursive=True):
-            if os.path.isfile(p):
-                return p
-    return None
-def describe_dir():
-    try:
-        print("CWD:", os.getcwd())
-        print("Repo dir (HERE):", HERE)
-        print("Repo listing:", os.listdir(HERE))
-    except Exception as e:
-        print("listdir error:", e)
-def load_joblib(label: str, candidates: List[str]):
-    import joblib
-    print(f"Looking for {label} among: {candidates}")
-    describe_dir()
-    path = resolve_first(*candidates)
-    if not path:
-        print(f"⚠️  {label} not found.")
-        return None
-    try:
-        print(f"Loading {label} from {path} ({os.path.getsize(path)} bytes)")
-    except Exception:
-        print(f"Loading {label} from {path}")
-    try:
-        return joblib.load(path)
-    except Exception as e:
-        print(f"⚠️  Failed to load {label}: {repr(e)}")
-        return None
-def load_model_robust() -> tf.keras.Model:
-    print("Resolving model...")
-    # env override supported
-    env_model = os.getenv("MODEL_PATH")
-    if env_model:
-        candidates = [env_model]
-    else:
-        candidates = DEFAULT_MODEL_CANDIDATES
-    path = resolve_first(*candidates)
-    if not path:
-        raise FileNotFoundError(f"Model file not found. Tried: {candidates}")
-    print(f"Loading model from {path} ({os.path.getsize(path)} bytes)")
-    # We don't need custom objects for inference; compile=False is safer
-    return tf.keras.models.load_model(path, compile=False)
-def load_means_std(stats_path: Optional[str]) -> Optional[Dict[str, Dict[str, float]]]:
-    path = stats_path or os.getenv("STATS_PATH") or DEFAULT_STATS_PATH
-    path = resolve_first(path) if path else None
-    if not path:
-        print("⚠️  means_std.json not found.")
-        return None
-    print(f"Loading means/std from {path} ({os.path.getsize(path)} bytes)")
-    with open(path, "r") as f:
-        return json.load(f)
-# ---------- numeric coercion ----------
 def coerce_float(val: Any) -> float:
-    """Accepts numeric, or locale strings like '49.709,14' -> 49709.14"""
-    if isinstance(val, (int, float)):
         return float(val)
     s = str(val).strip()
     if s == "":
         raise ValueError("empty")
     s = s.replace(" ", "")
-    has_dot = "." in s
-    has_comma = "," in s
     if has_dot and has_comma:
-        last_dot = s.rfind(".")
-        last_comma = s.rfind(",")
-        if last_comma > last_dot:
             s = s.replace(".", "")
             s = s.replace(",", ".")
         else:
             s = s.replace(",", "")
     elif has_comma and not has_dot:
         s = s.replace(",", ".")
     return float(s)
-def z_manual(val: Any, mean: float, sd: float) -> float:
-    try:
-        v = coerce_float(val)
-    except Exception:
-        return 0.0
-    if not sd:
-        return 0.0
-    return (v - mean) / sd
-# ---------- CORAL decoding ----------
-def coral_probs_from_logits(logits_np: np.ndarray, monotone: bool = False) -> np.ndarray:
     """
-    logits: (N, K-1) cumulative logits.
-    If monotone=True, enforce non-increasing thresholds per sample before decode.
     """
-    logits = np.asarray(logits_np, dtype=np.float32)
-    if monotone:
-        # clamp each row to be non-increasing: t1 >= t2 >= t3 >= ...
-        # for Top=0 best to Low=4 worst, cumulative boundary logits
-        for i in range(logits.shape[0]):
-            row = logits[i]
-            # make it non-increasing by cumulative minimum from left to right
-            for j in range(1, row.shape[0]):
-                if row[j] > row[j - 1]:
-                    row[j] = row[j - 1]
-            logits[i] = row
-    sig = 1.0 / (1.0 + np.exp(-logits))  # sigmoid
-    left = np.concatenate([np.ones((sig.shape[0], 1), dtype=np.float32), sig], axis=1)
-    right = np.concatenate([sig, np.zeros((sig.shape[0], 1), dtype=np.float32)], axis=1)
-    probs = np.clip(left - right, 1e-12, 1.0)
-    return probs
-# ---------- FastAPI app ----------
-app = FastAPI(title="Static Fingerprint API", version="1.1.0")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=False,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
 print("Loading model / imputer / scaler...")
-model = load_model_robust()
-imputer = load_joblib("imputer", DEFAULT_IMPUTER_CANDIDATES)
-scaler  = load_joblib("scaler",  DEFAULT_SCALER_CANDIDATES)
-stats   = load_means_std(os.getenv("STATS_PATH"))
-# Feature order:
-# Prefer scaler.feature_names_in_ if present (sklearn >=1.0),
-# else imputer.feature_names_in_,
-# else the order in means_std.json,
-# else fail loudly.
-if hasattr(scaler, "feature_names_in_"):
-    FEATURES: List[str] = list(scaler.feature_names_in_)
-    print("FEATURES from scaler.feature_names_in_")
-elif hasattr(imputer, "feature_names_in_"):
-    FEATURES = list(imputer.feature_names_in_)
-    print("FEATURES from imputer.feature_names_in_")
-elif isinstance(stats, dict):
-    FEATURES = list(stats.keys())
-    print("FEATURES from means_std.json order")
-else:
-    raise RuntimeError("Cannot determine feature order. Provide scaler/imputer with feature_names_in_ or a means_std.json.")
-print("Feature order:", FEATURES)
-print("Artifacts present:",
-      {"imputer": imputer is not None, "scaler": scaler is not None, "stats": stats is not None})
-@app.get("/")
-def root():
-    return {
-        "message": "Static Fingerprint API is running.",
-        "try": ["GET /health", "POST /predict", "POST /echo"],
-    }
-@app.get("/health")
-def health():
-    return {
-        "status": "ok",
-        "features": FEATURES,
-        "classes": CLASSES,
-        "artifacts": {
-            "imputer": bool(imputer is not None),
-            "scaler": bool(scaler is not None),
-            "means_std": bool(stats is not None),
-        },
-    }
-@app.post("/echo")
-async def echo(req: Request):
-    payload = await req.json()
-    return {"received": payload}
-def preprocess_payload_to_X(payload: Dict[str, Any]) -> Dict[str, Any]:
     """
-    Returns dict with:
-      - X: np.ndarray shape (1, n_features) ready for model
-      - z_scores: dict feature -> z value (if available)
-      - missing: list of features not provided
-      - used: dict feature -> raw value used (after imputation)
     """
-    missing: List[str] = []
-    used_vals: List[float] = []
-    z_scores: Dict[str, float] = {}
-    used_raw: Dict[str, float] = {}
-    # Build raw feature vector in correct order
-    raw_vec: List[float] = []
     for f in FEATURES:
         if f in payload:
-            v = coerce_float(payload[f])
         else:
-            missing.append(f)
-            v = np.nan  # let imputer handle it (median), or we'll fill below
-        raw_vec.append(v)
-    raw = np.array([raw_vec], dtype=np.float32)
-    # Impute if available
     if imputer is not None:
-        raw_imp = imputer.transform(raw)
-    else:
-        # If no imputer, simple median fill using means_std or zero
-        raw_imp = raw.copy()
-        for j, f in enumerate(FEATURES):
-            if np.isnan(raw_imp[0, j]):
-                if stats and f in stats:
-                    raw_imp[0, j] = stats[f].get("mean", 0.0)
-                else:
-                    raw_imp[0, j] = 0.0
-    # Scale if available
     if scaler is not None:
-        X = scaler.transform(raw_imp).astype(np.float32)
-        # we can still compute z-scores from scaler if it exposes scale_ and mean_
-        if hasattr(scaler, "mean_") and hasattr(scaler, "scale_"):
-            for j, f in enumerate(FEATURES):
-                mu = float(scaler.mean_[j])
-                sd = float(scaler.scale_[j])
-                z = 0.0 if sd == 0 else (float(raw_imp[0, j]) - mu) / sd
-                z_scores[f] = float(z)
     else:
-        # manual z-score using means_std.json
-        if not stats:
-            raise RuntimeError("No scaler and no means_std.json — cannot standardize.")
-        z_list: List[float] = []
-        for j, f in enumerate(FEATURES):
-            mu = float(stats[f]["mean"])
-            sd = float(stats[f]["std"])
-            z = z_manual(raw_imp[0, j], mu, sd)
-            z_list.append(z)
-            z_scores[f] = float(z)
-        X = np.array([z_list], dtype=np.float32)
-    # capture used raw values (after imputation)
-    for j, f in enumerate(FEATURES):
-        used_val = float(raw_imp[0, j])
-        used_raw[f] = used_val
-        used_vals.append(used_val)
     return {
-        "X": X,
-        "z_scores": z_scores,
-        "missing": missing,
-        "used": used_raw,
     }
 @app.post("/predict")
 async def predict(req: Request):
-    payload = await req.json()
-    if not isinstance(payload, dict):
-        return {"error": "Expected a JSON object mapping feature -> value."}
-    prep = preprocess_payload_to_X(payload)
-    X: np.ndarray = prep["X"]
-    raw = model.predict(X, verbose=0)
-    # CORAL (K-1) vs softmax (K)
-    debug: Dict[str, Any] = {"raw_shape": list(raw.shape)}
-    if raw.ndim == 2 and raw.shape[1] == (len(CLASSES) - 1):
-        decode_mode = "auto_coral_monotone" if APPLY_CORAL_MONOTONE else "auto_coral"
-        probs = coral_probs_from_logits(raw, monotone=APPLY_CORAL_MONOTONE)[0]
-    else:
-        decode_mode = "auto_softmax"
-        probs = raw[0]
-        s = float(np.sum(probs))
-        if s > 0:
-            probs = probs / s
-    debug["decode_mode"] = decode_mode
-    debug["raw_first_row"] = [float(x) for x in np.array(raw[0]).ravel().tolist()]
-    pred_idx = int(np.argmax(probs))
-    return {
-        "input_ok": (len(prep["missing"]) == 0),
-        "missing": prep["missing"],
-        "used_raw": prep["used"],        # values after imputation
-        "z_scores": prep["z_scores"],    # standardized (from scaler or stats)
-        "probabilities": {CLASSES[i]: float(probs[i]) for i in range(len(CLASSES))},
-        "predicted_state": CLASSES[pred_idx],
-        "debug": debug,
-    }

+import os, json, io, traceback
 from typing import Any, Dict, List, Optional
 import numpy as np
 import tensorflow as tf
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
 # ----------------- CONFIG -----------------
+MODEL_PATH = os.getenv("MODEL_PATH", "best_model.h5")
+STATS_PATH = os.getenv("STATS_PATH", "means_std.json")
+IMPUTER_CANDIDATES = ["imputer.joblib", "imputer.pkl", "imputer.sav"]
+SCALER_CANDIDATES  = ["scaler.joblib", "scaler.pkl", "scaler.sav"]
+CLASSES = ["Top", "Mid-Top", "Mid", "Mid-Low", "Low"]
+# ⛔ DO NOT CHANGE: exact order used in training
+FEATURES: List[str] = [
+    "autosuf_oper",
+    "improductiva",
+    "gastos_fin_over_avg_cart",
+    "_equity",
+    "grado_absorcion",
+    "_cartera_bruta",
+    "gastos_oper_over_ing_oper",
+    "cartera_vencida_ratio",
+    "roe_pre_tax",
+    "_assets",
+    "_liab",
+    "equity_over_assets",
+    "_margen_bruto",
+    "prov_over_cartera",
+    "gastos_oper_over_cart",
+    "ing_cartera_over_ing_total",
+    "debt_to_equity",
+    "prov_gasto_over_cart",
+    "cov_improductiva",
+    "rend_cart_over_avg_cart",
+    "roa_pre_tax",
+]
 # ------------------------------------------
+# --------- helpers: I/O + numeric coercion ---------
 def coerce_float(val: Any) -> float:
+    """
+    Convert a number or a locale-formatted numeric string to float.
+
+    Accepts numeric values (int, float, NumPy scalars) or strings like:
+      "49.709,14" -> 49709.14   (comma is the decimal mark)
+      "49,709.14" -> 49709.14   (dot is the decimal mark)
+      "0,005"     -> 0.005      (a single comma is a decimal mark)
+      "1,234,567" -> 1234567.0  (a repeated separator is grouping)
+      "1.234.567" -> 1234567.0
+
+    Raises ValueError for empty or unparseable strings.
+    """
+    if isinstance(val, (int, float, np.number)):
         return float(val)
     s = str(val).strip()
     if s == "":
         raise ValueError("empty")
     s = s.replace(" ", "")
+    has_dot, has_comma = "." in s, "," in s
     if has_dot and has_comma:
+        # Whichever separator appears last is the decimal mark.
+        if s.rfind(",") > s.rfind("."):
             s = s.replace(".", "")
             s = s.replace(",", ".")
         else:
             s = s.replace(",", "")
+    elif has_comma:
+        # A number has at most one decimal mark, so repeated commas must be
+        # grouping separators; a single comma is read as the decimal mark.
+        s = s.replace(",", "") if s.count(",") > 1 else s.replace(",", ".")
+    elif s.count(".") > 1:
+        # Same reasoning for repeated dots ("1.234.567").
+        s = s.replace(".", "")
     return float(s)
+def load_json(path: str) -> dict:
+    """Read the JSON document at *path* and return the parsed object.
+
+    The file is decoded as UTF-8 (the JSON interchange encoding) instead of the
+    platform's locale encoding, so non-ASCII content loads the same everywhere.
+    """
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+def load_joblib_if_exists(candidates: List[str]):
     """
+    Load the first usable joblib/pickle artifact (imputer/scaler) among
+    *candidates*, looked up by file name in the current working directory.
+
+    Returns (obj, path_str or None, error_str or None):
+      - (obj, path, None) for the first candidate that exists and loads;
+      - (None, path, error) for the first candidate that failed, when files
+        exist but none of them could be loaded;
+      - (None, None, None) when no candidate file exists.
     """
+    first_failure = None
+    for name in candidates:
+        p = os.path.join(os.getcwd(), name)
+        if not os.path.isfile(p):
+            continue
+        try:
+            # Import inside to avoid hard dependency if not used
+            import joblib  # type: ignore
+            # NOTE: joblib/pickle can execute code while loading; only
+            # trusted artifact files should be placed next to the app.
+            with open(p, "rb") as fh:
+                obj = joblib.load(fh)
+        except Exception as e:
+            # Keep looking: a broken "imputer.joblib" should not hide a
+            # valid "imputer.pkl" later in the list.
+            if first_failure is None:
+                first_failure = (None, p, f"{type(e).__name__}({e})")
+            continue
+        return obj, p, None
+    return first_failure if first_failure is not None else (None, None, None)
+# --------- model / artifacts load ---------
 print("Loading model / imputer / scaler...")
+# Model
+model = tf.keras.models.load_model(MODEL_PATH, compile=False)
+# Imputer
+imputer, imputer_path, imputer_err = load_joblib_if_exists(IMPUTER_CANDIDATES)
+if imputer_path and imputer_err:
+    print(f"⚠️  Failed to load imputer from {imputer_path}: {imputer_err}")
+# `is not None` matches the check the preprocessing functions use, so the log
+# can never say "not found" for an artifact that is then applied anyway.
+elif imputer is not None:
+    print(f"Loaded imputer from {imputer_path}")
+else:
+    print("⚠️ No imputer found — skipping median imputation.")
+# Scaler
+scaler, scaler_path, scaler_err = load_joblib_if_exists(SCALER_CANDIDATES)
+if scaler_path and scaler_err:
+    print(f"⚠️  Failed to load scaler from {scaler_path}: {scaler_err}")
+elif scaler is not None:
+    print(f"Loaded scaler from {scaler_path}")
+else:
+    print("⚠️ No scaler found — using manual z-scoring if stats are available.")
+# Stats (means/std) for fallback manual z-score
+stats = {}
+if os.path.isfile(STATS_PATH):
+    stats = load_json(STATS_PATH)
+    print(f"Loaded means/std from {STATS_PATH}")
+else:
+    print("⚠️ No means_std.json found — manual z-scoring will be unavailable if scaler missing.")
+# --------- decoding for CORAL vs softmax ---------
+def coral_probs_from_logits(logits_np: np.ndarray) -> np.ndarray:
+    """
+    Decode CORAL ordinal logits of shape (N, K-1) into (N, K) class
+    probabilities.
+
+    The sigmoid of each logit is treated as a cumulative probability; class
+    probabilities are the differences between neighbouring cumulative values
+    (padded with 1 on the left and 0 on the right), clipped to [1e-12, 1].
+    """
+    cumulative = tf.math.sigmoid(tf.convert_to_tensor(logits_np, dtype=tf.float32))
+    edge = cumulative[:, :1]  # (N, 1) slice, used only for its shape/dtype
+    upper = tf.concat([tf.ones_like(edge), cumulative], axis=1)
+    lower = tf.concat([cumulative, tf.zeros_like(edge)], axis=1)
+    return tf.clip_by_value(upper - lower, 1e-12, 1.0).numpy()
+def decode_logits(raw: np.ndarray) -> (np.ndarray, str):
+    """
+    Turn one row of raw model output into class probabilities.
+
+    raw: (1, M) array
+    Returns (probs, mode_str).
+      - M == K-1: CORAL logits, decoded to K probabilities ("auto_coral").
+      - M == K:   softmax head ("auto_softmax"). Output that already is a
+                  probability distribution is returned as-is (renormalised);
+                  anything else is treated as logits and softmaxed.
+      - otherwise: best-effort normalisation of the M values; the result has
+                  M entries, not K ("fallback_M{M}_K{K}").
+    Raises ValueError if raw is not 2-D.
+    """
+    if raw.ndim != 2:
+        raise ValueError(f"Unexpected raw shape {raw.shape}")
+    M = raw.shape[1]
+    K = len(CLASSES)
+    if M == K - 1:
+        # CORAL logits
+        probs = coral_probs_from_logits(raw)[0]
+        return probs, "auto_coral"
+    elif M == K:
+        row = np.asarray(raw[0], dtype=np.float64)
+        total = float(np.sum(row))
+        if np.all(np.isfinite(row)) and np.all(row >= 0.0) and abs(total - 1.0) <= 1e-3:
+            # The model already ends in a softmax layer. Applying softmax
+            # again would flatten the distribution (e.g. 0.9 -> ~0.37 for
+            # K=5), so only renormalise away float rounding.
+            probs = row / total
+        else:
+            # Unnormalised scores: numerically stable softmax.
+            exps = np.exp(row - np.max(row))
+            probs = exps / np.sum(exps)
+        return probs, "auto_softmax"
+    else:
+        # Fallback: normalize across whatever is there
+        row = raw[0]
+        s = float(np.sum(np.abs(row)))
+        probs = (row / s) if s > 0 else np.ones_like(row) / len(row)
+        return probs, f"fallback_M{M}_K{K}"
+# --------- preprocessing pipeline ---------
+def build_raw_vector(payload: Dict[str, Any]) -> np.ndarray:
     """
+    Assemble the model input row from *payload*, one slot per entry of
+    FEATURES and in that same order.
+    A feature that is absent, or whose value cannot be coerced to float,
+    becomes np.nan so the imputation step can fill it.
     """
+    def _value_for(name: str) -> float:
+        if name not in payload:
+            return np.nan
+        try:
+            return coerce_float(payload[name])
+        except Exception:
+            return np.nan
+    return np.array([_value_for(name) for name in FEATURES], dtype=np.float32)
+def apply_imputer_if_any(x: np.ndarray) -> np.ndarray:
+    """
+    Fill the NaN slots of a raw feature vector.
+    Uses the loaded imputer when there is one; otherwise each NaN is replaced
+    by that feature's "mean" from stats, or 0.0 when stats has none.
+    """
+    if imputer is None:
+        filled = x.copy()
+        for idx, name in enumerate(FEATURES):
+            if not np.isnan(filled[idx]):
+                continue
+            has_mean = name in stats and "mean" in stats[name]
+            filled[idx] = float(stats[name]["mean"]) if has_mean else 0.0
+        return filled
+    # imputer expects 2D
+    batch = x.reshape(1, -1)
+    return imputer.transform(batch).astype(np.float32)[0]
+def apply_scaling_or_stats(raw_vec: np.ndarray) -> (np.ndarray, Dict[str, float], str):
+    """
+    Standardise an imputed feature vector.
+
+    Returns (z_vec, z_detail_dict, mode_str)
+    - If scaler present: scaler.transform ("sklearn_scaler")
+    - Else: manual (x-mean)/std using stats ("manual_stats"); a feature with
+      no entry in stats falls back to mean 0 / std 1, and a zero std is
+      treated as 1 to avoid dividing by zero.
+
+    Raises RuntimeError when there is neither a scaler nor any stats: the
+    raw values would otherwise be passed to the model unscaled and scored
+    as if they were z-scores.
+    """
     if scaler is not None:
+        z = scaler.transform(raw_vec.reshape(1, -1)).astype(np.float32)[0]
+        z_detail = {f: float(z[i]) for i, f in enumerate(FEATURES)}
+        return z, z_detail, "sklearn_scaler"
+    if not stats:
+        raise RuntimeError("No scaler and no means_std.json — cannot standardize.")
+    z = np.zeros_like(raw_vec, dtype=np.float32)
+    z_detail: Dict[str, float] = {}
+    for i, f in enumerate(FEATURES):
+        entry = stats.get(f, {})
+        mean = entry.get("mean", 0.0)
+        sd = entry.get("std", 1.0)
+        if not sd:
+            sd = 1.0
+        z[i] = (raw_vec[i] - mean) / sd
+        z_detail[f] = float(z[i])
+    return z, z_detail, "manual_stats"
+# ----------------- FastAPI -----------------
+app = FastAPI(title="Static Fingerprint API", version="1.1.0")
+# CORS is fully open: every origin, method and header is accepted, with
+# credentials disabled.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+@app.get("/")
+def root():
+    """Landing response: a status message plus the endpoints worth trying."""
+    endpoints = ["GET /health", "POST /predict", "POST /debug/z"]
+    return {"message": "Static Fingerprint API is running.", "try": endpoints}
+@app.get("/health")
+def health():
+    """
+    Report service status and which artifacts are loaded.
+
+    "features_in_means_std" lists the keys of the stats the service actually
+    loaded at startup, so it always agrees with "stats_available" and with
+    what /predict uses (no per-request re-read of the stats file).
+    """
+    stats_keys = list(stats.keys()) if isinstance(stats, dict) else []
     return {
+        "status": "ok",
+        "classes": CLASSES,
+        "features_training_order": FEATURES,
+        "features_in_means_std": stats_keys,
+        "model_file": MODEL_PATH,
+        "imputer": bool(imputer),
+        "scaler": bool(scaler),
+        "stats_available": bool(stats),
     }
+@app.post("/debug/z")
+async def debug_z(req: Request):
+    """
+    Run only the preprocessing steps on the posted features and return, per
+    feature: the parsed input, the imputed value, the z-score and the
+    mean/std from stats.
+
+    Responses: 400 when the body is not valid JSON or not a JSON object; 500
+    for any processing error, with the traceback written to the server log
+    instead of being returned to the client.
+    """
+    try:
+        try:
+            payload = await req.json()
+        except ValueError:
+            # Malformed JSON is the caller's mistake, not a server fault.
+            return JSONResponse(status_code=400, content={"error": "Expected JSON object"})
+        if not isinstance(payload, dict):
+            return JSONResponse(status_code=400, content={"error": "Expected JSON object"})
+        raw = build_raw_vector(payload)
+        raw_imp = apply_imputer_if_any(raw)
+        z, z_detail, mode = apply_scaling_or_stats(raw_imp)
+        rows = []
+        for i, f in enumerate(FEATURES):
+            rows.append({
+                "feature": f,
+                "input_value": None if np.isnan(raw[i]) else float(raw[i]),
+                "imputed_value": float(raw_imp[i]),
+                "z": float(z[i]),
+                "mean": stats.get(f, {}).get("mean", None),
+                "std":  stats.get(f, {}).get("std",  None),
+            })
+        return {"preprocess_mode": mode, "rows": rows}
+    except Exception as e:
+        # Keep internals (paths, stack frames) out of the HTTP response.
+        traceback.print_exc()
+        return JSONResponse(status_code=500, content={"error": str(e)})
 @app.post("/predict")
 async def predict(req: Request):
+    """
+    Body: JSON object mapping feature -> numeric value (strings with commas/points ok).
+    Missing features are imputed if imputer present; else filled with means (if stats) or 0.
+
+    Responses: 400 when the body is not valid JSON or not a JSON object; 500
+    for any processing error, with the traceback written to the server log
+    instead of being returned to the client.
+    """
+    try:
+        try:
+            payload = await req.json()
+        except ValueError:
+            # Malformed JSON is the caller's mistake, not a server fault.
+            return JSONResponse(status_code=400, content={"error": "Expected JSON object"})
+        if not isinstance(payload, dict):
+            return JSONResponse(status_code=400, content={"error": "Expected JSON object"})
+        # Build in EXACT training order
+        raw = build_raw_vector(payload)            # may contain NaNs
+        raw_imp = apply_imputer_if_any(raw)        # impute
+        z_vec, z_detail, z_mode = apply_scaling_or_stats(raw_imp)  # scale / z-score
+        # Predict
+        X = z_vec.reshape(1, -1).astype(np.float32)
+        raw_logits = model.predict(X, verbose=0)
+        probs, mode = decode_logits(raw_logits)
+        # Package response
+        pred_idx = int(np.argmax(probs))
+        probs_dict = {CLASSES[i]: float(probs[i]) for i in range(len(CLASSES))}
+        # Unparseable values were turned into NaN too, so they are listed here as well.
+        missing = [f for i, f in enumerate(FEATURES) if np.isnan(raw[i])]
+        return {
+            "input_ok": (len(missing) == 0),
+            "missing": missing,
+            "preprocess": {
+                "imputer": bool(imputer),
+                "scaler": bool(scaler),
+                "z_mode": z_mode,
+            },
+            "z_scores": z_detail,  # per feature
+            "probabilities": probs_dict,
+            "predicted_state": CLASSES[pred_idx],
+            "debug": {
+                "raw_shape": list(raw_logits.shape),
+                "decode_mode": mode,
+                "raw_first_row": [float(v) for v in raw_logits[0]],
+            },
+        }
+    except Exception as e:
+        # Keep internals (paths, stack frames) out of the HTTP response.
+        traceback.print_exc()
+        return JSONResponse(status_code=500, content={"error": str(e)})