Spaces:

cindyy287
/

fraud_detection_api_1

Runtime error

File size: 5,303 Bytes

c2fb337

import joblib
import json
import os
import numpy as np
import pandas as pd

# BASE_DIR is the parent of the directory that contains this file; model
# artifacts are looked up in its "models" subdirectory.
_THIS_FILE = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(os.path.dirname(_THIS_FILE))
MODEL_DIR = os.path.join(BASE_DIR, "models")
def _load_first_existing(*names):
    """Load the first of *names* that exists inside ``MODEL_DIR``.

    The filenames are checked in the order given; the first path that
    exists is deserialised with ``joblib.load`` and returned.

    Raises:
        FileNotFoundError: if none of the filenames exist in ``MODEL_DIR``.
    """
    candidate_paths = (os.path.join(MODEL_DIR, filename) for filename in names)
    # Lazily stop at the first existing path, preserving priority order.
    found = next((path for path in candidate_paths if os.path.exists(path)), None)
    if found is None:
        raise FileNotFoundError(f"None of {names} found in {MODEL_DIR}")
    return joblib.load(found)


# Load the model and the preprocessor at import time.  For each artifact the
# candidate filenames are tried in priority order: the "enhanced" variant
# first, then the plain name in lower-case and capitalised spelling.
_MODEL_FILENAMES = (
    "ensemble_model_enhanced.joblib",
    "ensemble_model.joblib",
    "Ensemble_model.joblib",
)
_PREPROCESSOR_FILENAMES = (
    "preprocessor_enhanced.joblib",
    "preprocessor.joblib",
    "Preprocessor.joblib",
)

model = _load_first_existing(*_MODEL_FILENAMES)
preprocessor = _load_first_existing(*_PREPROCESSOR_FILENAMES)

# Optional Anscombe config.  Only these two exact spellings of the filename
# are tried (this is not a true case-insensitive match); when neither exists
# the config falls back to an empty dict.
anscombe_path = None
for candidate in ("anscombe.json", "Anscombe.json"):
    p = os.path.join(MODEL_DIR, candidate)
    if os.path.exists(p):
        anscombe_path = p
        break
if anscombe_path:
    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # locale-dependent default encoding.
    with open(anscombe_path, encoding="utf-8") as f:
        anscombe_config = json.load(f)
else:
    anscombe_config = {}


def _categorical_columns(fitted):
    """Return the input columns that *fitted* routes through a one-hot style encoder."""
    cat_cols = set()
    # Only ColumnTransformer-like objects expose ``transformers_``; for any
    # other preprocessor no categorical columns can be detected.
    for _name, trans, cols_in_transformer in getattr(fitted, "transformers_", ()):
        is_encoder = type(trans).__name__ == "OneHotEncoder" or hasattr(trans, "categories_")
        if not is_encoder:
            continue
        if isinstance(cols_in_transformer, str):
            # A bare column name must not be iterated character by character.
            cols_in_transformer = [cols_in_transformer]
        try:
            cat_cols.update(cols_in_transformer)
        except TypeError:
            # Column spec is not an iterable of hashable names (e.g. a slice).
            continue
    return cat_cols


def _coerce_feature(value, is_categorical):
    """Coerce one raw value: ``str`` for categorical columns, else ``float`` (NaN on failure)."""
    if is_categorical:
        return str(value)
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return float("nan")


def predict_fraud(data: dict):
    """Run one record through the module-level ``preprocessor`` and ``model``.

    Args:
        data: The record to score, in one of these shapes:

            * ``{"features": [v0, v1, ...]}`` -- positional values.  When the
              preprocessor exposes ``feature_names_in_`` the values are
              matched to those names in order, extra values are dropped and
              missing trailing values are padded (``""`` for categorical
              columns, NaN for numeric ones).  Otherwise they are passed on
              as a single-row array.
            * a ``dict`` of feature name -> value, turned into a one-row
              DataFrame.
            * any other mapping (its ``values()``) or flat sequence, passed
              on as a single-row array.

    Returns:
        A dict with ``"fraud"`` and ``"fraud_prediction"`` (both the model's
        predicted label for the row as ``int``) and ``"probability"``: the
        largest entry of ``model.predict_proba`` for the row, i.e. the
        probability of the most likely class, which is not necessarily the
        fraud class.

    Raises:
        TypeError: if ``data["features"]`` is a string, bytes, dict or not
            iterable.
        ValueError: if array input is not a flat list of values, has fewer
            values than the preprocessor's ``n_features_in_``, or if
            ``preprocessor.transform`` fails (original error is chained).
    """
    if isinstance(data, dict) and "features" in data:
        features = data["features"]
        if isinstance(features, (str, bytes, dict)) or not hasattr(features, "__iter__"):
            raise TypeError(
                f'"features" must be a list of values, got {type(features).__name__}'
            )
        features = list(features)

        feature_names = getattr(preprocessor, "feature_names_in_", None)
        if feature_names is not None:
            # Preprocessor was fitted on named columns: build a DataFrame.
            cols = list(feature_names)
            cat_cols = _categorical_columns(preprocessor)
            row = features[: len(cols)]
            coerced = [
                _coerce_feature(value, col_name in cat_cols)
                for col_name, value in zip(cols, row)
            ]
            # Pad columns the caller did not supply with neutral defaults.
            coerced.extend(
                "" if col_name in cat_cols else float("nan")
                for col_name in cols[len(row):]
            )
            X = pd.DataFrame([coerced], columns=cols)
        else:
            X = np.array([features])
    elif isinstance(data, dict):
        # Mapping of name -> value: a DataFrame keeps the column names so
        # they can be matched by the preprocessor.
        X = pd.DataFrame([data])
    elif callable(getattr(data, "values", None)):
        # Non-dict mapping: use its values positionally.
        X = np.array([list(data.values())])
    else:
        # Plain sequence of values.
        X = np.array([list(data)])

    # Array input carries no column names, so enforce the expected width
    # here: too few values is an error, extra values are dropped.
    expected = getattr(preprocessor, "n_features_in_", None)
    if expected is not None and isinstance(X, np.ndarray):
        if X.ndim != 2:
            raise ValueError(
                f"Expected a flat sequence of feature values, got an array of shape {X.shape}."
            )
        if X.shape[1] < expected:
            raise ValueError(f"X has {X.shape[1]} features, but preprocessor is expecting {expected} features as input.")
        if X.shape[1] > expected:
            X = X[:, :expected]

    try:
        X_processed = preprocessor.transform(X)
    except Exception as exc:
        # Re-raise with the input's type/columns/head to ease debugging.
        cols = getattr(X, "columns", None)
        try:
            head = X.head().to_dict()
        except Exception:
            # Not a DataFrame (or not dumpable); the summary is optional.
            head = None
        raise ValueError(f"Transform failed: {exc}; X_type={type(X)}; columns={cols}; head={head}") from exc

    prediction = model.predict(X_processed)[0]
    probability = model.predict_proba(X_processed)[0].max()

    return {
        "fraud": int(prediction),
        "fraud_prediction": int(prediction),
        "probability": float(probability),
    }