# EchoML / scripts / build_base_index.py
# Tiffany Degbotse
# query with your model
# 2ae10e0
"""
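Precompute a 'global' reasoning space from a baseline model + dataset.

Originally documented usage:
    python scripts/build_base_index.py \
        --model_path path/to/model.pkl \
        --csv path/to/data.csv \
        --features col1,col2,col3 \
        --target target_col \
        --namespace data/base_indices/recidivism_global \
        --sample 2000

NOTE: the flags above are not parsed by this script. It reads its settings
from the hard-coded module-level constants defined below (Iris demo
defaults). The module also uses package-relative imports
(``from ..core ...``), so it has to be executed as a module inside its
package (``python -m <package>.scripts.build_base_index``) rather than by
file path.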
"""
# Query_Your_Model/scripts/build_base_index.py
import sys, os
#sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import os
import pandas as pd
import numpy as np
from ..core.model_loader import load_model, predict
from ..core.explain import explain_instance
from ..core.storage import ensure_dir, init_matrix_files, append_case
from ..core.utils import case_id_from_vector
# Default configuration for the Iris demo. Paths are relative, so they resolve
# against the current working directory.

# Serialized baseline model handed to load_model().
MODEL_PATH = "Query_Your_Model/model_data/model.pkl"

# CSV file read into the DataFrame the index is built from.
CSV_PATH = "Query_Your_Model/model_data/data.csv"

# Columns selected from the CSV as the model's input features, in order.
FEATURES = [
    "sepal length (cm)",
    "sepal width (cm)",
    "petal length (cm)",
    "petal width (cm)",
]

# Presumably the label column of the CSV; main() does not read it.
TARGET = "target"

# Directory (namespace) the index files are written under.
NAMESPACE = "Query_Your_Model/data/base_indices/iris_global"

# Maximum number of rows to sample from the CSV.
SAMPLE = 100
def main(
    model_path=None,
    csv_path=None,
    features=None,
    namespace=None,
    sample=None,
    case_prefix="iris",
):
    """Build the baseline reasoning index for a model + dataset.

    Reads the CSV, optionally down-samples it, then for every remaining row
    runs the model's prediction and explanation and appends the feature
    vector, SHAP vector and predicted value to the index stored under
    ``namespace``.

    Every argument is optional; ``None`` falls back to the module-level
    default, so calling ``main()`` with no arguments behaves exactly as the
    original hard-coded script did.

    Args:
        model_path: Path passed to ``load_model``. Defaults to ``MODEL_PATH``.
        csv_path: CSV file to read. Defaults to ``CSV_PATH``.
        features: Feature column names, in model input order. Defaults to
            ``FEATURES``.
        namespace: Output directory for the index. Defaults to ``NAMESPACE``.
        sample: Maximum number of rows to keep. A falsy value, or a value
            not smaller than the number of rows, disables sampling.
            Defaults to ``SAMPLE``.
        case_prefix: Prefix handed to ``case_id_from_vector`` when building
            case ids.

    Raises:
        KeyError: If a requested feature column is missing from the CSV
            (raised by pandas on column selection).
    """
    model_path = MODEL_PATH if model_path is None else model_path
    csv_path = CSV_PATH if csv_path is None else csv_path
    # Copy into a list: pandas treats a tuple as a single (MultiIndex) key.
    features = list(FEATURES if features is None else features)
    namespace = NAMESPACE if namespace is None else namespace
    sample = SAMPLE if sample is None else sample

    print("Building reasoning index...")
    df = pd.read_csv(csv_path)
    if sample and sample < len(df):
        # Fixed seed so repeated builds index the same subset of rows.
        df = df.sample(sample, random_state=42)

    model = load_model(model_path)
    ensure_dir(namespace)
    init_matrix_files(namespace, feature_dim=len(features), shap_dim=len(features))

    feature_frame = df[features]
    # Background sample handed to the explainer: at most 100 rows, fixed seed.
    bg = (
        feature_frame.sample(min(100, len(df)), random_state=0)
        .values.astype("float32")
    )

    # Convert the feature matrix once and walk its rows directly, instead of
    # building a pandas Series per row with iterrows().
    rows = feature_frame.values.astype("float32")
    for x in rows:
        # predict() returns a pair; only the first element is used here.
        y_pred, _ = predict(model, x.reshape(1, -1))
        exp = explain_instance(model, x, features, background_X=bg, top_k=8)
        shap_vec = np.array(exp["shap_values"], dtype="float32")
        cid = case_id_from_vector(x, prefix=case_prefix)
        meta = {"y_pred": float(y_pred[0])}
        append_case(namespace, cid, x, shap_vec, meta)

    print(f"Done! Index saved to {namespace}")


if __name__ == "__main__":
    main()