File size: 13,244 Bytes

3961ee7


import pandas as pd
import numpy as np
import joblib
import os
import logging
from pymatgen.core import Composition 
import re 

from .constants import KNOWN_ELEMENT_SYMBOLS, ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
from .feature_engineering_utils import standardize_chemical_formula, generate_compositional_features
from .process_feature_utils import generate_process_features_for_input, generate_stoichiometry_features_for_input

# Artifact directories. They are relative paths that load_all_artifacts_once()
# joins onto this module's own directory (os.path.dirname(__file__)).
MODEL_DIR = "../models" 
PREPROCESSOR_DIR = "../models" 
# Pickle file holding the processed elemental data, located under MODEL_DIR.
ELEMENTAL_DATA_PATH = os.path.join(MODEL_DIR, "df_elements_processed.pkl")

# Module-level cache filled by load_all_artifacts_once(). Keys written there:
# "elemental_data", "models", "encoders", "imputers", "scalers",
# "feature_columns" and the boolean "loaded_successfully".
ESSENTIAL_OBJECTS = {}
# Object returned by pd.read_pickle(ELEMENTAL_DATA_PATH); stays None until the
# elemental data has been loaded successfully.
DF_ELEMENTS_PROCESSED_GLOBAL = None

def load_all_artifacts_once():
    """Load the elemental data and all per-model artifacts into module caches.

    The call is a no-op (returning True) once a previous call has loaded
    everything successfully. Otherwise it:

    1. Unpickles the elemental data from ``ELEMENTAL_DATA_PATH`` (resolved
       against this module's directory) into ``DF_ELEMENTS_PROCESSED_GLOBAL``
       and ``ESSENTIAL_OBJECTS["elemental_data"]``.
    2. If ``matminer_available`` is falsy, tries to build the "magpie"
       ``ElementProperty`` featurizer and its label list.
    3. For "temperature_bin" and "atmosphere_category", loads the model, label
       encoder, imputer, scaler and feature-column list from
       ``<key>_tuned_<suffix>.joblib`` files.

    Returns:
        bool: True when the elemental data and every model artifact loaded.
        False when the elemental data could not be read (nothing else is
        attempted) or when any model artifact failed to load; in the latter
        case the affected model entry is set to None.
    """
    global DF_ELEMENTS_PROCESSED_GLOBAL, ESSENTIAL_OBJECTS, matminer_available, MAGPIE_FEATURIZER, MAGPIE_LABELS
    if ESSENTIAL_OBJECTS.get("loaded_successfully"):
        logging.info("Artifacts already loaded.")
        return True

    logging.info("--- Loading Essential Artifacts for Prediction ---")
    script_dir = os.path.dirname(__file__)

    # Resolve the path before the try block so the error handler below can
    # always reference it.
    elemental_data_full_path = os.path.join(script_dir, ELEMENTAL_DATA_PATH)
    try:
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file; only point ELEMENTAL_DATA_PATH at trusted artifacts.
        DF_ELEMENTS_PROCESSED_GLOBAL = pd.read_pickle(elemental_data_full_path)
        ESSENTIAL_OBJECTS["elemental_data"] = DF_ELEMENTS_PROCESSED_GLOBAL
        logging.info(f"Loaded processed elemental data from {elemental_data_full_path}")
    except Exception as e:
        logging.critical(f"CRITICAL: Error loading elemental data from {elemental_data_full_path}: {e}")
        return False

    if not matminer_available:  # Attempt to re-init if constants.py didn't catch it
        # NOTE(review): rebinding these names only changes this module's
        # globals; other modules that imported them from .constants keep
        # their own bindings - confirm that is sufficient.
        try:
            from matminer.featurizers.composition import ElementProperty
            MAGPIE_FEATURIZER = ElementProperty.from_preset("magpie", impute_nan=True)
            MAGPIE_LABELS = [f'magpie_{label.replace(" ", "_")}' for label in MAGPIE_FEATURIZER.feature_labels()]
            matminer_available = True
            logging.info("Matminer re-initialized in inference script.")
        except Exception:
            # Best-effort: matminer is optional here. Catch Exception (not a
            # bare except) so KeyboardInterrupt/SystemExit still propagate.
            logging.warning("Matminer could not be re-initialized in inference script.")
            logging.debug("Matminer re-initialization failure details", exc_info=True)

    ESSENTIAL_OBJECTS["models"] = {}
    ESSENTIAL_OBJECTS["encoders"] = {}
    ESSENTIAL_OBJECTS["imputers"] = {}
    ESSENTIAL_OBJECTS["scalers"] = {}
    ESSENTIAL_OBJECTS["feature_columns"] = {}

    # (cache key, directory, file-name suffix), loaded in this order.
    artifact_specs = (
        ("models", MODEL_DIR, "lgbm_model"),
        ("encoders", MODEL_DIR, "label_encoder"),
        ("imputers", PREPROCESSOR_DIR, "imputer"),
        ("scalers", PREPROCESSOR_DIR, "scaler"),
        ("feature_columns", PREPROCESSOR_DIR, "feature_columns"),
    )

    all_loaded_successfully = True
    for model_type_key in ["temperature_bin", "atmosphere_category"]:
        model_artifact_name = f"{model_type_key}_tuned"
        try:
            for store_key, directory, suffix in artifact_specs:
                artifact_path = os.path.join(script_dir, directory, f"{model_artifact_name}_{suffix}.joblib")
                ESSENTIAL_OBJECTS[store_key][model_type_key] = joblib.load(artifact_path)
            logging.info(f"Loaded artifacts for {model_artifact_name} model.")
        except Exception as e:
            logging.error(f"Error loading one or more artifacts for '{model_artifact_name}': {e}. Predictions for it may fail.")
            # Mark the model as unavailable so prediction code skips it.
            ESSENTIAL_OBJECTS["models"][model_type_key] = None
            all_loaded_successfully = False

    ESSENTIAL_OBJECTS["loaded_successfully"] = all_loaded_successfully
    return all_loaded_successfully

def create_feature_vector_for_prediction(raw_synthesis_input, model_target_name):
    """Build the single-row feature frame that one model expects.

    Args:
        raw_synthesis_input: Mapping describing one synthesis. The keys read
            here are 'target_formula_raw', 'precursor_formulas_raw',
            'operations_simplified_list', 'reactants_simplified' and
            'products_simplified'; all are looked up with ``.get``.
        model_target_name: Key into the loaded artifact stores
            (``ESSENTIAL_OBJECTS["feature_columns"]``, ``["imputers"]``,
            ``["scalers"]``).

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are exactly the model's
        stored feature columns, in stored order, after imputation and scaling
        of the numeric columns. Returns None when the elemental data has not
        been loaded, when no feature-column list exists for
        ``model_target_name``, or when imputation/scaling raises. If the
        imputer or scaler is missing, the untransformed frame is returned
        and a warning is logged.
    """
    global DF_ELEMENTS_PROCESSED_GLOBAL, ESSENTIAL_OBJECTS

    if DF_ELEMENTS_PROCESSED_GLOBAL is None:
        logging.error("Elemental data not loaded. Call load_all_artifacts_once() first.")
        return None

    expected_feature_cols = ESSENTIAL_OBJECTS.get("feature_columns", {}).get(model_target_name)
    # Explicit None/length checks: `not cols` raises ValueError when the stored
    # column list is a pandas Index or numpy array rather than a plain list.
    if expected_feature_cols is None or len(expected_feature_cols) == 0:
        logging.error(f"Feature column list for '{model_target_name}' not found in loaded artifacts.")
        return None
    expected_feature_cols = list(expected_feature_cols)

    # Columns with these prefixes/markers start at 0; everything else starts as
    # NaN so the imputer can fill whatever is not computed below.
    zero_default_prefixes = ("ops_", "proc_has_", "elem_block_")
    feature_dict = {}
    for col in expected_feature_cols:
        defaults_to_zero = (
            col.startswith(zero_default_prefixes)
            or "is_stoichiometric" in col
            or "is_elements_only" in col
        )
        feature_dict[col] = 0 if defaults_to_zero else np.nan

    # Target Compositional Features
    std_target_output = standardize_chemical_formula(raw_synthesis_input.get('target_formula_raw'), "predict_target")
    target_comp_feats = generate_compositional_features(std_target_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_target_comp")
    for k, v in target_comp_feats.items():
        feature_key = f'target_{k}'
        if feature_key in feature_dict:
            feature_dict[feature_key] = v

    # Precursor Compositional Features
    # `or []` also covers the key being present with an explicit None value.
    precursor_formulas_raw = raw_synthesis_input.get('precursor_formulas_raw') or []
    std_precursors_outputs = [standardize_chemical_formula(p, f"predict_prec_{i}") for i, p in enumerate(precursor_formulas_raw)]
    num_valid_precursors, num_stoich_precursors, num_elements_only_precursors = 0, 0, 0
    precursor_comp_feats_list = []
    for std_p_output in std_precursors_outputs:
        if std_p_output is not None:
            num_valid_precursors += 1
            if isinstance(std_p_output, str):
                num_stoich_precursors += 1
            elif isinstance(std_p_output, dict) and std_p_output.get('type') == 'elements_only':
                num_elements_only_precursors += 1
        # Features are generated even for unparseable precursors (None input).
        precursor_comp_feats_list.append(generate_compositional_features(std_p_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_prec_comp"))

    feature_dict['num_valid_precursors'] = num_valid_precursors
    feature_dict['all_prec_are_stoichiometric'] = (num_stoich_precursors == num_valid_precursors) if num_valid_precursors > 0 else False
    feature_dict['any_prec_is_elements_only'] = (num_elements_only_precursors > 0) if num_valid_precursors > 0 else False

    if precursor_comp_feats_list:
        df_prec_feats = pd.DataFrame(precursor_comp_feats_list)
        numeric_cols_df_prec = df_prec_feats.select_dtypes(include=np.number)
        if not numeric_cols_df_prec.empty:
            # A reference formula is featurized only to discover which
            # compositional feature names are numeric.
            temp_sample_df = pd.DataFrame([generate_compositional_features("H2O", DF_ELEMENTS_PROCESSED_GLOBAL)])
            numeric_sample_comp_keys = [k for k in temp_sample_df.columns if pd.api.types.is_numeric_dtype(temp_sample_df[k]) and k not in ['is_stoichiometric_formula']]
            for agg_func_name in ['mean', 'std', 'min', 'max', 'sum']:
                aggregated_vals = getattr(numeric_cols_df_prec, agg_func_name)()
                for feat_name_suffix in numeric_sample_comp_keys:
                    agg_feat_key = f"{agg_func_name}_prec_{feat_name_suffix}"
                    if agg_feat_key in feature_dict and feat_name_suffix in aggregated_vals:
                        feature_dict[agg_feat_key] = aggregated_vals[feat_name_suffix]

    # Process Features
    process_input_ops_list = raw_synthesis_input.get('operations_simplified_list', [])
    # Category names are recovered from the one-hot column names; sorted so the
    # order handed to the helper does not depend on string hashing.
    all_atm_cats = sorted({col.split('ops_atm_cat_')[-1] for col in expected_feature_cols if col.startswith('ops_atm_cat_')})
    all_mix_meths = sorted({col.split('ops_mix_meth_')[-1] for col in expected_feature_cols if col.startswith('ops_mix_meth_')})
    proc_feats_generated = generate_process_features_for_input(process_input_ops_list, all_atm_cats, all_mix_meths)
    for k, v in proc_feats_generated.items():
        if k in feature_dict:
            feature_dict[k] = v

    # Stoichiometry features
    reactants_simplified = raw_synthesis_input.get('reactants_simplified', [])
    products_simplified = raw_synthesis_input.get('products_simplified', [])
    stoich_feats_generated = generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_chemical_formula)
    for k, v in stoich_feats_generated.items():
        if k in feature_dict:
            feature_dict[k] = v

    # Restricting to expected_feature_cols drops any extra keys set above and
    # fixes the column order.
    feature_vector_df = pd.DataFrame([feature_dict], columns=expected_feature_cols)

    # Impute and Scale
    imputer = ESSENTIAL_OBJECTS.get("imputers", {}).get(model_target_name)
    scaler = ESSENTIAL_OBJECTS.get("scalers", {}).get(model_target_name)

    excluded_prefixes = ('ops_', 'proc_has_', 'elem_block_')
    excluded_names = ('is_stoichiometric_formula', 'all_prec_are_stoichiometric', 'any_prec_is_elements_only', 'num_valid_precursors')
    numerical_features_for_transform = [
        col for col in expected_feature_cols
        if col in feature_vector_df.columns
        and pd.api.types.is_numeric_dtype(feature_vector_df[col].dtype)
        and not col.startswith(excluded_prefixes)
        and col not in excluded_names
    ]

    # Identity checks: an estimator's truthiness is not a reliable "is present"
    # test (objects defining __len__ can be falsy).
    if imputer is not None and scaler is not None and numerical_features_for_transform:
        try:
            feature_vector_df[numerical_features_for_transform] = feature_vector_df[numerical_features_for_transform].astype(np.float64)
            feature_vector_df[numerical_features_for_transform] = imputer.transform(feature_vector_df[numerical_features_for_transform])
            feature_vector_df[numerical_features_for_transform] = scaler.transform(feature_vector_df[numerical_features_for_transform])
            logging.info("Feature vector imputed and scaled for prediction.")
        except Exception as e_transform:
            logging.error(f"Error during imputation/scaling for prediction: {e_transform}", exc_info=True)
            return None
    else:
        logging.warning("Imputer, Scaler or numerical features missing for prediction. Proceeding with caution.")
    return feature_vector_df


def predict_synthesis_outcome(raw_synthesis_input):
    """Run every available model on one synthesis description.

    Artifacts are loaded on demand via load_all_artifacts_once() when the
    cache is not flagged as loaded; if that load reports failure an empty
    dict is returned.

    Args:
        raw_synthesis_input: Input mapping forwarded unchanged to
            create_feature_vector_for_prediction().

    Returns:
        dict keyed by model type ("temperature_bin", "atmosphere_category").
        A successful entry is a dict with 'predicted_label' and
        'probabilities' (class name -> probability). A failed entry is an
        error string. Model types whose model is unavailable are omitted.
    """
    global ESSENTIAL_OBJECTS
    if not ESSENTIAL_OBJECTS.get("loaded_successfully") and not load_all_artifacts_once():
        logging.error("Essential artifacts could not be loaded. Cannot make predictions.")
        return {}

    results = {}
    for target in ("temperature_bin", "atmosphere_category"):
        # Skip targets whose model is missing (or was set to None on load).
        if not ESSENTIAL_OBJECTS["models"].get(target):
            logging.warning(f"{target} model not available for prediction.")
            continue

        logging.info(f"\n--- Predicting {target} ---")
        features = create_feature_vector_for_prediction(raw_synthesis_input, target)
        if features is None:
            logging.error(f"Could not create feature vector for {target} model.")
            results[target] = "Feature vector creation error"
            continue

        estimator = ESSENTIAL_OBJECTS["models"][target]
        label_encoder = ESSENTIAL_OBJECTS["encoders"][target]
        try:
            encoded_prediction = estimator.predict(features)
            class_probabilities = estimator.predict_proba(features)
            # Map the encoded class back to its original label.
            decoded_label = label_encoder.inverse_transform(encoded_prediction)[0]

            probability_by_class = {}
            for class_name, probability in zip(label_encoder.classes_, class_probabilities[0]):
                probability_by_class[str(class_name)] = probability

            results[target] = {
                'predicted_label': decoded_label,
                'probabilities': probability_by_class,
            }
            logging.info(f"Predicted {target}: {decoded_label}")
            logging.info(f"Probabilities: {results[target]['probabilities']}")
        except Exception as err:
            # A failure in one model must not prevent the other from running.
            logging.error(f"Error during {target} prediction: {err}", exc_info=True)
            results[target] = f"Prediction Error: {err}"

    return results

if __name__ == '__main__':
    # Manual smoke test: load the artifacts and run one example prediction.
    # NOTE: this module uses package-relative imports, so it has to be run as
    # a module of its package (python -m <package>.<module>), not as a bare file.

    # Ensure artifacts are loaded
    if not load_all_artifacts_once():
        print("Exiting due to failure in loading essential artifacts.")
    else:
        print("\n--- Example Interactive Prediction ---")
        example_input_with_ops_list = {
            'target_formula_raw': "YBa2Cu3O7",
            'precursor_formulas_raw': ["Y2O3", "BaCO3", "CuO"],
            'operations_simplified_list': [
                {'type': 'MixingOperation', 'string': 'Mix precursors by ball milling for 4h', 'conditions': {'duration': [{'value':4, 'unit':'h'}]}},
                {'type': 'HeatingOperation', 'string': 'Calcined at 900C for 12h in air', 'conditions': {'heating_temperature': [{'value':900, 'unit':'C'}], 'heating_time': [{'value':12, 'unit':'h'}], 'atmosphere': 'Air'}},
                # heating_time matches the "24h" stated in the operation text
                # (the structured value previously said 20).
                {'type': 'HeatingOperation', 'string': 'Sintered at 950C for 24h in O2', 'conditions': {'heating_temperature': [{'value':950, 'unit':'C'}], 'heating_time': [{'value':24, 'unit':'h'}], 'atmosphere': 'Oxygen'}}
            ],
            'reactants_simplified': [{'material': 'Y2O3', 'amount': 0.5}, {'material':'BaCO3', 'amount': 2.0}, {'material':'CuO', 'amount': 3.0}],
            'products_simplified': [{'material':'YBa2Cu3O7', 'amount': 1.0}]
        }

        predictions = predict_synthesis_outcome(example_input_with_ops_list)
        print(f"\nFinal Predictions for example input: {predictions}")