sinxcosx11 commited on
Commit
3961ee7
·
verified ·
1 Parent(s): 40e93de

Automated model and inference script upload v2

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ license: mit
4
+ language: en
5
+ tags:
6
+ - materials science
7
+ - synthesis prediction
8
+ - lightgbm
9
+ - cheminformatics
10
+ datasets: []
11
+ metrics:
12
+ - accuracy
13
+ - f1
14
+ ---
15
+
16
+ # Synthesis Condition Predictor
17
+
18
+ This model predicts optimal temperature bins and atmosphere categories for inorganic material synthesis.
19
+ It was trained on a dataset of text-mined synthesis procedures.
20
+
21
+ **Models Included:**
22
+ * Temperature Bin Prediction (LightGBM)
23
+ * Atmosphere Category Prediction (LightGBM)
24
+
25
+ **Intended Use:**
26
+ To assist researchers in designing synthesis experiments by predicting key process parameters.
27
+ Input a target material, precursors, and basic operational details to get predictions.
28
+
29
+ **How to Use:**
30
+ ```python
31
+ # Ensure your inference script and its dependencies are in the PYTHONPATH
32
+ # Example, if your repo is named 'synthesis_predictor_hf_repo' and it's in your path:
33
+ # from synthesis_predictor_hf_repo.src.inference import predict_synthesis_outcome, load_all_artifacts_once
34
+
35
+ # Or, if running from a cloned repo where 'src' is a subdirectory:
36
+ # from src.inference import predict_synthesis_outcome, load_all_artifacts_once
37
+
38
+ # if not load_all_artifacts_once():
39
+ # print("Failed to load model artifacts.")
40
+ # else:
41
+ # raw_input_example = {
42
+ # 'target_formula_raw': "YBa2Cu3O7",
43
+ # 'precursor_formulas_raw': ["Y2O3", "BaCO3", "CuO"],
44
+ # 'operations_simplified_list': [
45
+ # {'type': 'MixingOperation', 'string': 'Ball milling for 2h', 'conditions': {'duration': [{'value':2, 'unit':'h'}]}},
46
+ # {'type': 'HeatingOperation', 'string': 'Calcined at 920C for 10h in air',
47
+ # 'conditions': {'heating_temperature': [{'value':920}], 'heating_time': [{'value':10}], 'atmosphere':'air'}},
48
+ # {'type': 'HeatingOperation', 'string': 'Sintered at 950C for 20h in O2',
49
+ # 'conditions': {'heating_temperature': [{'value':950}], 'heating_time': [{'value':20}], 'atmosphere':'Oxygen'}}
50
+ # ],
51
+ #             'reactants_simplified': [{'material': 'Y2O3', 'amount': 0.5}, {'material': 'BaCO3', 'amount': 2.0}, {'material': 'CuO', 'amount': 3.0}], # Example, adjust as needed
52
+ #             'products_simplified': [{'material': 'YBa2Cu3O7', 'amount': 1.0}] # Example
53
+ # }
54
+ # predictions = predict_synthesis_outcome(raw_input_example)
55
+ # print(predictions)
56
+ ```
57
+
58
+ **Limitations:**
59
+ * The model's accuracy is around 68-72%.
60
+ * Predictions are based on patterns in the training data and may not generalize to all chemical systems.
61
+ * The feature engineering for process parameters in the inference script relies on the user providing an `operations_simplified_list` that can be parsed by the internal logic. The quality of these inputs directly affects prediction accuracy.
62
+
63
+ **Training Data:**
64
+ The model was trained on a proprietary dataset of text-mined inorganic synthesis procedures.
65
+
66
+
67
+ **Evaluation Results:**
68
+ * Overall accuracy: approximately 68-72% on held-out data (see Limitations above).
models/atmosphere_category_tuned_feature_columns.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b835987564e8176b65df3dc8b01231bbe3d4b070e9b7782e84e1c681ca40a0
3
+ size 42289
models/atmosphere_category_tuned_imputer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78fb389f43864a52c88abdf683efba3b6d2201944c95a61e29d4c7926da7f5df
3
+ size 50367
models/atmosphere_category_tuned_label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28abe2833f4f1f63f8edc5f441003897e5b283ef62c4fc5403988192a716f7f2
3
+ size 546
models/atmosphere_category_tuned_lgbm_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f6ba5d42c8deaadd9c0ffe3c1cb7ad4cd55f17c37fd7c6b32e3c131a1e28cf
3
+ size 10168460
models/atmosphere_category_tuned_scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:060c0cf8bed3af7c2fe4d01f5fb00dc65fff1adfc7f4fae37ce2318dde0bb810
3
+ size 65791
models/df_elements_processed.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239fb692b5bbe23bbadc166427c25be0b139d2037e5266ce72061306d7dfe589
3
+ size 87794
models/temperature_bin_tuned_feature_columns.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acc884465ac46f5693ef039d4aa6b4921f7b8c003e11d8e53c92086471dec81b
3
+ size 42314
models/temperature_bin_tuned_imputer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98c6f43618fc49cce41d4c72402b9d5bbfd549b43cb345f2443cf9dc47468ec9
3
+ size 50223
models/temperature_bin_tuned_label_encoder.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d750eb6f78f9cade3b8cdd74308bbea8da6a3e96484b0e3966b065873e7d608a
3
+ size 576
models/temperature_bin_tuned_lgbm_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd02f53571b21580d42fcf6dcc63584d43c1d0b6273129a7b940a97d3a02d9d8
3
+ size 17173636
models/temperature_bin_tuned_scaler.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9903d5d60c279bcd459b9ed3aaa1e5698660e399aea0b19a23a472aa4f44840
3
+ size 65599
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ lightgbm
6
+ joblib
7
+ pymatgen
8
+ # matminer # Optional, if MAGPIE_FEATURIZER is used directly at inference and not just for labels
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Makes 'src' a package
src/constants.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Shared constants and optional featurizer setup for the synthesis predictor.

from pymatgen.core import Element as PymatgenElement

# All element symbols pymatgen recognizes ("H", "He", ...); used elsewhere in
# the package to validate element tokens extracted from raw formula strings.
KNOWN_ELEMENT_SYMBOLS = {el.symbol for el in PymatgenElement}

# Regex patterns mapping free-text atmosphere mentions to a
# (specific gas label, broad category) pair, plus fallback labels.
ATMOSPHERE_CONFIG = {
    "patterns": [
        (r'\b(air)\b', 'Air', 'Oxidizing'), (r'\b(O\s?2|oxygen)\b', 'O2', 'Oxidizing'),
        (r'\b(Ar|argon)\b', 'Ar', 'Inert'), (r'\b(N\s?2|nitrogen)\b', 'N2', 'Inert'),
        (r'\b(H\s?2/N\s?2|N\s?2/H\s?2|forming\s*gas)\b', 'FormingGas(N2/H2)', 'Reducing'),
        (r'\b(Ar/H\s?2|H\s?2/Ar)\b', 'Ar/H2', 'Reducing'), (r'\b(H\s?2|hydrogen)\b', 'H2', 'Reducing'),
        (r'\b(vacuum)\b', 'Vacuum', 'Vacuum'), (r'\b(He|helium)\b', 'He', 'Inert'),
        (r'\b(CO2|carbon\s*dioxide)\b', 'CO2', 'Neutral/Other'),
        (r'\b(CO|carbon\s*monoxide)\b', 'CO', 'Reducing'), (r'\b(NH3|ammonia)\b', 'NH3', 'Reducing/Other'),
    ], "default_specific": "unknown_atm_specific", "default_category": "Unknown_Atm_Category"
}

# Regex patterns mapping free-text mixing descriptions to a canonical method
# name, plus a fallback label.
MIXING_METHOD_CONFIG = {
    "patterns": [
        (r'\b(ball\s*mill(?:ing)?)\b', 'ball_milling'), (r'\b(grind(?:ing)?|ground|pulverized|milled)\b', 'grinding'),
        (r'\b(solution|wet|homogeni[sz]ation|slurr(y|ies))\b', 'wet_method'),
        (r'\b(solid-state|solid\s*state(\s*reaction)?)\b', 'solid_state_mixing'),
        (r'\b(stir(?:ring)?)\b', 'stirring'), (r'\b(sonica(te|tion|ted))\b', 'sonication'),
        (r'\b(planetary\s*mill(?:ing)?)\b', 'planetary_milling'), (r'\b(attritor\s*mill(?:ing)?)\b', 'attritor_milling'),
        (r'\b(shaker\s*mill(?:ing)?)\b', 'shaker_milling'), (r'\b(mortar\s*(and\s*pestle)?)\b', 'mortar_pestle'),
    ], "default_method": "unknown_mix_method"
}

# Optional matminer MAGPIE featurizer: imported best-effort so the rest of the
# package still works without matminer installed (MAGPIE features are skipped).
MAGPIE_LABELS = []
matminer_available = False
MAGPIE_FEATURIZER = None
try:
    from matminer.featurizers.composition import ElementProperty
    matminer_available = True
    MAGPIE_FEATURIZER = ElementProperty.from_preset("magpie", impute_nan=True)
    MAGPIE_LABELS = [f'magpie_{label.replace(" ", "_")}' for label in MAGPIE_FEATURIZER.feature_labels()]
except ImportError:
    pass
except Exception:
    # Preset construction itself can fail (e.g. missing data files); treat the
    # featurizer as unavailable rather than crash at import time.
    pass
src/feature_engineering_utils.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ from pymatgen.core import Composition, Element as PymatgenElement
5
+ import ast
6
+ import re
7
+ import logging
8
+ from .constants import KNOWN_ELEMENT_SYMBOLS, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
9
+
10
# --- Formula Cleaning and Standardization ---
def clean_formula_string_advanced(formula_str_original):
    """Best-effort cleanup of a raw text-mined formula string.

    Strips annotations commonly attached by text mining (Greek/phase prefixes,
    hydrate suffixes, purity/state parentheticals), prefers a trailing
    parenthesized formula over a preceding name, and rewrites simple "a/b"
    fractions as decimals. Non-string input is returned unchanged; the result
    is a trimmed string (possibly empty).
    """
    if not isinstance(formula_str_original, str): return formula_str_original
    cleaned = formula_str_original.strip()
    # If the string ends in a parenthesized chunk, it may be the actual
    # formula (e.g. "some name (YBa2Cu3O7)"): prefer it when the part before
    # the parens looks like a name or number rather than a formula.
    paren_match = re.search(r'\(([^()]+)\)[^()]*$', cleaned)
    if paren_match:
        potential_formula_in_parens = paren_match.group(1).strip()
        part_before_parens = cleaned[:paren_match.start()].strip()
        if len(potential_formula_in_parens) > 1 and re.search(r"[A-Z]", potential_formula_in_parens) and re.fullmatch(r"[A-Za-z0-9\.\(\)\[\]]+", potential_formula_in_parens):
            if not part_before_parens or " " in part_before_parens or len(part_before_parens) > len(potential_formula_in_parens) + 5 or (part_before_parens.isalpha() and len(part_before_parens)>4) or re.fullmatch(r"\d+(\.\d+)?", part_before_parens) or re.fullmatch(r"\d*N", part_before_parens, re.IGNORECASE):
                cleaned = potential_formula_in_parens
            elif not re.search(r"[A-Za-z]", part_before_parens) and re.search(r"\d", part_before_parens):
                cleaned = potential_formula_in_parens
    # Drop Greek-letter or single-letter phase prefixes ("α-Fe2O3", "h-BN").
    cleaned = re.sub(r"^[αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]-", "", cleaned)
    cleaned = re.sub(r"^[a-zA-Z]-", "", cleaned)
    # Remove hydrate notations such as "·3H2O", "*xH2O", "(H2O)2".
    cleaned = re.sub(r"[·*]\s*\d*(\.\d+)?[nNxX]?\s*H2O", "", cleaned)
    cleaned = re.sub(r"\s*\(\s*H2O\s*\)\s*\d*(\.\d+)?", "", cleaned)
    cleaned = re.sub(r"·\s*H2O", "", cleaned)
    # Remove trailing parentheticals describing purity/state, e.g. "(99.9%)",
    # "(powder)", "(4N)", "(amorphous)".
    cleaned = re.sub(r"\s*\(\s*(?:\d*N|\d+(?:\.\d+)?%?|solution|gas|powder|aq|amorphous|amorph|polytype|phase|\d{1,4})\s*\)\s*$", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"^\s*\(\s*\d+(\.\d+)?\s*\)\s*(?=[A-Z])", "", cleaned)
    def replace_frac(match):
        # Convert "a/b" subscripts to a 4-decimal float; leave the text
        # untouched on parse failure or division by zero.
        try: num = float(match.group(1)); den = float(match.group(2)); return str(round(num / den, 4)) if den != 0 else match.group(0)
        except: return match.group(0)
    cleaned = re.sub(r"(?<=[A-Za-z\d\)])(\d+)\s*/\s*(\d+)", replace_frac, cleaned)
    cleaned = re.sub(r"^(\d+)\s*/\s*(\d+)", replace_frac, cleaned)
    # Strip residual word-like prefixes ("cubic-", "(alpha)-").
    cleaned = re.sub(r"^\s*\(?[a-zA-Z\s]+\)?-", "", cleaned); cleaned = re.sub(r"^[a-zA-Z]+-", "", cleaned)
    cleaned = cleaned.strip(" .,;·*()")
    return cleaned
38
+
39
def is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_identifier):
    """Heuristically decide whether a cleaned string is worth parsing with pymatgen.

    Rejects obvious non-formulas (reaction strings containing '+', '==', '->'
    or ';') and formulas with unresolved stoichiometry variables (x, y, z, δ, n)
    unless every candidate variable letter is itself a known element symbol.
    `entry_identifier` is accepted for debug/log context only; it is not used
    by the current implementation.
    """
    if not isinstance(cleaned_formula_str, str) or not cleaned_formula_str.strip(): return False
    if '+' in cleaned_formula_str or '==' in cleaned_formula_str or '->' in cleaned_formula_str or ';' in cleaned_formula_str: return False
    # Patterns suggesting symbolic (variable) stoichiometry, e.g. "Fe1-xO".
    variable_indicators = [r"[A-Za-z]\d*\s*[-+*]\s*[xyzδδn]", r"[xyzδδn]\s*[-+*]", r"[A-Za-z]\d*\(\s*\d*\s*[-+]\s*[xyzδδn]\s*\)?", r"(?<![A-Za-z])(?:[1-9]\d*|0)?\.\d*[xyzδδn]", r"[xyzδδn]\d+", r"[A-Za-z]\s*[xyzδδn]\s*\d*", r"1-[xyzδδn]",]
    variable_char_pattern = r"(?i)(?<![A-Z])([xyzδδn])(?![a-z])"
    for pattern in variable_indicators:
        if re.search(pattern, cleaned_formula_str, re.IGNORECASE):
            # A variable-looking match only disqualifies the formula if at
            # least one candidate letter is NOT a real element symbol
            # (e.g. a bare "N" may legitimately be nitrogen).
            possible_vars = re.findall(variable_char_pattern, cleaned_formula_str)
            if any(pv.upper() not in KNOWN_ELEMENT_SYMBOLS for pv in possible_vars if len(pv)==1): return False
    return True
49
+
50
def standardize_chemical_formula(raw_formula_str, entry_identifier="Unknown_Entry"):
    """Normalize a raw formula string into a canonical representation.

    Returns one of:
      * a reduced formula string (via pymatgen) when the cleaned input parses
        as a stoichiometric composition of known elements;
      * a dict {'type': 'elements_only', 'elements': set, 'original_cleaned': str}
        when only element symbols could be recovered from the text;
      * None when nothing usable could be extracted.
    """
    if not isinstance(raw_formula_str, str) or not raw_formula_str.strip(): return None
    cleaned_formula_str = clean_formula_string_advanced(raw_formula_str)
    if not cleaned_formula_str: return None
    if is_plausible_formula_for_pymatgen(cleaned_formula_str, f"{entry_identifier} (Original: '{raw_formula_str}', Cleaned: '{cleaned_formula_str}')"):
        try:
            comp_formula_for_pymatgen = cleaned_formula_str.replace(" ", "")
            if not comp_formula_for_pymatgen: return None
            comp = Composition(comp_formula_for_pymatgen)
            # Only accept compositions made entirely of recognized elements.
            if all(el.symbol in KNOWN_ELEMENT_SYMBOLS for el in comp.elements): return comp.get_reduced_formula_and_factor()[0].replace(" ", "")
        except Exception: pass  # fall through to element-symbol salvage below
    # Fallback: salvage whatever known element symbols appear in the string.
    extracted_elements = {el for el in re.findall(r"([A-Z][a-z]?)", cleaned_formula_str) if el in KNOWN_ELEMENT_SYMBOLS}
    if extracted_elements: return {'type': 'elements_only', 'elements': extracted_elements, 'original_cleaned': cleaned_formula_str}
    return None
64
+
65
def get_valence_features(valences_input, entry_identifier="Unknown_Entry"):
    """Summarize a collection of valence states into avg/min/max features.

    `valences_input` may be a list of numbers or that list serialized as a
    string (e.g. "[2, 3]"). Anything unparsable, empty, or non-numeric yields
    a dict of NaNs. `entry_identifier` is accepted for debug context only.
    """
    nan_result = {'avg_valence': np.nan, 'min_valence': np.nan, 'max_valence': np.nan}

    parsed = valences_input
    if isinstance(valences_input, str):
        # Stored lists sometimes arrive serialized as strings.
        try:
            parsed = ast.literal_eval(valences_input)
        except (ValueError, SyntaxError, TypeError):
            parsed = []

    if not (isinstance(parsed, list) and parsed):
        return nan_result

    numeric = [v for v in parsed if isinstance(v, (int, float))]
    if not numeric:
        return nan_result

    return {
        'avg_valence': np.mean(numeric),
        'min_valence': np.min(numeric),
        'max_valence': np.max(numeric),
    }
74
+
75
+
76
def generate_compositional_features(formula_input, df_elements_processed, entry_identifier="Unknown_Formula"):
    """Compute elemental-property features for a standardized formula.

    `formula_input` is the output of standardize_chemical_formula: a reduced
    formula string (amount-weighted statistics are computed), an
    'elements_only' dict (unweighted per-element statistics), or None (all
    defaults returned). `df_elements_processed` is a DataFrame indexed by
    element symbol whose columns include Atomic_Weight, Electronegativity,
    Atomic_Radius, Melting_Point, Density, etc. Always returns a dict with the
    full default key set; keys that could not be computed stay NaN/False/0.
    """
    # Defaults first, so the returned key set is stable regardless of input.
    default_feature_dict = {'is_stoichiometric_formula': False, 'num_elements_in_formula': 0}
    basic_props = ['avg_atomic_weight', 'avg_electronegativity', 'avg_atomic_radius', 'avg_melting_point', 'avg_density', 'avg_specific_heat', 'avg_thermal_conductivity', 'avg_heat_of_fusion', 'sum_atomic_weight', 'range_electronegativity', 'min_electronegativity', 'max_electronegativity', 'var_atomic_radius', 'min_atomic_radius', 'max_atomic_radius', 'avg_valence_of_comp', 'avg_est_valence_electrons']
    unweighted_props = [f'avg_{prop.lower()}_unweighted' for prop in ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']] + [f'min_{prop.lower()}_unweighted' for prop in ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']] + [f'max_{prop.lower()}_unweighted' for prop in ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']] + [f'var_{prop.lower()}_unweighted' for prop in ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']]
    for k in basic_props + unweighted_props: default_feature_dict[k] = np.nan
    if matminer_available and MAGPIE_LABELS:
        for label in MAGPIE_LABELS: default_feature_dict[label] = np.nan

    if formula_input is None: return default_feature_dict.copy()
    features = {}
    if isinstance(formula_input, str):
        # Stoichiometric case: amount-weighted averages over the composition.
        try:
            comp = Composition(formula_input); el_amt_dict = comp.get_el_amt_dict(); total_atoms = sum(el_amt_dict.values())
            if total_atoms == 0: return {**default_feature_dict, 'is_stoichiometric_formula': False}
            features['is_stoichiometric_formula'] = True; features['num_elements_in_formula'] = len(el_amt_dict)
            props_for_avg_mapping = {'avg_atomic_weight': 'Atomic_Weight', 'avg_electronegativity': 'Electronegativity', 'avg_atomic_radius': 'Atomic_Radius', 'avg_melting_point': 'Melting_Point', 'avg_density': 'Density', 'avg_specific_heat': 'Specific_Heat', 'avg_thermal_conductivity': 'Thermal_Conductivity', 'avg_heat_of_fusion': 'Heat_of_Fusion', 'avg_valence_of_comp': 'avg_valence', 'avg_est_valence_electrons': 'valence_electrons_estimated'}
            element_values_for_stats_mapping = {'electronegativity': 'Electronegativity', 'atomic_radius': 'Atomic_Radius'}
            current_props_for_avg = {k: [] for k in props_for_avg_mapping.keys()}; current_element_values_for_stats = {k: [] for k in element_values_for_stats_mapping.keys()}; valid_elements_for_avg_count = {k: 0 for k in props_for_avg_mapping.keys()}
            for el_obj, amt in el_amt_dict.items():
                # get_el_amt_dict keys may be Element objects or plain strings.
                el_symbol_str = el_obj.symbol if isinstance(el_obj, PymatgenElement) else str(el_obj)
                if el_symbol_str not in KNOWN_ELEMENT_SYMBOLS: continue
                if el_symbol_str in df_elements_processed.index:
                    el_props_series = df_elements_processed.loc[el_symbol_str]
                    for feat_key, elem_col_name in props_for_avg_mapping.items():
                        val = el_props_series.get(elem_col_name, np.nan)
                        # Accumulate amount-weighted sums; the denominator is the
                        # total amount of elements for which this property is known.
                        if pd.notna(val): current_props_for_avg[feat_key].append(val * amt); valid_elements_for_avg_count[feat_key] += amt
                    for feat_key, elem_col_name in element_values_for_stats_mapping.items():
                        val = el_props_series.get(elem_col_name, np.nan)
                        # Repeat per rounded atom count so range/min/max/var
                        # reflect stoichiometry.
                        if pd.notna(val): current_element_values_for_stats[feat_key].extend([val] * int(round(amt)))
            for key, val_list in current_props_for_avg.items(): features[key] = np.nansum(val_list) / valid_elements_for_avg_count[key] if valid_elements_for_avg_count[key] > 0 else np.nan
            features['sum_atomic_weight'] = comp.weight
            for key, val_list in current_element_values_for_stats.items():
                clean_val_list = [v for v in val_list if pd.notna(v)]
                if clean_val_list: features[f'range_{key}'] = np.max(clean_val_list) - np.min(clean_val_list); features[f'min_{key}'] = np.min(clean_val_list); features[f'max_{key}'] = np.max(clean_val_list); features[f'var_{key}'] = np.var(clean_val_list)
                else:
                    for stat in ['range_', 'min_', 'max_', 'var_']: features[f'{stat}{key}'] = np.nan
            if matminer_available and MAGPIE_FEATURIZER:
                # MAGPIE features are strictly optional; failures leave NaNs.
                try:
                    magpie_vals = MAGPIE_FEATURIZER.featurize(comp)
                    for i, label in enumerate(MAGPIE_LABELS): features[label] = magpie_vals[i]
                except: pass
        except: features['is_stoichiometric_formula'] = False  # NOTE(review): bare except is deliberate best-effort; unparsable formulas fall back to defaults
    elif isinstance(formula_input, dict) and formula_input.get('type') == 'elements_only':
        # Elements-only case: unweighted statistics across the element set.
        features['is_stoichiometric_formula'] = False
        elements_present = formula_input.get('elements', set())
        valid_elements = [el for el in elements_present if el in df_elements_processed.index]
        features['num_elements_in_formula'] = len(valid_elements)
        if valid_elements:
            element_props_subset = df_elements_processed.loc[valid_elements]
            unweighted_props_to_calc = ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated']
            for prop_col in unweighted_props_to_calc:
                if prop_col in element_props_subset.columns:
                    clean_vals = element_props_subset[prop_col].dropna()
                    if not clean_vals.empty:
                        features[f'avg_{prop_col.lower()}_unweighted'] = clean_vals.mean()
                        features[f'min_{prop_col.lower()}_unweighted'] = clean_vals.min()
                        features[f'max_{prop_col.lower()}_unweighted'] = clean_vals.max()
                        features[f'var_{prop_col.lower()}_unweighted'] = clean_vals.var()

    # Overlay whatever was computed onto the full default key set.
    final_features = default_feature_dict.copy(); final_features.update(features)
    return final_features
src/inference.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import os
6
+ import logging
7
+ from pymatgen.core import Composition
8
+ import re
9
+
10
+ from .constants import KNOWN_ELEMENT_SYMBOLS, ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available
11
+ from .feature_engineering_utils import standardize_chemical_formula, generate_compositional_features
12
+ from .process_feature_utils import generate_process_features_for_input, generate_stoichiometry_features_for_input
13
+
14
# Artifact locations, relative to this file's directory (they are joined with
# script_dir at load time, so the package works from any working directory).
MODEL_DIR = "../models"
PREPROCESSOR_DIR = "../models"
ELEMENTAL_DATA_PATH = os.path.join(MODEL_DIR, "df_elements_processed.pkl")

# Lazily populated caches, filled by load_all_artifacts_once().
ESSENTIAL_OBJECTS = {}
DF_ELEMENTS_PROCESSED_GLOBAL = None
20
+
21
def load_all_artifacts_once():
    """Load elemental data, models and preprocessors into module-level caches.

    Idempotent: returns True immediately if a previous call already succeeded.
    Returns False when the elemental data cannot be read or when any per-model
    artifact set (model, encoder, imputer, scaler, feature columns) fails to
    load.
    """
    # NOTE(review): rebinding matminer_available / MAGPIE_* below only changes
    # this module's globals, not the copies already imported by
    # feature_engineering_utils — confirm the re-init actually takes effect
    # where MAGPIE features are computed.
    global DF_ELEMENTS_PROCESSED_GLOBAL, ESSENTIAL_OBJECTS, matminer_available, MAGPIE_FEATURIZER, MAGPIE_LABELS
    if ESSENTIAL_OBJECTS.get("loaded_successfully"):
        logging.info("Artifacts already loaded.")
        return True

    logging.info("--- Loading Essential Artifacts for Prediction ---")
    script_dir = os.path.dirname(__file__)

    try:
        elemental_data_full_path = os.path.join(script_dir, ELEMENTAL_DATA_PATH)
        DF_ELEMENTS_PROCESSED_GLOBAL = pd.read_pickle(elemental_data_full_path)
        ESSENTIAL_OBJECTS["elemental_data"] = DF_ELEMENTS_PROCESSED_GLOBAL
        logging.info(f"Loaded processed elemental data from {elemental_data_full_path}")
    except Exception as e:
        # NOTE(review): if os.path.join itself raised, elemental_data_full_path
        # would be unbound here — harmless in practice but worth confirming.
        logging.critical(f"CRITICAL: Error loading elemental data from {elemental_data_full_path}: {e}")
        return False

    if not matminer_available: # Attempt to re-init if constants.py didn't catch it
        try:
            from matminer.featurizers.composition import ElementProperty
            MAGPIE_FEATURIZER = ElementProperty.from_preset("magpie", impute_nan=True)
            MAGPIE_LABELS = [f'magpie_{label.replace(" ", "_")}' for label in MAGPIE_FEATURIZER.feature_labels()]
            matminer_available = True
            logging.info("Matminer re-initialized in inference script.")
        except:
            logging.warning("Matminer could not be re-initialized in inference script.")


    # One slot per artifact kind, keyed by model type below.
    ESSENTIAL_OBJECTS["models"] = {}
    ESSENTIAL_OBJECTS["encoders"] = {}
    ESSENTIAL_OBJECTS["imputers"] = {}
    ESSENTIAL_OBJECTS["scalers"] = {}
    ESSENTIAL_OBJECTS["feature_columns"] = {}

    all_loaded_successfully = True
    for model_type_key in ["temperature_bin", "atmosphere_category"]:
        model_artifact_name = f"{model_type_key}_tuned"
        try:
            ESSENTIAL_OBJECTS["models"][model_type_key] = joblib.load(os.path.join(script_dir, MODEL_DIR, f"{model_artifact_name}_lgbm_model.joblib"))
            ESSENTIAL_OBJECTS["encoders"][model_type_key] = joblib.load(os.path.join(script_dir, MODEL_DIR, f"{model_artifact_name}_label_encoder.joblib"))
            ESSENTIAL_OBJECTS["imputers"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_imputer.joblib"))
            ESSENTIAL_OBJECTS["scalers"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_scaler.joblib"))
            ESSENTIAL_OBJECTS["feature_columns"][model_type_key] = joblib.load(os.path.join(script_dir, PREPROCESSOR_DIR, f"{model_artifact_name}_feature_columns.joblib"))
            logging.info(f"Loaded artifacts for {model_artifact_name} model.")
        except Exception as e:
            # A partially loaded model set is unusable; mark it unavailable so
            # predict_synthesis_outcome can skip it cleanly.
            logging.error(f"Error loading one or more artifacts for '{model_artifact_name}': {e}. Predictions for it may fail.")
            ESSENTIAL_OBJECTS["models"][model_type_key] = None
            all_loaded_successfully = False

    ESSENTIAL_OBJECTS["loaded_successfully"] = all_loaded_successfully
    return all_loaded_successfully
73
+
74
def create_feature_vector_for_prediction(raw_synthesis_input, model_target_name):
    """Build a single-row, imputed and scaled feature DataFrame for one model.

    `raw_synthesis_input` is the user-facing dict (target/precursor formulas,
    simplified operations, reactants/products); `model_target_name` selects
    which model's feature column list, imputer and scaler are used
    ("temperature_bin" or "atmosphere_category"). Returns a one-row DataFrame
    aligned to the model's expected columns, or None on failure. Requires
    load_all_artifacts_once() to have run.
    """
    global DF_ELEMENTS_PROCESSED_GLOBAL, ESSENTIAL_OBJECTS

    if DF_ELEMENTS_PROCESSED_GLOBAL is None:
        logging.error("Elemental data not loaded. Call load_all_artifacts_once() first.")
        return None

    expected_feature_cols = ESSENTIAL_OBJECTS["feature_columns"].get(model_target_name)
    if not expected_feature_cols:
        logging.error(f"Feature column list for '{model_target_name}' not found in loaded artifacts.")
        return None

    # Initialize every expected column: flag/count-style columns default to 0,
    # everything else to NaN (later imputed).
    feature_dict = {col: (0 if col.startswith(("ops_", "proc_has_", "elem_block_")) or "is_stoichiometric" in col or "is_elements_only" in col else np.nan) for col in expected_feature_cols}

    # Target Compositional Features
    std_target_output = standardize_chemical_formula(raw_synthesis_input.get('target_formula_raw'), "predict_target")
    target_comp_feats = generate_compositional_features(std_target_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_target_comp")
    for k, v in target_comp_feats.items():
        feature_key = f'target_{k}'
        if feature_key in feature_dict: feature_dict[feature_key] = v

    # Precursor Compositional Features
    precursor_formulas_raw = raw_synthesis_input.get('precursor_formulas_raw', [])
    std_precursors_outputs = [standardize_chemical_formula(p, f"predict_prec_{i}") for i, p in enumerate(precursor_formulas_raw)]
    num_valid_precursors, num_stoich_precursors, num_elements_only_precursors = 0,0,0
    precursor_comp_feats_list = []
    for std_p_output in std_precursors_outputs:
        if std_p_output is not None:
            num_valid_precursors += 1
            if isinstance(std_p_output, str): num_stoich_precursors += 1
            elif isinstance(std_p_output, dict) and std_p_output.get('type') == 'elements_only': num_elements_only_precursors +=1
            precursor_comp_feats_list.append(generate_compositional_features(std_p_output, DF_ELEMENTS_PROCESSED_GLOBAL, "predict_prec_comp"))

    feature_dict['num_valid_precursors'] = num_valid_precursors
    feature_dict['all_prec_are_stoichiometric'] = (num_stoich_precursors == num_valid_precursors) if num_valid_precursors > 0 else False
    feature_dict['any_prec_is_elements_only'] = (num_elements_only_precursors > 0) if num_valid_precursors > 0 else False

    if precursor_comp_feats_list:
        # Aggregate per-precursor features (mean/std/min/max/sum) into the
        # "<agg>_prec_<feature>" columns the model was trained on. A throwaway
        # featurization of "H2O" supplies the canonical numeric key names.
        df_prec_feats = pd.DataFrame(precursor_comp_feats_list)
        numeric_cols_df_prec = df_prec_feats.select_dtypes(include=np.number)
        if not numeric_cols_df_prec.empty:
            temp_sample_df = pd.DataFrame([generate_compositional_features("H2O", DF_ELEMENTS_PROCESSED_GLOBAL)])
            numeric_sample_comp_keys = [k for k in temp_sample_df.columns if pd.api.types.is_numeric_dtype(temp_sample_df[k]) and k not in ['is_stoichiometric_formula']]
            for agg_func_name in ['mean', 'std', 'min', 'max', 'sum']:
                aggregated_vals = getattr(numeric_cols_df_prec, agg_func_name)()
                for feat_name_suffix in numeric_sample_comp_keys:
                    agg_feat_key = f"{agg_func_name}_prec_{feat_name_suffix}"
                    if agg_feat_key in feature_dict and feat_name_suffix in aggregated_vals:
                        feature_dict[agg_feat_key] = aggregated_vals[feat_name_suffix]

    # Process Features
    process_input_ops_list = raw_synthesis_input.get('operations_simplified_list', [])
    # Recover the category/method vocabularies from the one-hot column names.
    all_atm_cats = list(set([col.split('ops_atm_cat_')[-1] for col in expected_feature_cols if col.startswith('ops_atm_cat_')]))
    all_mix_meths = list(set([col.split('ops_mix_meth_')[-1] for col in expected_feature_cols if col.startswith('ops_mix_meth_')]))
    proc_feats_generated = generate_process_features_for_input(process_input_ops_list, all_atm_cats, all_mix_meths)
    for k, v in proc_feats_generated.items():
        if k in feature_dict: feature_dict[k] = v

    # Stoichiometry features
    reactants_simplified = raw_synthesis_input.get('reactants_simplified', [])
    products_simplified = raw_synthesis_input.get('products_simplified', [])
    stoich_feats_generated = generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_chemical_formula)
    for k, v in stoich_feats_generated.items():
        if k in feature_dict: feature_dict[k] = v

    feature_vector_df = pd.DataFrame([feature_dict], columns=expected_feature_cols)

    # Impute and Scale
    imputer = ESSENTIAL_OBJECTS["imputers"].get(model_target_name)
    scaler = ESSENTIAL_OBJECTS["scalers"].get(model_target_name)

    # Transform only the continuous columns; flag/count columns are excluded
    # to mirror the training-time preprocessing.
    numerical_features_for_transform = [col for col in expected_feature_cols if col in feature_vector_df.columns and pd.api.types.is_numeric_dtype(feature_vector_df[col].dtype) and not col.startswith('ops_') and not col.startswith('proc_has_') and not col.startswith('elem_block_') and col not in ['is_stoichiometric_formula', 'all_prec_are_stoichiometric', 'any_prec_is_elements_only', 'num_valid_precursors']]

    if imputer and scaler and numerical_features_for_transform:
        try:
            feature_vector_df[numerical_features_for_transform] = feature_vector_df[numerical_features_for_transform].astype(np.float64)
            feature_vector_df[numerical_features_for_transform] = imputer.transform(feature_vector_df[numerical_features_for_transform])
            feature_vector_df[numerical_features_for_transform] = scaler.transform(feature_vector_df[numerical_features_for_transform])
            logging.info("Feature vector imputed and scaled for prediction.")
        except Exception as e_transform:
            logging.error(f"Error during imputation/scaling for prediction: {e_transform}", exc_info=True)
            return None
    else:
        logging.warning("Imputer, Scaler or numerical features missing for prediction. Proceeding with caution.")
    return feature_vector_df
159
+
160
+
161
def predict_synthesis_outcome(raw_synthesis_input):
    """Run every available model on one synthesis description.

    Lazily loads artifacts on first call. Returns a dict keyed by model type
    ("temperature_bin", "atmosphere_category"); each value is either
    {'predicted_label': ..., 'probabilities': {label: prob}} or an error
    string when feature building or prediction failed. Returns {} when the
    artifacts could not be loaded at all.
    """
    global ESSENTIAL_OBJECTS
    if not ESSENTIAL_OBJECTS.get("loaded_successfully"):
        success = load_all_artifacts_once()
        if not success:
            logging.error("Essential artifacts could not be loaded. Cannot make predictions.")
            return {}

    predictions = {}
    model_types_to_predict = ["temperature_bin", "atmosphere_category"]

    for model_type in model_types_to_predict:
        # Models that failed to load are stored as None and skipped.
        if ESSENTIAL_OBJECTS["models"].get(model_type):
            logging.info(f"\n--- Predicting {model_type} ---")
            feature_vector = create_feature_vector_for_prediction(raw_synthesis_input, model_type)

            if feature_vector is not None:
                model = ESSENTIAL_OBJECTS["models"][model_type]
                encoder = ESSENTIAL_OBJECTS["encoders"][model_type]
                try:
                    pred_encoded = model.predict(feature_vector)
                    pred_proba = model.predict_proba(feature_vector)
                    # Map the encoded class index back to its human-readable label.
                    pred_label = encoder.inverse_transform(pred_encoded)[0]

                    predictions[model_type] = {
                        'predicted_label': pred_label,
                        'probabilities': {str(cls): prob for cls, prob in zip(encoder.classes_, pred_proba[0])}
                    }
                    logging.info(f"Predicted {model_type}: {pred_label}")
                    logging.info(f"Probabilities: {predictions[model_type]['probabilities']}")
                except Exception as e:
                    logging.error(f"Error during {model_type} prediction: {e}", exc_info=True)
                    predictions[model_type] = f"Prediction Error: {e}"
            else:
                logging.error(f"Could not create feature vector for {model_type} model.")
                predictions[model_type] = "Feature vector creation error"
        else:
            logging.warning(f"{model_type} model not available for prediction.")

    return predictions
201
+
202
if __name__ == '__main__':
    # This block is for testing this inference script directly.

    # Ensure artifacts are loaded
    if not load_all_artifacts_once():
        print("Exiting due to failure in loading essential artifacts.")
    else:
        print("\n--- Example Interactive Prediction ---")
        # Example synthesis description in the shape predict_synthesis_outcome
        # expects (see create_feature_vector_for_prediction for the keys read).
        example_input_with_ops_list = {
            'target_formula_raw': "YBa2Cu3O7",
            'precursor_formulas_raw': ["Y2O3", "BaCO3", "CuO"],
            'operations_simplified_list': [
                {'type': 'MixingOperation', 'string': 'Mix precursors by ball milling for 4h', 'conditions': {'duration': [{'value':4, 'unit':'h'}]}},
                {'type': 'HeatingOperation', 'string': 'Calcined at 900C for 12h in air', 'conditions': {'heating_temperature': [{'value':900, 'unit':'C'}], 'heating_time': [{'value':12, 'unit':'h'}], 'atmosphere': 'Air'}},
                # NOTE(review): the free-text string says "24h" but heating_time
                # below is 20 — confirm which value this example intends.
                {'type': 'HeatingOperation', 'string': 'Sintered at 950C for 24h in O2', 'conditions': {'heating_temperature': [{'value':950, 'unit':'C'}], 'heating_time': [{'value':20, 'unit':'h'}], 'atmosphere': 'Oxygen'}}
            ],
            'reactants_simplified': [{'material': 'Y2O3', 'amount': 0.5}, {'material':'BaCO3', 'amount': 2.0}, {'material':'CuO', 'amount': 3.0}],
            'products_simplified': [{'material':'YBa2Cu3O7', 'amount': 1.0}]
        }

        predictions = predict_synthesis_outcome(example_input_with_ops_list)
        print(f"\nFinal Predictions for example input: {predictions}")
224
+
src/process_feature_utils.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from .constants import ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG
6
+
7
def _extract_numerical_value_from_op_condition(condition_entry, target_keys=('value', 'max_value', 'values')):
    """Pull the first usable numeric value out of a text-mined condition entry.

    *condition_entry* may be a bare number, a dict, or a list whose first
    element is a dict or a number — the shapes produced by the synthesis
    text-mining pipeline. For dict-shaped entries, *target_keys* are probed
    in order and the first coercible value wins.

    Returns the value as float, or ``np.nan`` when nothing numeric is found.

    Note: the default for *target_keys* is a tuple (immutable) rather than a
    list, and coercion failures catch only TypeError/ValueError instead of a
    bare ``except:``, so KeyboardInterrupt/SystemExit are no longer swallowed.
    """

    def _coerce(val):
        # List-valued fields hold one measurement per entry; use the first.
        if isinstance(val, list):
            if not val:
                return np.nan
            val = val[0]
        try:
            return float(val)
        except (TypeError, ValueError):
            return np.nan

    def _from_mapping(mapping):
        # Probe the candidate keys in priority order; skip uncoercible hits.
        for key in target_keys:
            val = mapping.get(key)
            if val is None:
                continue
            num = _coerce(val)
            if not np.isnan(num):
                return num
        return np.nan

    if isinstance(condition_entry, list) and condition_entry:
        head = condition_entry[0]
        if isinstance(head, dict):
            return _from_mapping(head)
        if isinstance(head, (int, float, np.number)):
            return _coerce(head)
        return np.nan
    if isinstance(condition_entry, dict):
        return _from_mapping(condition_entry)
    if isinstance(condition_entry, (int, float, np.number)):
        return _coerce(condition_entry)
    return np.nan
34
+
35
def _extract_atmosphere_from_op(op_conditions_dict, op_string, atm_config_local, entry_identifier):
    """Resolve the (specific, category) atmosphere labels for one operation.

    Looks first at the structured 'atmosphere' entry (or free-text 'text'
    fallback) in the operation's conditions, then falls back to regex
    scanning of the raw operation string. Returns the defaults from
    *atm_config_local* when nothing matches. *entry_identifier* is accepted
    for signature parity with the sibling extractors and is not used here.
    """
    defaults = (atm_config_local["default_specific"], atm_config_local["default_category"])
    patterns = atm_config_local["patterns"]

    # Normalize the structured source into a single candidate string.
    candidate = None
    if isinstance(op_conditions_dict, dict):
        raw = op_conditions_dict.get('atmosphere')
        if not raw and isinstance(op_conditions_dict.get('text'), str):
            raw = op_conditions_dict['text']
        if raw:
            if isinstance(raw, list) and raw:
                candidate = str(raw[0])
            elif isinstance(raw, str):
                candidate = raw
            elif isinstance(raw, dict):
                candidate = str(raw.get('gas', raw.get('value', '')))

    if candidate:
        lowered = candidate.lower()
        for pattern_regex, specific, category in patterns:
            if specific.lower() == lowered or re.search(pattern_regex, candidate, re.IGNORECASE):
                return specific, category
        # Slash-separated gas names (e.g. "N2/H2") are treated as a mix.
        if '/' in lowered:
            return candidate, "Mixed"

    if isinstance(op_string, str) and op_string:
        for pattern_regex, specific, category in patterns:
            if re.search(pattern_regex, op_string, re.IGNORECASE):
                return specific, category

    return defaults
59
+
60
def _extract_mixing_method_from_op(op_dict, op_string, mix_config_local, entry_identifier):
    """Classify the mixing method used by one operation.

    The free-text *op_string* is matched against the configured regex
    patterns first. If that yields nothing and the operation's 'type' looks
    mixing-related, the type label itself is scanned — and, when it is not
    one of the generic labels, returned verbatim. Otherwise the configured
    default is returned. *entry_identifier* is unused, kept for signature
    parity with the other extractors.
    """
    op_type = str(op_dict.get('type', '')).lower()
    patterns = mix_config_local["patterns"]

    if isinstance(op_string, str) and op_string:
        for regex, method in patterns:
            if re.search(regex, op_string, re.IGNORECASE):
                return method

    if any(token in op_type for token in ('mix', 'grind', 'mill')):
        for regex, method in patterns:
            if re.search(regex, op_type, re.IGNORECASE):
                return method
        # A specific, non-generic type label is informative on its own.
        if op_type.strip() and op_type not in ("mixing", "liquidgrinding", "solutionmixing", "grinding"):
            return op_type

    return mix_config_local["default_method"]
71
+
72
def _extract_thermal_conditions(conditions_dict, op_string, entry_identifier):
    """Collect heating temperature(s) and duration(s) from an op's conditions.

    Returns a pair of lists ``(temps, durations)``; each holds at most one
    value here, taken from the 'heating_temperature' / 'heating_time'
    entries of *conditions_dict*. *op_string* and *entry_identifier* are
    unused, retained for signature parity with the sibling extractors.
    """
    temperatures, durations = [], []
    if isinstance(conditions_dict, dict):
        # Route each structured condition key into its destination list.
        for source_key, sink in (('heating_temperature', temperatures),
                                 ('heating_time', durations)):
            payload = conditions_dict.get(source_key)
            if not payload:
                continue
            value = _extract_numerical_value_from_op_condition(payload)
            if pd.notna(value):
                sink.append(value)
    return temperatures, durations
84
+
85
def parse_single_operation_detailed_for_input(op_dict_raw, entry_identifier="predict_op"):
    """Turn one raw operation dict into a flat per-operation feature dict.

    Extracts thermal values, atmosphere labels, the mixing method, and a
    set of boolean operation-kind flags derived from the operation's 'type'
    and 'string' fields. Returns ``{}`` when the input is not a dict.
    """
    if not isinstance(op_dict_raw, dict):
        return {}

    kind = str(op_dict_raw.get('type', 'UnknownType')).lower()
    text = str(op_dict_raw.get('string', '')).lower()
    conditions = op_dict_raw.get('conditions', {})

    temps, durations = _extract_thermal_conditions(conditions, text, entry_identifier)
    atm_specific, atm_category = _extract_atmosphere_from_op(
        conditions, text, ATMOSPHERE_CONFIG, entry_identifier)
    mixing_method = _extract_mixing_method_from_op(
        op_dict_raw, text, MIXING_METHOD_CONFIG, entry_identifier)

    grinding_methods = ['grinding', 'ball_milling', 'planetary_milling',
                        'attritor_milling', 'shaker_milling', 'mortar_pestle']
    return {
        'op_temp_C_list': temps,
        'op_duration_h_list': durations,
        'op_atmosphere_specific': atm_specific,
        'op_atmosphere_category': atm_category,
        'op_mixing_method': mixing_method,
        'op_is_heating': any(k in kind for k in ['heat', 'anneal', 'sinter', 'calcination']),
        # Any non-default mixing method implies a mixing step even if the
        # type label does not contain "mix".
        'op_is_mixing': 'mix' in kind or mixing_method != MIXING_METHOD_CONFIG["default_method"],
        'op_is_grinding': (any(k in kind for k in ['grind', 'mill'])
                           or 'pulverize' in text
                           or mixing_method in grinding_methods),
        'op_is_shaping': 'shap' in kind,
        'op_is_drying': 'dry' in kind or 'drying' in kind,
        'op_is_quenching': 'quench' in kind,
        'op_is_annealing': 'anneal' in kind or 'anneal' in text,
        'op_is_sintering': 'sinter' in kind or 'sinter' in text,
        'op_is_calcination': any(k in kind for k in ['calcine', 'calcination']) or 'calcination' in text,
    }
105
+
106
def generate_process_features_for_input(operations_simplified_list, all_possible_atm_categories, all_possible_mix_methods):
    """Aggregate per-operation features into reaction-level process features.

    Walks the simplified operation list, tallies step counts and boolean
    process flags, tracks temperatures/durations across heating steps, and
    one-hot encodes the reaction's atmosphere category and mixing method
    against the supplied vocabularies. Returns a flat feature dict.
    """
    features = {
        'proc_total_heating_duration_h': 0.0, 'proc_max_temperature_C': np.nan,
        'proc_min_temperature_C': np.nan, 'proc_avg_temperature_C': np.nan,
        'proc_primary_heating_temp_C': np.nan,
        'proc_num_total_steps': 0, 'proc_num_heating_steps': 0,
        'proc_num_mixing_steps': 0, 'proc_num_grinding_steps': 0,
        'proc_has_annealing': False, 'proc_has_sintering': False,
        'proc_has_calcination': False, 'proc_has_quenching': False,
        'proc_has_shaping': False, 'proc_has_drying': False,
    }
    # One-hot slots for every known atmosphere category / mixing method.
    for category in all_possible_atm_categories:
        features[f"ops_atm_cat_{category}"] = 0
    for method in all_possible_mix_methods:
        features[f"ops_mix_meth_{method}"] = 0

    ops = operations_simplified_list if isinstance(operations_simplified_list, list) else []
    features['proc_num_total_steps'] = len(ops)

    default_atm = ATMOSPHERE_CONFIG["default_category"]
    default_mix = MIXING_METHOD_CONFIG["default_method"]
    reaction_atm_category = default_atm
    reaction_mix_method = default_mix
    atm_locked = False
    every_temp, heating_steps, named_mix_methods = [], [], []

    for idx, raw_op in enumerate(ops):
        op = parse_single_operation_detailed_for_input(raw_op, f"predict_op_{idx}")
        temp_list = op.get('op_temp_C_list')
        if temp_list:
            every_temp.extend(temp_list)

        if op.get('op_is_heating'):
            features['proc_num_heating_steps'] += 1
            if op.get('op_duration_h_list'):
                features['proc_total_heating_duration_h'] += np.nansum(op['op_duration_h_list'])
            heating_steps.append({
                'temp': np.nanmax(temp_list) if temp_list and len(temp_list) > 0 else np.nan,
                'duration': np.nansum(op.get('op_duration_h_list', [0.0])),
                'atm_category': op.get('op_atmosphere_category'),
                'is_anneal': op.get('op_is_annealing'),
                'is_sinter': op.get('op_is_sintering'),
                'is_calcine': op.get('op_is_calcination'),
            })

        if op.get('op_is_mixing'):
            features['proc_num_mixing_steps'] += 1
            method = op.get('op_mixing_method', default_mix)
            if method != default_mix:
                named_mix_methods.append(method)

        if op.get('op_is_grinding'):
            features['proc_num_grinding_steps'] += 1
        for flag_src, flag_dst in (('op_is_shaping', 'proc_has_shaping'),
                                   ('op_is_sintering', 'proc_has_sintering'),
                                   ('op_is_drying', 'proc_has_drying'),
                                   ('op_is_quenching', 'proc_has_quenching'),
                                   ('op_is_annealing', 'proc_has_annealing'),
                                   ('op_is_calcination', 'proc_has_calcination')):
            if op.get(flag_src):
                features[flag_dst] = True

        # First explicitly-detected atmosphere wins for the whole reaction.
        if not atm_locked and op.get('op_atmosphere_category') != default_atm:
            reaction_atm_category = op['op_atmosphere_category']
            atm_locked = True

    if heating_steps:
        # "Primary" heating step: hottest; ties broken by longest duration.
        primary = max(heating_steps,
                      key=lambda s: (s['temp'] if pd.notna(s['temp']) else -float('inf'),
                                     s['duration']))
        if pd.notna(primary['temp']):
            features['proc_primary_heating_temp_C'] = primary['temp']
        if not atm_locked and primary.get('atm_category') != default_atm:
            reaction_atm_category = primary['atm_category']

    if named_mix_methods:
        reaction_mix_method = named_mix_methods[0]

    atm_col = f"ops_atm_cat_{reaction_atm_category}"
    if atm_col in features:
        features[atm_col] = 1
    mix_col = f"ops_mix_meth_{reaction_mix_method}"
    if mix_col in features:
        features[mix_col] = 1

    if every_temp:
        features['proc_max_temperature_C'] = np.nanmax(every_temp)
        features['proc_min_temperature_C'] = np.nanmin(every_temp)
        features['proc_avg_temperature_C'] = np.nanmean(every_temp)
    # No heating observed (or a zero total) is reported as missing, not 0 h.
    if (features['proc_num_heating_steps'] == 0
            or pd.isna(features['proc_total_heating_duration_h'])
            or features['proc_total_heating_duration_h'] == 0):
        features['proc_total_heating_duration_h'] = np.nan

    return features
175
+
176
def generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_fn_local):
    """Build stoichiometric-coefficient features from simplified reactant/product lists.

    Emits ``reactant{i}_coeff`` for up to 3 reactants and ``product{i}_coeff``
    for up to 2 products (``np.nan`` when absent or non-numeric), plus the
    counts 'num_reactants_in_reaction' / 'num_products_in_reaction'.
    *standardize_fn_local* is accepted for interface compatibility but is
    not used by the coefficient features.

    Fix over the original: the 'amount' value is fetched once and coerced
    defensively — text-mined amounts can be non-numeric strings (e.g. '~1'),
    which previously made ``float(...)`` raise after ``pd.notna`` passed.
    """

    def _coeff(item):
        # Best-effort float coercion of a reactant/product 'amount'.
        if not isinstance(item, dict):
            return np.nan
        try:
            amount = float(item.get('amount'))
        except (TypeError, ValueError):
            return np.nan
        return amount if pd.notna(amount) else np.nan

    max_reactants, max_products = 3, 2
    stoich_features = {f'reactant{i + 1}_coeff': np.nan for i in range(max_reactants)}
    stoich_features.update({f'product{i + 1}_coeff': np.nan for i in range(max_products)})

    reactants = reactants_simplified or []
    products = products_simplified or []
    stoich_features['num_reactants_in_reaction'] = len(reactants)
    stoich_features['num_products_in_reaction'] = len(products)

    for i, item in enumerate(reactants[:max_reactants]):
        stoich_features[f'reactant{i + 1}_coeff'] = _coeff(item)
    for i, item in enumerate(products[:max_products]):
        stoich_features[f'product{i + 1}_coeff'] = _coeff(item)
    return stoich_features