File size: 13,019 Bytes
3961ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

import pandas as pd
import numpy as np
import re
from .constants import ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG 

def _coerce_to_float(raw):
    """Return ``float(raw)``, or ``None`` when *raw* cannot be converted."""
    try:
        return float(raw)
    except (TypeError, ValueError, OverflowError):
        return None


def _first_numeric_in_mapping(mapping, target_keys):
    """Return the first convertible number stored under *target_keys*, else ``None``."""
    for key in target_keys:
        val = mapping.get(key)
        if val is None:
            continue
        # A non-empty list contributes only its first element; anything else
        # (including an empty list, which fails conversion) is tried as-is.
        candidate = val[0] if isinstance(val, list) and val else val
        number = _coerce_to_float(candidate)
        if number is not None:
            return number
    return None


def _extract_numerical_value_from_op_condition(condition_entry, target_keys=('value', 'max_value', 'values')):
    """Extract a single float from a loosely structured condition entry.

    Supported shapes of *condition_entry*:

    * a non-empty list whose first element is a dict: the dict is searched
      under *target_keys* (only the first list element is inspected);
    * a non-empty list whose first element is a number: that number is used;
    * a dict: searched under *target_keys*;
    * a bare number (``int``, ``float`` or ``numpy`` scalar).

    When searching a dict, keys are tried in the order given by
    *target_keys*; a key whose value is ``None`` or cannot be converted is
    skipped. If the value is a non-empty list, its first element is used.

    Args:
        condition_entry: The raw condition value to interpret.
        target_keys: Iterable of dict keys to try, in priority order.

    Returns:
        float: The extracted value, or ``numpy.nan`` when nothing usable is
        found (including for unsupported input types such as strings).
    """
    number = None
    if isinstance(condition_entry, list) and condition_entry:
        head = condition_entry[0]
        if isinstance(head, dict):
            number = _first_numeric_in_mapping(head, target_keys)
        elif isinstance(head, (int, float, np.number)):
            number = _coerce_to_float(head)
    elif isinstance(condition_entry, dict):
        number = _first_numeric_in_mapping(condition_entry, target_keys)
    elif isinstance(condition_entry, (int, float, np.number)):
        number = _coerce_to_float(condition_entry)
    return np.nan if number is None else number

def _extract_atmosphere_from_op(op_conditions_dict, op_string, atm_config_local, entry_identifier):
    """Determine the (specific, category) atmosphere labels for one operation.

    Lookup order:

    1. The ``'atmosphere'`` entry of *op_conditions_dict*; if that is falsy,
       the ``'text'`` entry is used instead when it is a string. A list
       contributes its first element, a dict its ``'gas'`` (or else
       ``'value'``) entry. The resulting text is compared against each
       pattern's specific name (case-insensitive equality) and regex.
    2. If no pattern matched but the text contains ``'/'``, the text itself
       is returned as the specific label with category ``"Mixed"``.
    3. Otherwise the free-text *op_string* is searched with the regexes.
    4. Otherwise the configured defaults are returned.

    Args:
        op_conditions_dict: Conditions mapping of the operation; ignored
            unless it is a dict.
        op_string: Free-text description of the operation; ignored unless it
            is a non-empty string.
        atm_config_local: Mapping with ``"default_specific"``,
            ``"default_category"`` and ``"patterns"`` (an iterable of
            ``(regex, specific, category)`` triples).
        entry_identifier: Unused by this function.

    Returns:
        tuple: ``(atmosphere_specific, atmosphere_category)``.
    """
    fallback_specific = atm_config_local["default_specific"]
    fallback_category = atm_config_local["default_category"]

    condition_text = None
    if isinstance(op_conditions_dict, dict):
        source = op_conditions_dict.get('atmosphere')
        if not source:
            text_field = op_conditions_dict.get('text')
            if isinstance(text_field, str):
                source = text_field
        if source:
            if isinstance(source, list):
                condition_text = str(source[0])
            elif isinstance(source, str):
                condition_text = source
            elif isinstance(source, dict):
                condition_text = str(source.get('gas', source.get('value', '')))

    if condition_text:
        lowered = condition_text.lower()
        for regex, specific, category in atm_config_local["patterns"]:
            if specific.lower() == lowered or re.search(regex, condition_text, re.IGNORECASE):
                return specific, category
        # An unrecognised "a/b" style value is reported verbatim as a mixture.
        if '/' in lowered:
            return condition_text, "Mixed"

    if isinstance(op_string, str) and op_string:
        for regex, specific, category in atm_config_local["patterns"]:
            if re.search(regex, op_string, re.IGNORECASE):
                return specific, category

    return fallback_specific, fallback_category

def _extract_mixing_method_from_op(op_dict, op_string, mix_config_local, entry_identifier):
    """Determine the mixing-method label for one operation.

    The free-text *op_string* is searched first with the configured regexes.
    If nothing matches and the lower-cased operation type contains ``mix``,
    ``grind`` or ``mill``, the type is searched with the same regexes; failing
    that, the type itself is returned unless it is one of the generic names
    ``mixing``, ``liquidgrinding``, ``solutionmixing`` or ``grinding``.

    Args:
        op_dict: Operation mapping; its ``'type'`` entry is read via ``.get``.
        op_string: Free-text description; ignored unless a non-empty string.
        mix_config_local: Mapping with ``"default_method"`` and ``"patterns"``
            (an iterable of ``(regex, method_name)`` pairs).
        entry_identifier: Unused by this function.

    Returns:
        The matched method name, the lower-cased operation type, or the
        configured default method.
    """
    fallback_method = mix_config_local["default_method"]
    type_label = str(op_dict.get('type', '')).lower()
    generic_types = ("mixing", "liquidgrinding", "solutionmixing", "grinding")

    type_is_mixing_like = any(fragment in type_label for fragment in ('mix', 'grind', 'mill'))

    # Texts are scanned in priority order: description first, then the type.
    texts_to_scan = []
    if isinstance(op_string, str) and op_string:
        texts_to_scan.append(op_string)
    if type_is_mixing_like:
        texts_to_scan.append(type_label)

    for text in texts_to_scan:
        for regex, method_name in mix_config_local["patterns"]:
            if re.search(regex, text, re.IGNORECASE):
                return method_name

    if type_is_mixing_like and type_label.strip() and type_label not in generic_types:
        return type_label
    return fallback_method

def _extract_thermal_conditions(conditions_dict, op_string, entry_identifier):
    """Collect heating temperature and heating time values from a conditions dict.

    The ``'heating_temperature'`` and ``'heating_time'`` entries are each
    passed through ``_extract_numerical_value_from_op_condition``; a result is
    kept only when it is not NaN. Values are stored as extracted, with no unit
    conversion performed here.

    Args:
        conditions_dict: Conditions mapping; anything that is not a dict
            yields two empty lists.
        op_string: Unused by this function.
        entry_identifier: Unused by this function.

    Returns:
        tuple: ``(temperatures, durations)``, each a list holding at most one
        float.
    """
    temperatures = []
    durations = []
    if not isinstance(conditions_dict, dict):
        return temperatures, durations

    for field_name, collected in (('heating_temperature', temperatures), ('heating_time', durations)):
        raw_entry = conditions_dict.get(field_name)
        if not raw_entry:
            # Missing or falsy entries (None, empty list, 0, ...) are skipped.
            continue
        parsed = _extract_numerical_value_from_op_condition(raw_entry)
        if pd.notna(parsed):
            collected.append(parsed)
    return temperatures, durations

def parse_single_operation_detailed_for_input(op_dict_raw, entry_identifier="predict_op"):
    """Turn one raw operation dict into a dict of per-operation features.

    The operation's ``'type'`` and ``'string'`` entries are lower-cased and
    inspected for keyword substrings; its ``'conditions'`` entry is passed to
    the thermal and atmosphere extractors.

    Args:
        op_dict_raw: Raw operation mapping. Anything that is not a dict
            yields an empty result.
        entry_identifier: Label forwarded to the extractor helpers.

    Returns:
        dict: Empty for non-dict input. Otherwise contains
        ``op_temp_C_list``, ``op_duration_h_list``,
        ``op_atmosphere_specific``, ``op_atmosphere_category``,
        ``op_mixing_method`` and the boolean ``op_is_*`` flags (heating,
        mixing, grinding, shaping, drying, quenching, annealing, sintering,
        calcination).
    """
    if not isinstance(op_dict_raw, dict):
        return {}

    type_text = str(op_dict_raw.get('type', 'UnknownType')).lower()
    description_text = str(op_dict_raw.get('string', '')).lower()
    conditions = op_dict_raw.get('conditions', {})

    temperatures, durations = _extract_thermal_conditions(conditions, description_text, entry_identifier)
    atm_specific, atm_category = _extract_atmosphere_from_op(
        conditions, description_text, ATMOSPHERE_CONFIG, entry_identifier
    )
    mixing_method = _extract_mixing_method_from_op(
        op_dict_raw, description_text, MIXING_METHOD_CONFIG, entry_identifier
    )

    def type_mentions(*fragments):
        """Return True when any fragment occurs in the lower-cased type."""
        return any(fragment in type_text for fragment in fragments)

    grinding_methods = (
        'grinding', 'ball_milling', 'planetary_milling',
        'attritor_milling', 'shaker_milling', 'mortar_pestle',
    )

    return {
        'op_temp_C_list': temperatures,
        'op_duration_h_list': durations,
        'op_atmosphere_specific': atm_specific,
        'op_atmosphere_category': atm_category,
        'op_mixing_method': mixing_method,
        'op_is_heating': type_mentions('heat', 'anneal', 'sinter', 'calcination'),
        'op_is_mixing': type_mentions('mix') or mixing_method != MIXING_METHOD_CONFIG["default_method"],
        'op_is_grinding': (
            type_mentions('grind', 'mill')
            or 'pulverize' in description_text
            or mixing_method in grinding_methods
        ),
        'op_is_shaping': type_mentions('shap'),
        # 'dry' already covers 'drying' as a substring.
        'op_is_drying': type_mentions('dry'),
        'op_is_quenching': type_mentions('quench'),
        'op_is_annealing': type_mentions('anneal') or 'anneal' in description_text,
        'op_is_sintering': type_mentions('sinter') or 'sinter' in description_text,
        'op_is_calcination': type_mentions('calcine', 'calcination') or 'calcination' in description_text,
    }

def generate_process_features_for_input(operations_simplified_list, all_possible_atm_categories, all_possible_mix_methods):
    """Aggregate per-operation features into one flat process-level feature dict.

    Each entry of *operations_simplified_list* is parsed with
    ``parse_single_operation_detailed_for_input`` and folded into step
    counters, boolean ``proc_has_*`` flags, temperature statistics and
    one-hot indicator columns for atmosphere category and mixing method.

    Args:
        operations_simplified_list: List of raw operation dicts. Anything
            that is not a list is treated as an empty list. Entries for which
            the parser returns no features (it returns ``{}`` for non-dict
            entries) are counted in ``proc_num_total_steps`` and otherwise
            ignored, instead of raising ``KeyError``.
        all_possible_atm_categories: Iterable of atmosphere category names;
            one ``ops_atm_cat_<name>`` column (initialised to 0) is created
            per entry.
        all_possible_mix_methods: Iterable of mixing-method names; one
            ``ops_mix_meth_<name>`` column (initialised to 0) is created per
            entry.

    Returns:
        dict: The ``proc_*`` aggregate features plus the one-hot columns.
        The atmosphere column set to 1 corresponds to the first non-default
        category seen in operation order (or the default category); the
        mixing column corresponds to the first non-default method among
        mixing steps (or the default method). A column is only set when it
        exists among the supplied possibilities.
        ``proc_total_heating_duration_h`` is NaN when there is no heating
        step or the summed duration is zero.
    """
    default_atm_category = ATMOSPHERE_CONFIG["default_category"]
    default_mix_method = MIXING_METHOD_CONFIG["default_method"]

    aggregated_ops_features = {
        'proc_total_heating_duration_h': 0.0, 'proc_max_temperature_C': np.nan,
        'proc_min_temperature_C': np.nan, 'proc_avg_temperature_C': np.nan,
        'proc_primary_heating_temp_C': np.nan,
        'proc_num_total_steps': 0, 'proc_num_heating_steps': 0,
        'proc_num_mixing_steps': 0, 'proc_num_grinding_steps': 0,
        'proc_has_annealing': False, 'proc_has_sintering': False,
        'proc_has_calcination': False, 'proc_has_quenching': False,
        'proc_has_shaping': False, 'proc_has_drying': False,
    }
    for cat in all_possible_atm_categories:
        aggregated_ops_features[f"ops_atm_cat_{cat}"] = 0
    for meth in all_possible_mix_methods:
        aggregated_ops_features[f"ops_mix_meth_{meth}"] = 0

    if not isinstance(operations_simplified_list, list):
        operations_simplified_list = []
    aggregated_ops_features['proc_num_total_steps'] = len(operations_simplified_list)

    # Per-operation boolean flag -> process-level "has" flag.
    flag_columns = (
        ('op_is_shaping', 'proc_has_shaping'),
        ('op_is_sintering', 'proc_has_sintering'),
        ('op_is_drying', 'proc_has_drying'),
        ('op_is_quenching', 'proc_has_quenching'),
        ('op_is_annealing', 'proc_has_annealing'),
        ('op_is_calcination', 'proc_has_calcination'),
    )

    all_temps_in_reaction = []
    heating_steps_details_for_reaction = []
    mixing_methods_found_in_reaction = []
    atm_set_for_reaction_flag = False
    parsed_atm_category_for_input = default_atm_category
    parsed_mix_method_for_input = default_mix_method

    for op_idx, op_dict_raw in enumerate(operations_simplified_list):
        op_features = parse_single_operation_detailed_for_input(op_dict_raw, f"predict_op_{op_idx}")
        if not op_features:
            # Unparseable entry: previously this fell through to a direct
            # op_features['op_atmosphere_category'] lookup and raised KeyError.
            continue

        op_temps = op_features.get('op_temp_C_list') or []
        op_durations = op_features.get('op_duration_h_list') or []
        all_temps_in_reaction.extend(op_temps)

        if op_features.get('op_is_heating'):
            aggregated_ops_features['proc_num_heating_steps'] += 1
            if op_durations:
                aggregated_ops_features['proc_total_heating_duration_h'] += np.nansum(op_durations)
            heating_steps_details_for_reaction.append({
                'temp': np.nanmax(op_temps) if op_temps else np.nan,
                'duration': np.nansum(op_durations),
                'atm_category': op_features.get('op_atmosphere_category'),
                'is_anneal': op_features.get('op_is_annealing'),
                'is_sinter': op_features.get('op_is_sintering'),
                'is_calcine': op_features.get('op_is_calcination'),
            })

        if op_features.get('op_is_mixing'):
            aggregated_ops_features['proc_num_mixing_steps'] += 1
            current_mix_method = op_features.get('op_mixing_method', default_mix_method)
            if current_mix_method != default_mix_method:
                mixing_methods_found_in_reaction.append(current_mix_method)

        if op_features.get('op_is_grinding'):
            aggregated_ops_features['proc_num_grinding_steps'] += 1

        for op_key, proc_key in flag_columns:
            if op_features.get(op_key):
                aggregated_ops_features[proc_key] = True

        # The first non-default atmosphere category wins for the whole process.
        op_atm_category = op_features.get('op_atmosphere_category', default_atm_category)
        if not atm_set_for_reaction_flag and op_atm_category != default_atm_category:
            parsed_atm_category_for_input = op_atm_category
            atm_set_for_reaction_flag = True

    if heating_steps_details_for_reaction:
        # Primary heating step: highest temperature, ties broken by duration;
        # steps without a temperature sort last.
        primary_heat_step = max(
            heating_steps_details_for_reaction,
            key=lambda step: (step['temp'] if pd.notna(step['temp']) else -float('inf'), step['duration']),
        )
        if pd.notna(primary_heat_step['temp']):
            aggregated_ops_features['proc_primary_heating_temp_C'] = primary_heat_step['temp']
        # No atmosphere fallback from the primary step is needed: if no
        # operation set a non-default category above, every heating step
        # (a subset of those operations) carries the default category too.

    if mixing_methods_found_in_reaction:
        parsed_mix_method_for_input = mixing_methods_found_in_reaction[0]

    atm_ohe_col = f"ops_atm_cat_{parsed_atm_category_for_input}"
    if atm_ohe_col in aggregated_ops_features:
        aggregated_ops_features[atm_ohe_col] = 1

    mix_ohe_col = f"ops_mix_meth_{parsed_mix_method_for_input}"
    if mix_ohe_col in aggregated_ops_features:
        aggregated_ops_features[mix_ohe_col] = 1

    if all_temps_in_reaction:
        aggregated_ops_features['proc_max_temperature_C'] = np.nanmax(all_temps_in_reaction)
        aggregated_ops_features['proc_min_temperature_C'] = np.nanmin(all_temps_in_reaction)
        aggregated_ops_features['proc_avg_temperature_C'] = np.nanmean(all_temps_in_reaction)

    total_duration = aggregated_ops_features['proc_total_heating_duration_h']
    if aggregated_ops_features['proc_num_heating_steps'] == 0 or pd.isna(total_duration) or total_duration == 0:
        # A zero total means "no duration information", so report it as missing.
        aggregated_ops_features['proc_total_heating_duration_h'] = np.nan

    return aggregated_ops_features

def _coerce_stoich_amount(amount):
    """Return *amount* as a float, or NaN when it is missing or not numeric."""
    try:
        return float(amount)
    except (TypeError, ValueError, OverflowError):
        # None, symbolic amounts such as "1-x", lists, etc. count as missing.
        return np.nan


def generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_fn_local):
    """Build stoichiometric-coefficient features for a reaction.

    The ``'amount'`` entry of up to the first three reactants and the first
    two products is converted to a float. Amounts that are missing or cannot
    be converted (for example a symbolic ``"1-x"``) become NaN rather than
    raising. Items that are not dicts leave their coefficient as NaN.

    Args:
        reactants_simplified: Sequence of reactant dicts, or a falsy value
            for "no reactants".
        products_simplified: Sequence of product dicts, or a falsy value for
            "no products".
        standardize_fn_local: Unused by this function.

    Returns:
        dict: ``reactant{1..3}_coeff``, ``product{1..2}_coeff``,
        ``num_reactants_in_reaction`` and ``num_products_in_reaction``. The
        counts reflect the full input length, not the truncated one.
    """
    max_r, max_p = 3, 2
    stoich_features = {f'reactant{i}_coeff': np.nan for i in range(1, max_r + 1)}
    stoich_features.update({f'product{i}_coeff': np.nan for i in range(1, max_p + 1)})

    species_groups = (
        ('reactant', reactants_simplified, max_r),
        ('product', products_simplified, max_p),
    )
    for label, species, limit in species_groups:
        stoich_features[f'num_{label}s_in_reaction'] = len(species) if species else 0
        if not species:
            continue
        for position, item in enumerate(species[:limit], start=1):
            if isinstance(item, dict):
                stoich_features[f'{label}{position}_coeff'] = _coerce_stoich_amount(item.get('amount'))
    return stoich_features