# cogenticml / lib/result_extractor.py
# kateshdrops's picture
# Upload 23 files
# 6ed031c verified
# Copyright 2023-2024 The SapientML Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import re
import pandas as pd
class ResultExtractor:
    """Static helpers that extract summary information from generated result files.

    The ``files`` argument used by the ``get_*`` methods is a mapping from file
    name to the file's JSON text.  No method reads instance state, so all of
    them are static methods and may be called on the class or on an instance.
    """

    # Keys of the candidate entries read from run_info.json.
    # NOTE(review): exactly three candidates are assumed; a run_info.json with
    # fewer entries raises KeyError -- confirm against the producer of the file.
    _CANDIDATE_KEYS = ("1", "2", "3")

    # Sections of script_code_explainability.json, in the order they are reported.
    _PREPROCESS_SECTIONS = (
        "preprocessing_before_target_separation",
        "preprocessing_after_target_separation",
        "preprocessing_after_train_test_split",
    )

    # Variables searched for in the candidate content by remake_run_info.
    _COLUMN_VARIABLES = ("TEXT_COLUMNS", "CATEGORICAL_COLS", "DATE_COLUMNS")

    @staticmethod
    def _model_name(code_info):
        """Return the third ``:``-separated field of the model's target component name."""
        return code_info["model"]["explanation"]["target_component_name"].split(":")[2]

    @staticmethod
    def _format_score(value):
        """Format a score with 5 decimals, or in scientific notation when ``abs(value) >= 10000``.

        A NaN score fails the ``< 10000`` comparison and is rendered as ``"NAN"``.
        """
        if abs(value) < 10000:
            return "{:.5f}".format(value)
        return "{:.2E}".format(value)

    @staticmethod
    def _extract_columns(content, variable):
        """Return the list text assigned to ``variable`` inside ``content``.

        Args:
            content: Escaped text of a candidate entry (newlines appear as the
                two characters ``\\`` and ``n``).
            variable: Name on the left-hand side of the assignment to look for.

        Returns:
            ``[]`` when no assignment is found; otherwise the un-escaped text of
            the bracketed value.
            NOTE(review): the found value is returned as text, not as a parsed
            list -- confirm that consumers expect this.
        """
        match = re.search(variable + r" = (\[.*?\])\\n", content)
        if match is None:
            return []
        # The captured text is still escaped; wrapping it in quotes and
        # decoding it as a JSON string undoes the escaping.
        return json.loads(f'"{match[1]}"')

    @staticmethod
    def get_preprocess(files):
        """Return the preprocessing entries listed in ``script_code_explainability.json``.

        Args:
            files: Mapping of file name to JSON text; must contain
                ``"script_code_explainability.json"``.

        Returns:
            list: Entries of the before-target-separation, after-target-separation
            and after-train-test-split sections, concatenated in that order.
            Sections missing from the file contribute nothing.
        """
        code_expl = json.loads(files["script_code_explainability.json"])
        preprocess_list = []
        for section in ResultExtractor._PREPROCESS_SECTIONS:
            preprocess_list.extend(code_expl.get(section, []))
        return preprocess_list

    @staticmethod
    def get_candidates(files, adaptation_metric):
        """Build a table of models with their probability and candidate score.

        Args:
            files: Mapping of file name to JSON text.  Must contain
                ``"run_info.json"``, ``".skeleton.json"`` and
                ``"script_code_explainability.json"``; every key ending in
                ``"_script_code_explainability.json"`` is read as a
                per-candidate file whose name starts with the candidate number.
            adaptation_metric: Metric name used in the score column label.

        Returns:
            tuple: ``(model_info, score_index)`` where ``model_info`` is a
            DataFrame with the columns ``candidate_number``, ``model``,
            ``probability`` and ``score (<adaptation_metric>)`` ordered by
            descending probability, and ``score_index`` is the index of the
            rows whose model equals the one named in
            ``script_code_explainability.json``.
        """
        score_label = f"score ({adaptation_metric})"
        raw_score_label = f"{score_label}_float"

        # Candidate number (file-name prefix) -> model name.
        candidate_models = {}
        for name, content in files.items():
            if not name.endswith("_script_code_explainability.json"):
                continue
            candidate_number = name.split("/")[-1].split("_")[0]
            candidate_models[candidate_number] = ResultExtractor._model_name(json.loads(content))

        # Candidate number -> validation score, taken from run_info.
        run_info = json.loads(files["run_info.json"])
        candidate_scores = {key: run_info[key]["run_info"]["score"] for key in ResultExtractor._CANDIDATE_KEYS}

        model_df = pd.DataFrame.from_records(
            list(candidate_models.items()), columns=["candidate_number", "model"]
        )
        score_df = pd.DataFrame.from_records(
            list(candidate_scores.items()), columns=["candidate_number", raw_score_label]
        )

        # Model name -> probability, from the "MODEL:" entries of the skeleton.
        skeleton_info = json.loads(files[".skeleton.json"])
        probabilities = {k.split(":")[2]: v for k, v in skeleton_info.items() if k.startswith("MODEL:")}
        model_info = pd.DataFrame.from_records(
            list(probabilities.items()), columns=["model", "probability"]
        ).sort_values("probability", ascending=False)
        # NOTE(review): the reason for halving the probability is not visible here.
        model_info["probability"] /= 2

        # Left joins keep every skeleton model; models without a candidate get NaN.
        model_info = pd.merge(model_info, model_df, on="model", how="left")
        model_info = pd.merge(model_info, score_df, on="candidate_number", how="left")

        selected_model = ResultExtractor._model_name(json.loads(files["script_code_explainability.json"]))
        score_index = model_info[model_info["model"] == selected_model].index

        # Add the formatted score before selecting the final columns, and return
        # an explicit copy so no column is set on (or dropped from) a slice.
        model_info[score_label] = model_info[raw_score_label].apply(ResultExtractor._format_score)
        model_info = model_info[["candidate_number", "model", "probability", score_label]].copy()
        return model_info, score_index

    @staticmethod
    def get_columns_info(files):
        """Return the text, categorical and date column entries stored in ``run_info.json``.

        Args:
            files: Mapping of file name to JSON text; must contain ``"run_info.json"``
                with the keys ``TEXT_COLUMNS``, ``CATEGORICAL_COLS`` and ``DATE_COLUMNS``.

        Returns:
            dict: The three values under the keys ``text_columns``,
            ``categorical_columns`` and ``date_columns``.
        """
        run_info = json.loads(files["run_info.json"])
        return {
            "text_columns": run_info["TEXT_COLUMNS"],
            "categorical_columns": run_info["CATEGORICAL_COLS"],
            "date_columns": run_info["DATE_COLUMNS"],
        }

    @staticmethod
    def remake_run_info(script_dir):
        """Rewrite ``<script_dir>/run_info.json`` in a reduced form.

        The rewritten file keeps only the score of candidates ``"1"`` to ``"3"``
        and adds ``TEXT_COLUMNS``, ``CATEGORICAL_COLS`` and ``DATE_COLUMNS``
        extracted from the content of candidate ``"1"``.

        Args:
            script_dir: Directory containing ``run_info.json``; the file is
                overwritten in place.
        """
        run_info_path = os.path.join(script_dir, "run_info.json")
        with open(run_info_path, "r") as f:
            run_info_raw = json.load(f)

        run_info = {
            key: {"run_info": {"score": run_info_raw[key]["run_info"]["score"]}}
            for key in ResultExtractor._CANDIDATE_KEYS
        }

        # To handle both a JSON-serialized Pipeline and a str-dumped Pipeline,
        # check whether `run_info_raw["1"]["content"]` is a `str` or not.
        # For details, refer to https://github.com/F-AutoML/sapientml/pull/562
        run_info_content = run_info_raw["1"]["content"]
        if not isinstance(run_info_content, str):
            run_info_content = json.dumps(run_info_content)

        for variable in ResultExtractor._COLUMN_VARIABLES:
            run_info[variable] = ResultExtractor._extract_columns(run_info_content, variable)

        with open(run_info_path, "w") as f:
            json.dump(run_info, f, indent=4)