Spaces:

kateshdrops
/

cogenticml

Sleeping

App Files Files Community

cogenticml / lib /result_extractor.py

kateshdrops

Upload 23 files

6ed031c verified over 1 year ago

raw

history blame contribute delete

5.84 kB

	# Copyright 2023-2024 The SapientML Authors
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import json
	import os
	import re

	import pandas as pd


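	# NOTE: the methods of ResultExtractor take no self/cls argument; call them on the class itself,
	# e.g. ResultExtractor.get_preprocess(files).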
	class ResultExtractor:
	    """Stateless helpers that extract summaries from generated result files.

	    All methods are static: they take no ``self``/``cls`` and can be called
	    either on the class or on an instance.
	    """

	    # Keys under which candidate entries are looked up in ``run_info.json``.
	    _CANDIDATE_KEYS = ("1", "2", "3")

	    # Variable names searched for in the candidate content; the same names are
	    # used as top-level keys of the rewritten ``run_info.json``.
	    _COLUMN_VARIABLES = ("TEXT_COLUMNS", "CATEGORICAL_COLS", "DATE_COLUMNS")

	    @staticmethod
	    def get_preprocess(files: dict) -> list:
	        """Return the preprocessing entries recorded for the final script.

	        Args:
	            files: Mapping of file name to JSON text. Must contain
	                ``"script_code_explainability.json"``.

	        Returns:
	            The entries of ``preprocessing_before_target_separation``,
	            ``preprocessing_after_target_separation`` and
	            ``preprocessing_after_train_test_split`` concatenated in that
	            order. A section that is absent contributes nothing.

	        Raises:
	            KeyError: If ``"script_code_explainability.json"`` is not in ``files``.
	        """
	        code_expl = json.loads(files["script_code_explainability.json"])
	        sections = (
	            "preprocessing_before_target_separation",
	            "preprocessing_after_target_separation",
	            "preprocessing_after_train_test_split",
	        )
	        return [step for section in sections for step in code_expl.get(section, [])]

	    @staticmethod
	    def get_candidates(files: dict, adaptation_metric: str) -> tuple:
	        """Build a table of models with their probability, candidate number and score.

	        Args:
	            files: Mapping of file name to JSON text. Must contain
	                ``"run_info.json"``, ``".skeleton.json"`` and
	                ``"script_code_explainability.json"``; every entry whose name
	                ends with ``"_script_code_explainability.json"`` is treated as
	                one candidate.
	            adaptation_metric: Metric name, used only to label the score column.

	        Returns:
	            A ``(model_info, score_index)`` tuple. ``model_info`` is a DataFrame
	            with the columns ``candidate_number``, ``model``, ``probability``
	            and ``score (<adaptation_metric>)`` (the score formatted as text),
	            one row per ``MODEL:`` entry of the skeleton, highest probability
	            first. ``score_index`` is the index of the rows whose model is the
	            one named in ``"script_code_explainability.json"``.

	        Raises:
	            KeyError: If a required file or JSON field is missing.
	        """
	        score_float_col = f"score ({adaptation_metric})_float"
	        score_col = f"score ({adaptation_metric})"

	        # Candidate number (file-name prefix before the first "_") -> model name.
	        model_by_candidate = {}
	        for name, payload in files.items():
	            if name.endswith("_script_code_explainability.json"):
	                candidate_number = name.split("/")[-1].split("_")[0]
	                model_by_candidate[candidate_number] = ResultExtractor._model_name(json.loads(payload))

	        # Candidate number -> validation score. Candidates that are absent from
	        # run_info are skipped instead of raising, so runs that produced fewer
	        # candidates than expected still yield a table (their score shows as NaN).
	        run_info = json.loads(files["run_info.json"])
	        score_by_candidate = {
	            key: run_info[key]["run_info"]["score"] for key in ResultExtractor._CANDIDATE_KEYS if key in run_info
	        }

	        model_df = pd.DataFrame.from_records(
	            list(model_by_candidate.items()), columns=["candidate_number", "model"]
	        )
	        model_score_df = pd.DataFrame.from_records(
	            list(score_by_candidate.items()), columns=["candidate_number", score_float_col]
	        )

	        # Skeleton keys are colon-separated; only "MODEL:" entries are kept and
	        # the third field is used as the model name.
	        skeleton_info = json.loads(files[".skeleton.json"])
	        probabilities = {k.split(":")[2]: v for k, v in skeleton_info.items() if k.startswith("MODEL:")}
	        model_info = pd.DataFrame.from_records(
	            list(probabilities.items()), columns=["model", "probability"]
	        ).sort_values("probability", ascending=False)
	        # NOTE(review): the skeleton values are halved here; the reason is not
	        # visible in this file -- confirm against the producer of .skeleton.json.
	        model_info["probability"] /= 2

	        # Left merges keep every skeleton model, including those that never
	        # became a candidate (their candidate number and score stay NaN).
	        model_info = pd.merge(model_info, model_df, on="model", how="left")
	        model_info = pd.merge(model_info, model_score_df, on="candidate_number", how="left")

	        script_info = json.loads(files["script_code_explainability.json"])
	        selected_model = ResultExtractor._model_name(script_info)
	        score_index = model_info[model_info["model"] == selected_model].index

	        # Replace the numeric score by its display form and fix the column order.
	        model_info[score_col] = model_info[score_float_col].map(ResultExtractor._format_score)
	        model_info = model_info[["candidate_number", "model", "probability", score_col]]

	        return model_info, score_index

	    @staticmethod
	    def get_columns_info(files: dict) -> dict:
	        """Return the column groups stored at the top level of ``run_info.json``.

	        Args:
	            files: Mapping of file name to JSON text. ``"run_info.json"`` must
	                contain the keys ``TEXT_COLUMNS``, ``CATEGORICAL_COLS`` and
	                ``DATE_COLUMNS`` (these are the keys ``remake_run_info`` writes).

	        Returns:
	            A dict with the keys ``text_columns``, ``categorical_columns`` and
	            ``date_columns`` holding the stored values unchanged.

	        Raises:
	            KeyError: If the file or one of the three keys is missing.
	        """
	        run_info = json.loads(files["run_info.json"])

	        return {
	            "text_columns": run_info["TEXT_COLUMNS"],
	            "categorical_columns": run_info["CATEGORICAL_COLS"],
	            "date_columns": run_info["DATE_COLUMNS"],
	        }

	    @staticmethod
	    def remake_run_info(script_dir: str) -> None:
	        """Rewrite ``<script_dir>/run_info.json`` in place as a slimmed-down summary.

	        The rewritten file keeps only the score of each candidate plus the
	        ``TEXT_COLUMNS``, ``CATEGORICAL_COLS`` and ``DATE_COLUMNS`` values found
	        in the content of candidate ``"1"``.

	        Calling this again on an already rewritten file is safe: candidate
	        ``"1"`` then has no ``"content"``, and the column values already stored
	        in the file are carried over instead of being re-extracted.

	        Args:
	            script_dir: Directory that contains ``run_info.json``.

	        Raises:
	            OSError: If the file cannot be read or written.
	            KeyError: If a present candidate entry lacks ``["run_info"]["score"]``.
	        """
	        run_info_path = os.path.join(script_dir, "run_info.json")
	        with open(run_info_path, "r", encoding="utf-8") as f:
	            run_info_raw = json.load(f)

	        # Keep only the score of each candidate; candidates that are absent
	        # from the file are skipped rather than raising.
	        run_info = {
	            key: {"run_info": {"score": run_info_raw[key]["run_info"]["score"]}}
	            for key in ResultExtractor._CANDIDATE_KEYS
	            if key in run_info_raw
	        }

	        run_info_content = run_info_raw.get("1", {}).get("content")
	        if run_info_content is None:
	            # No content to parse (e.g. the file was already rewritten by a
	            # previous call): preserve whatever column values are stored.
	            for variable_name in ResultExtractor._COLUMN_VARIABLES:
	                run_info[variable_name] = run_info_raw.get(variable_name, [])
	        else:
	            # To handle both a JSON-serialized Pipeline and a str-dumped Pipeline,
	            # check whether `run_info_raw["1"]["content"]` is a `str` or not.
	            # For details, refer to https://github.com/F-AutoML/sapientml/pull/562
	            if not isinstance(run_info_content, str):
	                run_info_content = json.dumps(run_info_content)
	            for variable_name in ResultExtractor._COLUMN_VARIABLES:
	                run_info[variable_name] = ResultExtractor._extract_column_list(run_info_content, variable_name)

	        with open(run_info_path, "w", encoding="utf-8") as f:
	            json.dump(run_info, f, indent=4)

	    @staticmethod
	    def _model_name(code_info: dict) -> str:
	        """Return the third colon-separated field of the model's target component name."""
	        return code_info["model"]["explanation"]["target_component_name"].split(":")[2]

	    @staticmethod
	    def _format_score(value: float) -> str:
	        """Format a score with 5 decimals, or in scientific notation unless ``abs(value) < 10000``."""
	        if abs(value) < 10000:
	            return f"{value:.5f}"
	        # Also reached for NaN (the comparison above is False), which renders as "NAN".
	        return f"{value:.2E}"

	    @staticmethod
	    def _extract_column_list(content: str, variable_name: str):
	        """Find ``<variable_name> = [...]`` in ``content`` and return the bracketed text.

	        The assignment must be followed by an escaped newline (a literal
	        backslash and ``n``), as found in JSON-encoded source text. The matched
	        ``[...]`` text is decoded as the body of a JSON string, which undoes
	        escapes such as ``\\"``.

	        NOTE(review): a match therefore yields a *string* holding the list
	        literal, while "not found" yields an empty *list*; this mirrors the
	        previous behaviour -- confirm what the consumers expect.
	        """
	        match = re.search(re.escape(variable_name) + r" = (\[.*?\])\\n", content)
	        if match is None:
	            return []
	        return json.loads(f'"{match[1]}"')