"""
Prediction path interpretation for decision tree models.
In this moment, it contains "plain english" implementation, but others can be added in the future.
"""
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas

from dtreeviz.colors import adjust_colors
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
def explain_prediction_plain_english(shadow_tree: ShadowDecTree,
                                     x: Union[pandas.core.series.Series, np.ndarray]):
"""
Explains the prediction path using feature value's range.
A possible output for this method could be :
1.5 <= Pclass
3.5 <= Age < 44.5
7.91 <= Fare < 54.25
0.5 <= Sex_label
Cabin_label < 3.5
0.5 <= Embarked_label
Output explanation :
The model chose to make this prediction because instance's Pclass feature value is bigger or equal to 1.5, Age
is between 3.5 and 44.5, Fare is between 7.91 and 54.25, and so on.
:param shadow_tree: tree used to make prediction
:param x: Instance example to make prediction
:return: str
Prediction path explanation in plain english.
"""
node_feature_index = shadow_tree.get_features()
feature_names = shadow_tree.feature_names
node_threshold = shadow_tree.get_thresholds()
decision_node_path = shadow_tree.predict_path(x)
# TODO - refactor this logic and find a way to make it simpler
    # non-categorical splits are accumulated as lower/upper bound thresholds per feature;
    # categorical splits record the set of categories the instance's value falls into
    feature_smaller_values = {}
    feature_bigger_values = {}
    feature_categorical_value = {}

    for i, node in enumerate(decision_node_path):
        if i == len(decision_node_path) - 1:
            break  # stop at leaf node

        node_id = node.id
        feature_name = feature_names[node_feature_index[node_id]]
        feature_value = x[node_feature_index[node_id]]
        if not shadow_tree.is_categorical_split(node_id):
            feature_split_value = round(node_threshold[node_id], 2)
            if feature_split_value <= feature_value:
                # the split threshold sits below the instance's value, so it acts as a lower bound
                feature_smaller_values.setdefault(feature_name, []).append(feature_split_value)
            elif feature_split_value > feature_value:
                # the split threshold sits above the instance's value, so it acts as an upper bound
                feature_bigger_values.setdefault(feature_name, []).append(feature_split_value)
        else:
            if feature_value in node_threshold[node_id][0]:
                feature_categorical_value[feature_name] = node_threshold[node_id][0]
            else:
                feature_categorical_value[feature_name] = node_threshold[node_id][1]
prediction_path_output = ""
for feature_name in feature_names:
feature_range = ""
if feature_name in feature_smaller_values:
feature_range = f"{max(feature_smaller_values[feature_name])} <= {feature_name} "
if feature_name in feature_bigger_values:
if feature_range == "":
feature_range = f"{feature_name} < {min(feature_bigger_values[feature_name])}"
else:
feature_range += f" < {min(feature_bigger_values[feature_name])}"
if feature_range != "":
prediction_path_output += feature_range + "\n"
for feature_name in feature_categorical_value:
prediction_path_output += f"{feature_name} in {feature_categorical_value[feature_name]} \n"
return prediction_path_output
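
# Example usage (illustrative sketch, not executed by this module): assuming a fitted sklearn
# classifier `clf`, training data `X_train`/`y_train`, a feature name list `features`, and a single
# instance `x`, a shadow tree wrapper can be built (here via ShadowDecTree.get_shadow_tree, assuming
# that factory is available in this dtreeviz version) and the plain-english explanation printed:
#
#     shadow = ShadowDecTree.get_shadow_tree(clf, X_train, y_train,
#                                            feature_names=features, target_name="target")
#     print(explain_prediction_plain_english(shadow, x))
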
def explain_prediction_sklearn_default(shadow_tree: ShadowDecTree,
                                       x: Union[pandas.core.series.Series, np.ndarray],
                                       figsize: tuple = (10, 5),
                                       colors: dict = None,
                                       fontsize: int = 14,
                                       fontname: str = "Arial",
                                       grid: bool = False):
"""
Explain prediction calculating features importance using sklearn default algorithm : mean decrease in impurity
(or gini importance) mechanism.
This mechanism can be biased, especially for situations where features vary in their scale of measurement or
their number of categories.
For more details, you can read this article : https://explained.ai/rf-importance/index.html
:param shadow_tree: tree used to make prediction
:param x: Instance example to make prediction
:param figsize: tuple of int, optional
The plot size
:param colors: dict, optional
The set of colors used for plotting
:param fontsize: int, optional
Plot labels fontsize
:param fontname: str, optional
Plot labels font name
:param grid: bool
True if we want to display the grid lines on the visualization
:return:
Prediction feature's importance plot
"""
    decision_node_path = shadow_tree.predict_path(x)
    decision_node_path = [node.id for node in decision_node_path]
    # importance is computed only from the impurity decreases of the nodes on the prediction path
    feature_path_importance = shadow_tree.get_feature_path_importance(decision_node_path)

    return _get_feature_path_importance_sklearn_plot(shadow_tree.feature_names, feature_path_importance, figsize,
                                                     colors, fontsize, fontname, grid)
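
# Example usage (illustrative sketch): with the same assumed `shadow` wrapper and instance `x` as
# in the sketch above, the sklearn-default explainer returns a matplotlib Axes showing per-feature
# importance along the prediction path:
#
#     ax = explain_prediction_sklearn_default(shadow, x, figsize=(10, 5), grid=True)
#     plt.show()
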
def _get_feature_path_importance_sklearn_plot(features, feature_path_importance, figsize, colors, fontsize, fontname,
grid):
colors = adjust_colors(colors)
fig, ax = plt.subplots(figsize=figsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_linewidth(.3)
ax.spines['bottom'].set_linewidth(.3)
ax.set_xticks(range(0, len(features)))
ax.set_xticklabels(features)
barcontainers = ax.bar(range(0, len(features)), feature_path_importance, color=colors["hist_bar"], lw=.3,
align='center',
width=1)
for rect in barcontainers.patches:
rect.set_linewidth(.5)
rect.set_edgecolor(colors['rect_edge'])
ax.set_xlabel("features", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
ax.set_ylabel("feature importance", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
    ax.grid(grid)  # pass positionally: the keyword was renamed from 'b' to 'visible' in newer matplotlib
return ax
def get_prediction_explainer(explanation_type: str):
"""Factory method responsible to return a prediction path implementation based on argument 'explanation_type'
:param explanation_type: specify the type of path explanation to be returned
:return: method implementation for specified path explanation.
"""
if explanation_type == "plain_english":
return explain_prediction_plain_english
elif explanation_type == "sklearn_default":
return explain_prediction_sklearn_default
else:
raise ValueError(f"Explanation type {explanation_type} is not supported yet!")