# Source: Hugging Face Space app.py by Arun-AK (commit 6e513c8, verified).
import pandas as pd
import numpy as np
import warnings
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
import io
import gradio as gr
def get_keys(d, values):
    """Return the keys of mapping *d* whose value appears in *values*.

    Order follows the mapping's insertion order; a key is included once
    per occurrence in *d* (keys are unique, so at most once).
    """
    matched = []
    for key, value in d.items():
        if value in values:
            matched.append(key)
    return matched
# Load benchmark results from the two result artifacts and merge them into
# a single frame; downstream code reads only `fmodel_results`.
_pickle_results = pd.read_pickle('model_results1.pkl')
_csv_results = pd.read_csv('the_model_results.csv')
# The CSV names its dataset column "dataset_name"; align it with the pickle
# before concatenating so both sources share one "dataset" column.
fmodel_results = pd.concat(
    [_pickle_results, _csv_results.rename(columns={"dataset_name": "dataset"})],
    ignore_index=True,
)
# Mapping of category name -> {dataset id (as used in the results frames)
# -> human-readable dataset name}.  The ids ("D1", "D2", ...) match the
# "dataset" column of `fmodel_results`; the names are what the UI shows.
# NOTE(review): some display names repeat within a category (e.g. D9/D25
# "Chronic Kidney Disease", D53/D54 "Breast Cancer Wisconsin"), and D70
# "Labor Relations" looks miscategorized under Medical — verify upstream.
DATASET_CATEGORIES = {
    "Medical & Healthcare": {
        "D1": "Heart Disease (Comprehensive)",
        "D2": "Heart attack possibility",
        "D3": "Heart Disease Dataset",
        "D4": "Liver Disorders",
        "D5": "Diabetes Prediction",
        "D9": "Chronic Kidney Disease",
        "D10": "Breast Cancer Prediction",
        "D11": "Stroke Prediction",
        "D12": "Lung Cancer Prediction",
        "D13": "Hepatitis",
        "D15": "Thyroid Disease",
        "D16": "Heart Failure Prediction",
        "D17": "Parkinson's",
        "D18": "Indian Liver Patient",
        "D19": "COVID-19 Effect on Liver Cancer",
        "D20": "Liver Dataset",
        "D21": "Specht Heart",
        "D22": "Early-stage Diabetes",
        "D23": "Diabetic Retinopathy",
        "D24": "Breast Cancer Coimbra",
        "D25": "Chronic Kidney Disease",
        "D26": "Kidney Stone",
        "D28": "Echocardiogram",
        "D29": "Bladder Cancer Recurrence",
        "D31": "Prostate Cancer",
        "D46": "Real Breast Cancer Data",
        "D47": "Breast Cancer (Royston)",
        "D48": "Lung Cancer Dataset",
        "D52": "Cervical Cancer Risk",
        "D53": "Breast Cancer Wisconsin",
        "D61": "Breast Cancer Prediction",
        "D62": "Thyroid Disease",
        "D68": "Lung Cancer",
        "D69": "Cancer Patients Data",
        "D70": "Labor Relations",
        "D71": "Glioma Grading",
        "D74": "Post-Operative Patient",
        "D80": "Heart Rate Stress Monitoring",
        "D82": "Diabetes 2019",
        "D87": "Personal Heart Disease Indicators",
        "D92": "Heart Disease (Logistic)",
        "D95": "Diabetes Prediction",
        "D97": "Cardiovascular Disease",
        "D98": "Diabetes 130 US Hospitals",
        "D99": "Heart Disease Dataset",
        "D181": "HCV Data",
        "D184": "Cardiotocography",
        "D189": "Mammographic Mass",
        "D199": "Easiest Diabetes",
        "D200": "Monkey-Pox Patients",
        "D54": "Breast Cancer Wisconsin",
        "D63": "Sick-euthyroid",
        "D64": "Ann-test",
        "D65": "Ann-train",
        "D66": "Hypothyroid",
        "D67": "New-thyroid",
        "D72": "Glioma Grading"
    },
    "Gaming & Sports": {
        "D27": "Chess King-Rook",
        "D36": "Tic-Tac-Toe",
        "D40": "IPL 2022 Matches",
        "D41": "League of Legends",
        "D55": "League of Legends Diamond",
        "D56": "Chess Game Dataset",
        "D57": "Game of Thrones",
        "D73": "Connect-4",
        "D75": "FIFA 2018",
        "D76": "Dota 2 Matches",
        "D77": "IPL Match Analysis",
        "D78": "CS:GO Professional",
        "D79": "IPL 2008-2022",
        "D114": "Video Games",
        "D115": "Video Games Sales",
        "D117": "Sacred Games",
        "D118": "PC Games Sales",
        "D119": "Popular Video Games",
        "D120": "Olympic Games 2021",
        "D121": "Video Games ESRB",
        "D122": "Top Play Store Games",
        "D123": "Steam Games",
        "D124": "PS4 Games",
        "D116": "Video Games Sales"
    },
    "Education & Students": {
        "D43": "Student Marks",
        "D44": "Student 2nd Year Result",
        "D45": "Student Mat Pass/Fail",
        "D103": "Academic Performance",
        "D104": "Student Academic Analysis",
        "D105": "Student Dropout Prediction",
        "D106": "Electronic Gadgets Impact",
        "D107": "Campus Recruitment",
        "D108": "End-Semester Performance",
        "D109": "Fitbits and Grades",
        "D110": "Student Time Management",
        "D111": "Student Feedback",
        "D112": "Depression & Performance",
        "D113": "University Rankings",
        "D126": "University Ranking CWUR",
        "D127": "University Ranking CWUR 2013-2014",
        "D128": "University Ranking CWUR 2014-2015",
        "D129": "University Ranking CWUR 2015-2016",
        "D130": "University Ranking CWUR 2016-2017",
        "D131": "University Ranking CWUR 2017-2018",
        "D132": "University Ranking CWUR 2018-2019",
        "D133": "University Ranking CWUR 2019-2020",
        "D134": "University Ranking CWUR 2020-2021",
        "D135": "University Ranking CWUR 2021-2022",
        "D136": "University Ranking CWUR 2022-2023",
        "D137": "University Ranking GM 2016",
        "D138": "University Ranking GM 2017",
        "D139": "University Ranking GM 2018",
        "D140": "University Ranking GM 2019",
        "D141": "University Ranking GM 2020",
        "D142": "University Ranking GM 2021",
        "D143": "University Ranking GM 2022",
        "D144": "University Ranking Webometric 2012",
        "D145": "University Ranking Webometric 2013",
        "D146": "University Ranking Webometric 2014",
        "D147": "University Ranking Webometric 2015",
        "D148": "University Ranking Webometric 2016",
        "D149": "University Ranking Webometric 2017",
        "D150": "University Ranking Webometric 2018",
        "D151": "University Ranking Webometric 2019",
        "D152": "University Ranking Webometric 2020",
        "D153": "University Ranking Webometric 2021",
        "D154": "University Ranking Webometric 2022",
        "D155": "University Ranking Webometric 2023",
        "D156": "University Ranking URAP 2018-2019",
        "D157": "University Ranking URAP 2019-2020",
        "D158": "University Ranking URAP 2020-2021",
        "D159": "University Ranking URAP 2021-2022",
        "D160": "University Ranking URAP 2022-2023",
        "D161": "University Ranking THE 2011",
        "D162": "University Ranking THE 2012",
        "D163": "University Ranking THE 2013",
        "D164": "University Ranking THE 2014",
        "D165": "University Ranking THE 2015",
        "D166": "University Ranking THE 2016",
        "D167": "University Ranking THE 2017",
        "D168": "University Ranking THE 2018",
        "D169": "University Ranking THE 2019",
        "D170": "University Ranking THE 2020",
        "D171": "University Ranking THE 2021",
        "D172": "University Ranking THE 2022",
        "D173": "University Ranking THE 2023",
        "D174": "University Ranking QS 2022",
        "D190": "Student Academics Performance"
    },
    "Banking & Finance": {
        "D6": "Bank Marketing 1",
        "D7": "Bank Marketing 2",
        "D30": "Adult Income",
        "D32": "Telco Customer Churn",
        "D35": "Credit Approval",
        "D50": "Term Deposit Prediction",
        "D96": "Credit Card Fraud",
        "D188": "South German Credit",
        "D193": "Credit Risk Classification",
        "D195": "Credit Score Classification",
        "D196": "Banking Classification"
    },
    "Science & Engineering": {
        "D8": "Mushroom",
        "D14": "Ionosphere",
        "D33": "EEG Eye State",
        "D37": "Steel Plates Faults",
        "D39": "Fertility",
        "D51": "Darwin",
        "D58": "EEG Emotions",
        "D81": "Predictive Maintenance",
        "D84": "Oranges vs Grapefruit",
        "D90": "Crystal System Li-ion",
        "D183": "Drug Consumption",
        "D49": "Air Pressure System Failures",
        "D93": "Air Pressure System Failures",
        "D185": "Toxicity",
        "D186": "Toxicity"
    },
    "Social & Lifestyle": {
        "D38": "Online Shoppers",
        "D59": "Red Wine Quality",
        "D60": "White Wine Quality",
        "D88": "Airline Passenger Satisfaction",
        "D94": "Go Emotions Google",
        "D100": "Spotify East Asian",
        "D125": "Suicide Rates",
        "D182": "Obesity Levels",
        "D187": "Blood Transfusion",
        "D191": "Obesity Classification",
        "D192": "Gender Classification",
        "D194": "Happiness Classification",
        "D42": "Airline customer Holiday Booking dataset"
    },
    "ML Benchmarks & Synthetic": {
        "D34": "Spambase",
        "D85": "Synthetic Binary",
        "D89": "Naive Bayes Data",
        "D175": "Monk's Problems 1",
        "D176": "Monk's Problems 2",
        "D177": "Monk's Problems 3",
        "D178": "Monk's Problems 4",
        "D179": "Monk's Problems 5",
        "D180": "Monk's Problems 6"
    },
    "Other": {
        "D83": "Paris Housing",
        "D91": "Fake Bills",
        "D197": "Star Classification"
    }
}
# Category names in declaration order; the UI indexes into this list.
cats1 = list(DATASET_CATEGORIES.keys())
import pandas as pd
from scipy.stats import ttest_rel
from itertools import combinations

# --- Pairwise paired t-tests across ALL datasets ---------------------------
# For every unordered pair of models and every metric, compare per-dataset
# scores with a paired t-test.  The accumulated table `results_df` is later
# exposed to the UI as sig1["AllDatasets"].
#
# Fixes vs. the original: the dead `if other_models == None: break` guard is
# gone (a list copy can never be None, and identity should use `is` anyway),
# and the shrink-the-list pair enumeration is replaced by
# itertools.combinations, which yields the exact same pairs in the same order.
df_1 = fmodel_results  # one row per (model, dataset) with metric columns

all_results = []
for metric in ["accuracy", "precision", "recall", "f1_score"]:
    comparison_num = 1
    models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
    # Each unordered model pair is compared exactly once per metric.
    for tree_model, non_tree_model in combinations(models, 2):
        # Per-dataset scores for both models, indexed by dataset id.
        tree_data = df_1[df_1['model'] == tree_model].set_index('dataset')[metric]
        non_tree_data = df_1[df_1['model'] == non_tree_model].set_index('dataset')[metric]
        # Inner join: keep only datasets scored by BOTH models.
        combined = pd.DataFrame({
            'tree': tree_data,
            'non_tree': non_tree_data
        }).dropna()
        if len(combined) < 2:
            # ttest_rel needs at least two paired observations.
            print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} Insufficient data")
            comparison_num += 1
            continue
        # Paired t-test over the aligned per-dataset scores.
        t_stat, p_val = ttest_rel(combined['tree'], combined['non_tree'])
        mean1 = combined['tree'].mean()
        mean2 = combined['non_tree'].mean()
        sig = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"
        print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} {mean1:<10.5f} {mean2:<10.5f} {t_stat:<8.2f} {sig:<10} {'True' if p_val < 0.05 else 'False'}")
        all_results.append({
            'metric': metric,
            'tree_model': tree_model,
            'non_tree_model': non_tree_model,
            'tree_mean': mean1,
            'non_tree_mean': mean2,
            'tree_std': combined['tree'].std(),
            'non_tree_std': combined['non_tree'].std(),
            'n_datasets': len(combined),
            't_statistic': t_stat,
            'p_value': p_val
        })
        comparison_num += 1

results_df = pd.DataFrame(all_results)
significant_count = (results_df['p_value'] < 0.05).sum()
total_count = len(results_df)
# Persisting to CSV is intentionally disabled:
# results_df.to_csv('pairwise_comparison_results.csv', index=False)
import pandas as pd
from scipy.stats import ttest_rel
from itertools import combinations

# --- Same pairwise t-tests, restricted to each dataset CATEGORY ------------
# sig1 maps category name -> results table (same schema as `results_df`).
# The UI's group-comparison tab reads from sig1.
#
# Fixes vs. the original: dead `== None` guard removed, pair enumeration via
# itertools.combinations (identical pair order), and the misleading
# "Detailed results saved to ..." print removed — nothing is written to disk
# because the to_csv call is commented out.
sig1 = {}
for key in list(DATASET_CATEGORIES.keys()):
    # Restrict the results frame to this category's dataset ids.
    df_1 = fmodel_results[fmodel_results["dataset"].isin(list(DATASET_CATEGORIES[key].keys()))]
    all_results = []
    for metric in ["accuracy", "precision", "recall", "f1_score"]:
        comparison_num = 1
        models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
        for tree_model, non_tree_model in combinations(models, 2):
            tree_data = df_1[df_1['model'] == tree_model].set_index('dataset')[metric]
            non_tree_data = df_1[df_1['model'] == non_tree_model].set_index('dataset')[metric]
            # Inner join: only datasets present for both models.
            combined = pd.DataFrame({
                'tree': tree_data,
                'non_tree': non_tree_data
            }).dropna()
            if len(combined) < 2:
                print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} Insufficient data")
                comparison_num += 1
                continue
            t_stat, p_val = ttest_rel(combined['tree'], combined['non_tree'])
            mean1 = combined['tree'].mean()
            mean2 = combined['non_tree'].mean()
            all_results.append({
                'metric': metric,
                'tree_model': tree_model,
                'non_tree_model': non_tree_model,
                'tree_mean': mean1,
                'non_tree_mean': mean2,
                'tree_std': combined['tree'].std(),
                'non_tree_std': combined['non_tree'].std(),
                'n_datasets': len(combined),
                't_statistic': t_stat,
                'p_value': p_val
            })
            comparison_num += 1
    # Per-category console summary.
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    results_df4 = pd.DataFrame(all_results)
    sig1[key] = results_df4
    significant_count = (results_df4['p_value'] < 0.05).sum()
    total_count = len(results_df4)
    print(f"\nSignificant comparisons (p < 0.05): {significant_count}/{total_count}")
    print(f"Tree models won in: {(results_df4['tree_mean'] > results_df4['non_tree_mean']).sum()} comparisons")
    # Persisting to CSV is intentionally disabled:
    # results_df4.to_csv('pairwise_comparison_results.csv', index=False)

# The all-datasets table from the previous section joins the per-category map.
sig1["AllDatasets"] = results_df
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
# Dropdown labels: each category name annotated with its dataset count,
# plus one pseudo-group that selects the all-datasets results table.
cats = [f"{name} ({len(members)})" for name, members in DATASET_CATEGORIES.items()]
datasel = cats + ["AllDatasets (150)"]
def compare_ind(med, game, ed, bank, sci, social, ml, other, models_to_compare=None):
    """Build one heatmap per metric for individually selected datasets.

    Each of the eight positional arguments is the (possibly empty) list of
    dataset display names chosen in the corresponding category dropdown,
    in the same order as `cats1`.  `models_to_compare` is the multiselect
    of model names; when empty, all models are shown.

    Returns four matplotlib figures (accuracy, precision, recall, f1_score)
    followed by a status message string.
    """
    metri = ["accuracy", "precision", "recall", "f1_score"]
    figs = []
    messages = []
    dropdowns = [med, game, ed, bank, sci, social, ml, other]
    # Translate selected display names back to dataset ids ("D1", "D2", ...).
    selected_datasets_keys = []
    for cat_name, dropdown_values in zip(cats1, dropdowns):
        if dropdown_values:
            selected_datasets_keys.extend(get_keys(DATASET_CATEGORIES[cat_name], dropdown_values))
    if not models_to_compare:
        messages.append("No models selected. Displaying results for all models.")
        models_to_compare = models
    # id -> display name, flattened over every category (duplicates collapse
    # to whichever category declares the id last; ids are unique in practice).
    dataset_id_to_name = {ds_id: ds_name
                          for category_dict in DATASET_CATEGORIES.values()
                          for ds_id, ds_name in category_dict.items()}
    filtered_df_all_metrics = fmodel_results[
        (fmodel_results["dataset"].isin(selected_datasets_keys)) &
        (fmodel_results["model"].isin(models_to_compare))
    ].copy()
    if filtered_df_all_metrics.empty:
        messages.append("No data for the current dataset/model selection.")
    for metric in metri:
        fig = plt.figure(figsize=(12, 8))
        if filtered_df_all_metrics.empty:
            # Guard: pivot_table/heatmap raise on an empty frame; show an
            # empty placeholder figure instead of crashing the callback.
            plt.title(f"{metric}: no data for the current selection")
        else:
            heatmap_data_metric = filtered_df_all_metrics.pivot_table(
                index='dataset',
                columns='model',
                values=metric
            )
            heatmap_data_metric = heatmap_data_metric.rename(index=dataset_id_to_name)
            sns.heatmap(heatmap_data_metric, annot=True, cmap="crest", fmt=".3f", cbar=True)
            plt.title(f"{metric} per Dataset and Model ({len(selected_datasets_keys)} datasets)")
            plt.ylabel("Dataset")
            plt.xlabel("Model")
        plt.tight_layout()
        plt.close(fig)  # release from pyplot's registry; Gradio still renders it
        figs.append(fig)
    return figs[0], figs[1], figs[2], figs[3], "\n".join(messages) if messages else "Comparison complete."
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
def compare_groups(data_choice, model1, model2):
    """Bar-chart comparison of two models over one precomputed dataset group.

    `data_choice` is a dropdown label like "Banking & Finance (11)"; the text
    before " (" keys into the precomputed `sig1` results tables.  Returns a
    (figure, text) pair where the text lists the per-metric p-values.
    """
    data1 = sig1[data_choice.split(' (')[0]]
    # The t-test table stores each model pair only once, in either
    # orientation, so match both (model1, model2) and (model2, model1).
    comparison_data = data1[
        ((data1['tree_model'] == model1) & (data1['non_tree_model'] == model2)) |
        ((data1['tree_model'] == model2) & (data1['non_tree_model'] == model1))
    ]
    if comparison_data.empty:
        # Same model twice (or the AllModels placeholder) has no pair entry.
        fig = plt.figure(figsize=(10, 6))
        plt.close(fig)
        return fig, "No comparison data found for the selected models. \n Don't pick the same models."
    plot_data = []
    p_values_text = []
    for _, row in comparison_data.iterrows():
        metric = row['metric']
        # Orient the stored means so "Mean Score" matches the model labels.
        if row['tree_model'] == model1:
            mean_for_model1, mean_for_model2 = row['tree_mean'], row['non_tree_mean']
        else:
            mean_for_model1, mean_for_model2 = row['non_tree_mean'], row['tree_mean']
        plot_data.append({'Metric': metric, 'Model': model1, 'Mean Score': mean_for_model1})
        plot_data.append({'Metric': metric, 'Model': model2, 'Mean Score': mean_for_model2})
        p_values_text.append(f"{metric} p-value: {row['p_value']:.5f} (Significant (cutoff = 0.05): {'Yes' if row['p_value'] < 0.05 else 'No'})")
    df_plot = pd.DataFrame(plot_data)
    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x='Metric', y='Mean Score', hue='Model', data=df_plot)
    plt.title(f'Comparison of {model1} vs {model2} Across Metrics')
    plt.ylabel('Mean Score')
    plt.xlabel('Metric')
    plt.ylim(0, 1)
    plt.legend(title='Model')
    plt.tight_layout()
    plt.close(fig)  # avoid leaking figures across Gradio callbacks
    return fig, "\n".join(p_values_text)
import gradio as gr

# Canonical model list used by both tabs.
models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]

with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        # --- Tab 1: compare two models over a whole dataset group ----------
        # (tab label typo fixed: "Compaes" -> "Compares")
        with gr.TabItem("Compares groups of datasets"):
            with gr.Column():
                with gr.Row():
                    gr.Markdown("# Comparing models")
                    # "AllModels" is offered but has no entry in the pairwise
                    # tables, so compare_groups reports "no data" for it.
                    model1 = gr.Dropdown(
                        models + ["AllModels(not for AllData)"],
                        label="Model 1",
                    )
                    model2 = gr.Dropdown(models, value=models[1], label="Model 2")
                    dataset = gr.Dropdown(datasel, label="Datasets")
                    compare_button = gr.Button("Compare")
                with gr.Row():
                    gr.Markdown("## Results")
                    p_value = gr.Textbox(label="Model Output", lines=4)
                with gr.Row():
                    output_plot = gr.Plot(label="Results")
                compare_button.click(
                    fn=compare_groups,
                    inputs=[dataset, model1, model2],
                    outputs=[output_plot, p_value],
                )
        # --- Tab 2: per-dataset heatmaps for hand-picked datasets ----------
        # (tab label corrected: it compares datasets, not tabs)
        with gr.TabItem("Compare Individual Datasets"):
            with gr.Column():
                with gr.Row():
                    gr.Markdown("# Comparing models")
                    model_choice = gr.Dropdown(models, label="Model 1", multiselect=True)
                    med = gr.Dropdown(DATASET_CATEGORIES[cats1[0]].values(), label=cats1[0], multiselect=True)
                    game = gr.Dropdown(DATASET_CATEGORIES[cats1[1]].values(), label=cats1[1], multiselect=True)
                    ed = gr.Dropdown(DATASET_CATEGORIES[cats1[2]].values(), label=cats1[2], multiselect=True)
                with gr.Row():
                    bank = gr.Dropdown(DATASET_CATEGORIES[cats1[3]].values(), label=cats1[3], multiselect=True)
                    sci = gr.Dropdown(DATASET_CATEGORIES[cats1[4]].values(), label=cats1[4], multiselect=True)
                    social = gr.Dropdown(DATASET_CATEGORIES[cats1[5]].values(), label=cats1[5], multiselect=True)
                    ml = gr.Dropdown(DATASET_CATEGORIES[cats1[6]].values(), label=cats1[6], multiselect=True)
                    other = gr.Dropdown(DATASET_CATEGORIES[cats1[7]].values(), label=cats1[7], multiselect=True)
                    # Distinct names: the originals shadowed tab 1's widgets,
                    # which only worked because wiring happened first.
                    compare_button_ind = gr.Button("Compare")
                with gr.Row():
                    p1 = gr.Plot(label="Results")
                with gr.Row():
                    p2 = gr.Plot(label="Results")
                with gr.Row():
                    p3 = gr.Plot(label="Results")
                with gr.Row():
                    p4 = gr.Plot(label="Results")
                with gr.Row():
                    ind_status = gr.Textbox(label="Model Output", lines=4)
                compare_button_ind.click(
                    fn=compare_ind,
                    inputs=[med, game, ed, bank, sci, social, ml, other, model_choice],
                    outputs=[p1, p2, p3, p4, ind_status],
                )

demo.launch(share=True, show_error=True)