"""Gradio dashboard comparing classifiers across many tabular datasets.

Loads per-(dataset, model) metric results from a pickle and a CSV dump,
merges them, and runs paired t-tests for every unordered pair of models —
once over all datasets and once per dataset category.  The resulting
frames feed the two comparison tabs defined further down the file.
"""

import io
import warnings
from itertools import combinations  # replaces the hand-rolled pair generation

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_rel


def get_keys(d, values):
    """Return the keys of mapping *d* whose value appears in *values*."""
    return [k for k, v in d.items() if v in values]


# ---------------------------------------------------------------------------
# Load the benchmark results and merge the two dumps into one frame.
# Expected columns: dataset, model, accuracy, precision, recall, f1_score.
# ---------------------------------------------------------------------------
pickle_file_path = 'model_results1.pkl'
model_results = pd.read_pickle(pickle_file_path)

csv_file_path = 'the_model_results.csv'
model_results_csv = pd.read_csv(csv_file_path)

# The CSV names the dataset column differently; harmonise before concat.
fmodel_results = pd.concat(
    [model_results, model_results_csv.rename(columns={"dataset_name": "dataset"})],
    ignore_index=True,
)

# Dataset id -> human-readable name, grouped by topical category.
DATASET_CATEGORIES = {
    "Medical & Healthcare": {
        "D1": "Heart Disease (Comprehensive)", "D2": "Heart attack possibility",
        "D3": "Heart Disease Dataset", "D4": "Liver Disorders",
        "D5": "Diabetes Prediction", "D9": "Chronic Kidney Disease",
        "D10": "Breast Cancer Prediction", "D11": "Stroke Prediction",
        "D12": "Lung Cancer Prediction", "D13": "Hepatitis",
        "D15": "Thyroid Disease", "D16": "Heart Failure Prediction",
        "D17": "Parkinson's", "D18": "Indian Liver Patient",
        "D19": "COVID-19 Effect on Liver Cancer", "D20": "Liver Dataset",
        "D21": "Specht Heart", "D22": "Early-stage Diabetes",
        "D23": "Diabetic Retinopathy", "D24": "Breast Cancer Coimbra",
        "D25": "Chronic Kidney Disease", "D26": "Kidney Stone",
        "D28": "Echocardiogram", "D29": "Bladder Cancer Recurrence",
        "D31": "Prostate Cancer", "D46": "Real Breast Cancer Data",
        "D47": "Breast Cancer (Royston)", "D48": "Lung Cancer Dataset",
        "D52": "Cervical Cancer Risk", "D53": "Breast Cancer Wisconsin",
        "D61": "Breast Cancer Prediction", "D62": "Thyroid Disease",
        "D68": "Lung Cancer", "D69": "Cancer Patients Data",
        "D70": "Labor Relations", "D71": "Glioma Grading",
        "D74": "Post-Operative Patient", "D80": "Heart Rate Stress Monitoring",
        "D82": "Diabetes 2019", "D87": "Personal Heart Disease Indicators",
        "D92": "Heart Disease (Logistic)", "D95": "Diabetes Prediction",
        "D97": "Cardiovascular Disease", "D98": "Diabetes 130 US Hospitals",
        "D99": "Heart Disease Dataset", "D181": "HCV Data",
        "D184": "Cardiotocography", "D189": "Mammographic Mass",
        "D199": "Easiest Diabetes", "D200": "Monkey-Pox Patients",
        "D54": "Breast Cancer Wisconsin", "D63": "Sick-euthyroid",
        "D64": "Ann-test", "D65": "Ann-train", "D66": "Hypothyroid",
        "D67": "New-thyroid", "D72": "Glioma Grading",
    },
    "Gaming & Sports": {
        "D27": "Chess King-Rook", "D36": "Tic-Tac-Toe",
        "D40": "IPL 2022 Matches", "D41": "League of Legends",
        "D55": "League of Legends Diamond", "D56": "Chess Game Dataset",
        "D57": "Game of Thrones", "D73": "Connect-4", "D75": "FIFA 2018",
        "D76": "Dota 2 Matches", "D77": "IPL Match Analysis",
        "D78": "CS:GO Professional", "D79": "IPL 2008-2022",
        "D114": "Video Games", "D115": "Video Games Sales",
        "D117": "Sacred Games", "D118": "PC Games Sales",
        "D119": "Popular Video Games", "D120": "Olympic Games 2021",
        "D121": "Video Games ESRB", "D122": "Top Play Store Games",
        "D123": "Steam Games", "D124": "PS4 Games",
        "D116": "Video Games Sales",
    },
    "Education & Students": {
        "D43": "Student Marks", "D44": "Student 2nd Year Result",
        "D45": "Student Mat Pass/Fail", "D103": "Academic Performance",
        "D104": "Student Academic Analysis", "D105": "Student Dropout Prediction",
        "D106": "Electronic Gadgets Impact", "D107": "Campus Recruitment",
        "D108": "End-Semester Performance", "D109": "Fitbits and Grades",
        "D110": "Student Time Management", "D111": "Student Feedback",
        "D112": "Depression & Performance", "D113": "University Rankings",
        "D126": "University Ranking CWUR",
        "D127": "University Ranking CWUR 2013-2014",
        "D128": "University Ranking CWUR 2014-2015",
        "D129": "University Ranking CWUR 2015-2016",
        "D130": "University Ranking CWUR 2016-2017",
        "D131": "University Ranking CWUR 2017-2018",
        "D132": "University Ranking CWUR 2018-2019",
        "D133": "University Ranking CWUR 2019-2020",
        "D134": "University Ranking CWUR 2020-2021",
        "D135": "University Ranking CWUR 2021-2022",
        "D136": "University Ranking CWUR 2022-2023",
        "D137": "University Ranking GM 2016",
        "D138": "University Ranking GM 2017",
        "D139": "University Ranking GM 2018",
        "D140": "University Ranking GM 2019",
        "D141": "University Ranking GM 2020",
        "D142": "University Ranking GM 2021",
        "D143": "University Ranking GM 2022",
        "D144": "University Ranking Webometric 2012",
        "D145": "University Ranking Webometric 2013",
        "D146": "University Ranking Webometric 2014",
        "D147": "University Ranking Webometric 2015",
        "D148": "University Ranking Webometric 2016",
        "D149": "University Ranking Webometric 2017",
        "D150": "University Ranking Webometric 2018",
        "D151": "University Ranking Webometric 2019",
        "D152": "University Ranking Webometric 2020",
        "D153": "University Ranking Webometric 2021",
        "D154": "University Ranking Webometric 2022",
        "D155": "University Ranking Webometric 2023",
        "D156": "University Ranking URAP 2018-2019",
        "D157": "University Ranking URAP 2019-2020",
        "D158": "University Ranking URAP 2020-2021",
        "D159": "University Ranking URAP 2021-2022",
        "D160": "University Ranking URAP 2022-2023",
        "D161": "University Ranking THE 2011",
        "D162": "University Ranking THE 2012",
        "D163": "University Ranking THE 2013",
        "D164": "University Ranking THE 2014",
        "D165": "University Ranking THE 2015",
        "D166": "University Ranking THE 2016",
        "D167": "University Ranking THE 2017",
        "D168": "University Ranking THE 2018",
        "D169": "University Ranking THE 2019",
        "D170": "University Ranking THE 2020",
        "D171": "University Ranking THE 2021",
        "D172": "University Ranking THE 2022",
        "D173": "University Ranking THE 2023",
        "D174": "University Ranking QS 2022",
        "D190": "Student Academics Performance",
    },
    "Banking & Finance": {
        "D6": "Bank Marketing 1", "D7": "Bank Marketing 2",
        "D30": "Adult Income", "D32": "Telco Customer Churn",
        "D35": "Credit Approval", "D50": "Term Deposit Prediction",
        "D96": "Credit Card Fraud", "D188": "South German Credit",
        "D193": "Credit Risk Classification",
        "D195": "Credit Score Classification",
        "D196": "Banking Classification",
    },
    "Science & Engineering": {
        "D8": "Mushroom", "D14": "Ionosphere", "D33": "EEG Eye State",
        "D37": "Steel Plates Faults", "D39": "Fertility", "D51": "Darwin",
        "D58": "EEG Emotions", "D81": "Predictive Maintenance",
        "D84": "Oranges vs Grapefruit", "D90": "Crystal System Li-ion",
        "D183": "Drug Consumption", "D49": "Air Pressure System Failures",
        "D93": "Air Pressure System Failures", "D185": "Toxicity",
        "D186": "Toxicity",
    },
    "Social & Lifestyle": {
        "D38": "Online Shoppers", "D59": "Red Wine Quality",
        "D60": "White Wine Quality", "D88": "Airline Passenger Satisfaction",
        "D94": "Go Emotions Google", "D100": "Spotify East Asian",
        "D125": "Suicide Rates", "D182": "Obesity Levels",
        "D187": "Blood Transfusion", "D191": "Obesity Classification",
        "D192": "Gender Classification", "D194": "Happiness Classification",
        "D42": "Airline customer Holiday Booking dataset",
    },
    "ML Benchmarks & Synthetic": {
        "D34": "Spambase", "D85": "Synthetic Binary", "D89": "Naive Bayes Data",
        "D175": "Monk's Problems 1", "D176": "Monk's Problems 2",
        "D177": "Monk's Problems 3", "D178": "Monk's Problems 4",
        "D179": "Monk's Problems 5", "D180": "Monk's Problems 6",
    },
    "Other": {
        "D83": "Paris Housing", "D91": "Fake Bills",
        "D197": "Star Classification",
    },
}

cats1 = list(DATASET_CATEGORIES.keys())

# The five classifiers present in the results frame.
MODEL_NAMES = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]


def _pairwise_model_ttests(df, verbose=False):
    """Run a paired t-test for every unordered pair of models on each metric.

    For each metric the two models' scores are aligned on the 'dataset'
    column (inner join via dropna); pairs sharing fewer than two datasets
    are reported as "Insufficient data" and skipped.  The original script
    generated the pairs with an error-prone shrinking-list mutation (and a
    dead ``if other_models == None`` check — a list is never ``== None``);
    ``itertools.combinations`` yields the same pairs in the same order.

    Args:
        df: results frame with 'model', 'dataset' and per-metric columns.
        verbose: when True, print one formatted result line per comparison
            (matching the original all-datasets run; the per-category runs
            printed only the insufficient-data lines).

    Returns:
        pd.DataFrame with one row per completed comparison.  Column names
        ('tree_model', 'non_tree_model', ...) are kept from the original
        script because compare_groups() indexes them by name.
    """
    rows = []
    for metric in ["accuracy", "precision", "recall", "f1_score"]:
        comparison_num = 1
        for model_a, model_b in combinations(MODEL_NAMES, 2):
            scores_a = df[df['model'] == model_a].set_index('dataset')[metric]
            scores_b = df[df['model'] == model_b].set_index('dataset')[metric]
            # Keep only datasets scored by both models.
            combined = pd.DataFrame({'tree': scores_a, 'non_tree': scores_b}).dropna()
            if len(combined) < 2:
                print(f"{comparison_num:<3} {model_a:<20} {model_b:<20} Insufficient data")
                comparison_num += 1
                continue
            t_stat, p_val = ttest_rel(combined['tree'], combined['non_tree'])
            mean_a = combined['tree'].mean()
            mean_b = combined['non_tree'].mean()
            if verbose:
                sig = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"
                print(f"{comparison_num:<3} {model_a:<20} {model_b:<20} "
                      f"{mean_a:<10.5f} {mean_b:<10.5f} {t_stat:<8.2f} {sig:<10} "
                      f"{'True' if p_val < 0.05 else 'False'}")
            rows.append({
                'metric': metric,
                'tree_model': model_a,
                'non_tree_model': model_b,
                'tree_mean': mean_a,
                'non_tree_mean': mean_b,
                'tree_std': combined['tree'].std(),
                'non_tree_std': combined['non_tree'].std(),
                'n_datasets': len(combined),
                't_statistic': t_stat,
                'p_value': p_val,
            })
            comparison_num += 1
    return pd.DataFrame(rows)


# --- All datasets: verbose per-comparison output, as in the original run. ---
results_df = _pairwise_model_ttests(fmodel_results, verbose=True)
significant_count = (results_df['p_value'] < 0.05).sum()
total_count = len(results_df)
# Save detailed results
# results_df.to_csv('pairwise_comparison_results.csv', index=False)

# --- Per category: store each result frame for the Gradio comparison tab. ---
sig1 = {}
for key in cats1:
    category_df = fmodel_results[
        fmodel_results["dataset"].isin(list(DATASET_CATEGORIES[key].keys()))
    ]
    results_df4 = _pairwise_model_ttests(category_df)
    # Summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    sig1[key] = results_df4
    significant_count = (results_df4['p_value'] < 0.05).sum()
    total_count = len(results_df4)
    print(f"\nSignificant comparisons (p < 0.05): {significant_count}/{total_count}")
    print(f"Tree models won in: {(results_df4['tree_mean'] > results_df4['non_tree_mean']).sum()} comparisons")
    # Save detailed results
    # results_df.to_csv('pairwise_comparison_results.csv', index=False)
    print("\nDetailed results saved to: pairwise_comparison_results.csv")
# The all-datasets frame joins the per-category frames for the group tab.
sig1["AllDatasets"] = results_df

import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io

# Dropdown labels of the form "Category (count)".  compare_groups() splits
# these on ' (' to recover the sig1 key, so only the prefix matters.  The
# all-datasets count is computed instead of the original hard-coded "(150)",
# which could silently drift from the real data.
cats = [f"{key} ({len(values)})" for key, values in DATASET_CATEGORIES.items()]
datasel = cats.copy()
datasel.append(f"AllDatasets ({fmodel_results['dataset'].nunique()})")


def compare_ind(med, game, ed, bank, sci, social, ml, other, models_to_compare=None):
    """Build one heatmap per metric for the selected datasets and models.

    Args:
        med, game, ed, bank, sci, social, ml, other: lists of dataset
            display names chosen in each category dropdown (one argument
            per category, in ``cats1`` order).
        models_to_compare: model names to include; falls back to all five
            models when empty/None.

    Returns:
        Four matplotlib figures (accuracy, precision, recall, f1_score)
        followed by a status message.  On an empty dataset selection the
        figures are None — the original crashed on an empty pivot here.
    """
    metrics = ["accuracy", "precision", "recall", "f1_score"]
    messages = []
    dropdowns = [med, game, ed, bank, sci, social, ml, other]

    # Map the selected display names back to dataset ids, per category.
    selected_keys = []
    for cat_name, chosen in zip(cats1, dropdowns):
        if chosen:
            selected_keys.extend(get_keys(DATASET_CATEGORIES[cat_name], chosen))

    if not models_to_compare:
        messages.append("No models selected. Displaying results for all models.")
        # Use an explicit list rather than the mutable module-level `models`.
        models_to_compare = ["RandomForest", "DecisionTree", "KNN", "SVM",
                             "LogisticRegression"]

    if not selected_keys:
        # Nothing selected: an empty pivot would crash seaborn's heatmap.
        return None, None, None, None, "No datasets selected."

    # Flatten id -> name for relabelling the heatmap rows.
    id_to_name = {
        ds_id: name
        for category in DATASET_CATEGORIES.values()
        for ds_id, name in category.items()
    }

    subset = fmodel_results[
        (fmodel_results["dataset"].isin(selected_keys))
        & (fmodel_results["model"].isin(models_to_compare))
    ].copy()

    figs = []
    for metric in metrics:
        grid = subset.pivot_table(index='dataset', columns='model', values=metric)
        grid = grid.rename(index=id_to_name)
        # (debug print of the pivot removed from the original)
        fig = plt.figure(figsize=(12, 8))
        sns.heatmap(grid, annot=True, cmap="crest", fmt=".3f", cbar=True)
        plt.title(f"{metric} per Dataset and Model ({len(selected_keys)} datasets)")
        plt.ylabel("Dataset")
        plt.xlabel("Model")
        plt.tight_layout()
        figs.append(fig)

    return figs[0], figs[1], figs[2], figs[3], \
        "\n".join(messages) if messages else "Comparison complete."
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io


def compare_groups(data_choice, model1, model2):
    """Bar-chart comparison of two models over one precomputed dataset group.

    Looks up the paired t-test frame stored in ``sig1`` for the chosen
    group and plots the two models' mean score per metric, reporting each
    test's p-value.

    Args:
        data_choice: dropdown label of the form "Group name (count)";
            the key before ' (' selects the sig1 entry.
        model1, model2: model names to compare.

    Returns:
        (matplotlib figure, p-value summary text).
    """
    # Guard: clicking Compare with no group selected used to raise
    # AttributeError on None.split.
    if not data_choice:
        fig = plt.figure(figsize=(10, 6))
        plt.close(fig)
        return fig, "Please select a dataset group."

    data1 = sig1[data_choice.split(' (')[0]]

    # Rows store each pair once in arbitrary orientation; match both ways.
    pair_rows = data1[
        ((data1['tree_model'] == model1) & (data1['non_tree_model'] == model2))
        | ((data1['tree_model'] == model2) & (data1['non_tree_model'] == model1))
    ]
    if pair_rows.empty:
        fig = plt.figure(figsize=(10, 6))
        plt.close(fig)
        # (typo "Dont" fixed)
        return fig, "No comparison data found for the selected models.\nDon't pick the same models."

    plot_data = []
    p_values_text = []
    for _, row in pair_rows.iterrows():
        metric = row['metric']
        # Map the stored (tree_model, non_tree_model) means back onto the
        # user's (model1, model2) ordering.
        if row['tree_model'] == model1:
            mean1, mean2 = row['tree_mean'], row['non_tree_mean']
        else:
            mean1, mean2 = row['non_tree_mean'], row['tree_mean']
        plot_data.append({'Metric': metric, 'Model': model1, 'Mean Score': mean1})
        plot_data.append({'Metric': metric, 'Model': model2, 'Mean Score': mean2})
        p_values_text.append(
            f"{metric} p-value: {row['p_value']:.5f} "
            f"(Significant (cutoff = 0.05): {'Yes' if row['p_value'] < 0.05 else 'No'})"
        )

    df_plot = pd.DataFrame(plot_data)
    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x='Metric', y='Mean Score', hue='Model', data=df_plot)
    plt.title(f'Comparison of {model1} vs {model2} Across Metrics')
    plt.ylabel('Mean Score')
    plt.xlabel('Metric')
    plt.ylim(0, 1)
    plt.legend(title='Model')
    plt.tight_layout()
    return fig, "\n".join(p_values_text)


import gradio as gr

models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]

with gr.Blocks() as demo:
    with gr.Tabs() as tabs:
        # Tab 1: compare two models over a whole dataset group.
        # (tab-label typo "Compaes" fixed)
        with gr.TabItem("Compares groups of datasets"):
            with gr.Column():
                with gr.Row():
                    gr.Markdown("# Comparing models")
                    # NOTE(review): the "AllModels(not for AllData)" option is
                    # not handled by compare_groups and simply yields the
                    # "No comparison data" message — confirm it is intended.
                    model1 = gr.Dropdown(
                        ["RandomForest", "DecisionTree", "KNN", "SVM",
                         "LogisticRegression", "AllModels(not for AllData)"],
                        label="Model 1",
                    )
                    model2 = gr.Dropdown(models, value=models[1], label="Model 2")
                    dataset = gr.Dropdown(datasel, label="Datasets")
                    compare_button = gr.Button("Compare")
                with gr.Row():
                    gr.Markdown("## Results")
                    p_value = gr.Textbox(label="Model Output", lines=4)
                with gr.Row():
                    output_plot = gr.Plot(label="Results")
                compare_button.click(
                    fn=compare_groups,
                    inputs=[dataset, model1, model2],
                    outputs=[output_plot, p_value],
                )
        # Tab 2: per-metric heatmaps for hand-picked datasets.
        with gr.TabItem("Compare Individual Tabs"):
            with gr.Column():
                with gr.Row():
                    gr.Markdown("# Comparing models")
                    model_choice = gr.Dropdown(models, label="Model 1", multiselect=True)
                    med = gr.Dropdown(DATASET_CATEGORIES[cats1[0]].values(),
                                      label=cats1[0], multiselect=True)
                    game = gr.Dropdown(DATASET_CATEGORIES[cats1[1]].values(),
                                       label=cats1[1], multiselect=True)
                    ed = gr.Dropdown(DATASET_CATEGORIES[cats1[2]].values(),
                                     label=cats1[2], multiselect=True)
                with gr.Row():
                    bank = gr.Dropdown(DATASET_CATEGORIES[cats1[3]].values(),
                                       label=cats1[3], multiselect=True)
                    sci = gr.Dropdown(DATASET_CATEGORIES[cats1[4]].values(),
                                      label=cats1[4], multiselect=True)
                    social = gr.Dropdown(DATASET_CATEGORIES[cats1[5]].values(),
                                         label=cats1[5], multiselect=True)
                    ml = gr.Dropdown(DATASET_CATEGORIES[cats1[6]].values(),
                                     label=cats1[6], multiselect=True)
                    other = gr.Dropdown(DATASET_CATEGORIES[cats1[7]].values(),
                                        label=cats1[7], multiselect=True)
                compare_button = gr.Button("Compare")
                with gr.Row():
                    p1 = gr.Plot(label="Results")
                with gr.Row():
                    p2 = gr.Plot(label="Results")
                with gr.Row():
                    p3 = gr.Plot(label="Results")
                with gr.Row():
                    p4 = gr.Plot(label="Results")
                with gr.Row():
                    p_value = gr.Textbox(label="Model Output", lines=4)
                compare_button.click(
                    fn=compare_ind,
                    inputs=[med, game, ed, bank, sci, social, ml, other, model_choice],
                    outputs=[p1, p2, p3, p4, p_value],
                )

demo.launch(share=True, show_error=True)