Spaces:

Arun-AK
/

Model-Comparison

Running

File size: 21,180 Bytes

import pandas as pd
import numpy as np
import warnings
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import seaborn as sns
import io
import gradio as gr
def get_keys(d, values):
  return [k for k, v in d.items() if v in values]
pickle_file_path = 'model_results1.pkl'

model_results = pd.read_pickle(pickle_file_path)

csv_file_path = 'the_model_results.csv'

model_results_csv = pd.read_csv(csv_file_path)

fmodel_results = pd.concat([model_results, model_results_csv.rename(columns = {"dataset_name" : "dataset"})], ignore_index=True)

DATASET_CATEGORIES = {
    "Medical & Healthcare": {
        "D1": "Heart Disease (Comprehensive)",
        "D2": "Heart attack possibility",
        "D3": "Heart Disease Dataset",
        "D4": "Liver Disorders",
        "D5": "Diabetes Prediction",
        "D9": "Chronic Kidney Disease",
        "D10": "Breast Cancer Prediction",
        "D11": "Stroke Prediction",
        "D12": "Lung Cancer Prediction",
        "D13": "Hepatitis",
        "D15": "Thyroid Disease",
        "D16": "Heart Failure Prediction",
        "D17": "Parkinson's",
        "D18": "Indian Liver Patient",
        "D19": "COVID-19 Effect on Liver Cancer",
        "D20": "Liver Dataset",
        "D21": "Specht Heart",
        "D22": "Early-stage Diabetes",
        "D23": "Diabetic Retinopathy",
        "D24": "Breast Cancer Coimbra",
        "D25": "Chronic Kidney Disease",
        "D26": "Kidney Stone",
        "D28": "Echocardiogram",
        "D29": "Bladder Cancer Recurrence",
        "D31": "Prostate Cancer",
        "D46": "Real Breast Cancer Data",
        "D47": "Breast Cancer (Royston)",
        "D48": "Lung Cancer Dataset",
        "D52": "Cervical Cancer Risk",
        "D53": "Breast Cancer Wisconsin",
        "D61": "Breast Cancer Prediction",
        "D62": "Thyroid Disease",
        "D68": "Lung Cancer",
        "D69": "Cancer Patients Data",
        "D70": "Labor Relations",
        "D71": "Glioma Grading",
        "D74": "Post-Operative Patient",
        "D80": "Heart Rate Stress Monitoring",
        "D82": "Diabetes 2019",
        "D87": "Personal Heart Disease Indicators",
        "D92": "Heart Disease (Logistic)",
        "D95": "Diabetes Prediction",
        "D97": "Cardiovascular Disease",
        "D98": "Diabetes 130 US Hospitals",
        "D99": "Heart Disease Dataset",
        "D181": "HCV Data",
        "D184": "Cardiotocography",
        "D189": "Mammographic Mass",
        "D199": "Easiest Diabetes",
        "D200": "Monkey-Pox Patients",
        "D54": "Breast Cancer Wisconsin",
        "D63": "Sick-euthyroid",
        "D64": "Ann-test",
        "D65": "Ann-train",
        "D66": "Hypothyroid",
        "D67": "New-thyroid",
        "D72": "Glioma Grading"
    },

    "Gaming & Sports": {
        "D27": "Chess King-Rook",
        "D36": "Tic-Tac-Toe",
        "D40": "IPL 2022 Matches",
        "D41": "League of Legends",
        "D55": "League of Legends Diamond",
        "D56": "Chess Game Dataset",
        "D57": "Game of Thrones",
        "D73": "Connect-4",
        "D75": "FIFA 2018",
        "D76": "Dota 2 Matches",
        "D77": "IPL Match Analysis",
        "D78": "CS:GO Professional",
        "D79": "IPL 2008-2022",
        "D114": "Video Games",
        "D115": "Video Games Sales",
        "D117": "Sacred Games",
        "D118": "PC Games Sales",
        "D119": "Popular Video Games",
        "D120": "Olympic Games 2021",
        "D121": "Video Games ESRB",
        "D122": "Top Play Store Games",
        "D123": "Steam Games",
        "D124": "PS4 Games",
        "D116": "Video Games Sales"
    },

    "Education & Students": {
        "D43": "Student Marks",
        "D44": "Student 2nd Year Result",
        "D45": "Student Mat Pass/Fail",
        "D103": "Academic Performance",
        "D104": "Student Academic Analysis",
        "D105": "Student Dropout Prediction",
        "D106": "Electronic Gadgets Impact",
        "D107": "Campus Recruitment",
        "D108": "End-Semester Performance",
        "D109": "Fitbits and Grades",
        "D110": "Student Time Management",
        "D111": "Student Feedback",
        "D112": "Depression & Performance",
        "D113": "University Rankings",
        "D126": "University Ranking CWUR",
        "D127": "University Ranking CWUR 2013-2014",
        "D128": "University Ranking CWUR 2014-2015",
        "D129": "University Ranking CWUR 2015-2016",
        "D130": "University Ranking CWUR 2016-2017",
        "D131": "University Ranking CWUR 2017-2018",
        "D132": "University Ranking CWUR 2018-2019",
        "D133": "University Ranking CWUR 2019-2020",
        "D134": "University Ranking CWUR 2020-2021",
        "D135": "University Ranking CWUR 2021-2022",
        "D136": "University Ranking CWUR 2022-2023",
        "D137": "University Ranking GM 2016",
        "D138": "University Ranking GM 2017",
        "D139": "University Ranking GM 2018",
        "D140": "University Ranking GM 2019",
        "D141": "University Ranking GM 2020",
        "D142": "University Ranking GM 2021",
        "D143": "University Ranking GM 2022",
        "D144": "University Ranking Webometric 2012",
        "D145": "University Ranking Webometric 2013",
        "D146": "University Ranking Webometric 2014",
        "D147": "University Ranking Webometric 2015",
        "D148": "University Ranking Webometric 2016",
        "D149": "University Ranking Webometric 2017",
        "D150": "University Ranking Webometric 2018",
        "D151": "University Ranking Webometric 2019",
        "D152": "University Ranking Webometric 2020",
        "D153": "University Ranking Webometric 2021",
        "D154": "University Ranking Webometric 2022",
        "D155": "University Ranking Webometric 2023",
        "D156": "University Ranking URAP 2018-2019",
        "D157": "University Ranking URAP 2019-2020",
        "D158": "University Ranking URAP 2020-2021",
        "D159": "University Ranking URAP 2021-2022",
        "D160": "University Ranking URAP 2022-2023",
        "D161": "University Ranking THE 2011",
        "D162": "University Ranking THE 2012",
        "D163": "University Ranking THE 2013",
        "D164": "University Ranking THE 2014",
        "D165": "University Ranking THE 2015",
        "D166": "University Ranking THE 2016",
        "D167": "University Ranking THE 2017",
        "D168": "University Ranking THE 2018",
        "D169": "University Ranking THE 2019",
        "D170": "University Ranking THE 2020",
        "D171": "University Ranking THE 2021",
        "D172": "University Ranking THE 2022",
        "D173": "University Ranking THE 2023",
        "D174": "University Ranking QS 2022",
        "D190": "Student Academics Performance"
    },

    "Banking & Finance": {
        "D6": "Bank Marketing 1",
        "D7": "Bank Marketing 2",
        "D30": "Adult Income",
        "D32": "Telco Customer Churn",
        "D35": "Credit Approval",
        "D50": "Term Deposit Prediction",
        "D96": "Credit Card Fraud",
        "D188": "South German Credit",
        "D193": "Credit Risk Classification",
        "D195": "Credit Score Classification",
        "D196": "Banking Classification"
    },

    "Science & Engineering": {
        "D8": "Mushroom",
        "D14": "Ionosphere",
        "D33": "EEG Eye State",
        "D37": "Steel Plates Faults",
        "D39": "Fertility",
        "D51": "Darwin",
        "D58": "EEG Emotions",
        "D81": "Predictive Maintenance",
        "D84": "Oranges vs Grapefruit",
        "D90": "Crystal System Li-ion",
        "D183": "Drug Consumption",
        "D49": "Air Pressure System Failures",
        "D93": "Air Pressure System Failures",
        "D185": "Toxicity",
        "D186": "Toxicity"
    },

    "Social & Lifestyle": {
        "D38": "Online Shoppers",
        "D59": "Red Wine Quality",
        "D60": "White Wine Quality",
        "D88": "Airline Passenger Satisfaction",
        "D94": "Go Emotions Google",
        "D100": "Spotify East Asian",
        "D125": "Suicide Rates",
        "D182": "Obesity Levels",
        "D187": "Blood Transfusion",
        "D191": "Obesity Classification",
        "D192": "Gender Classification",
        "D194": "Happiness Classification",
        "D42": "Airline customer Holiday Booking dataset"
    },

    "ML Benchmarks & Synthetic": {
        "D34": "Spambase",
        "D85": "Synthetic Binary",
        "D89": "Naive Bayes Data",
        "D175": "Monk's Problems 1",
        "D176": "Monk's Problems 2",
        "D177": "Monk's Problems 3",
        "D178": "Monk's Problems 4",
        "D179": "Monk's Problems 5",
        "D180": "Monk's Problems 6"
    },

    "Other": {
        "D83": "Paris Housing",
        "D91": "Fake Bills",
        "D197": "Star Classification"
    }
}
cats1 = list(DATASET_CATEGORIES.keys())
import pandas as pd
from scipy.stats import ttest_rel

# Load results
df_1 = fmodel_results

# Define individual models
tree_models = ["RandomForest", "DecisionTree"]
non_tree_models = ["KNN", "SVM", "LogisticRegression"]
#models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]



# Store all results
all_results = []

# For each metric
for metric in ["accuracy", "precision", "recall", "f1_score"]:
    

    comparison_num = 1

    models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
    # Compare each tree model with each non-tree model
    for tree_model in models:
        other_models = models.copy()
        other_models.remove(tree_model)
        if other_models == None:
          break
        for non_tree_model in other_models:

            # Get data for both models across all datasets
            tree_data = df_1[df_1['model'] == tree_model].set_index('dataset')[metric]
            non_tree_data = df_1[df_1['model'] == non_tree_model].set_index('dataset')[metric]

            # Align datasets (inner join - only datasets present for both models)
            combined = pd.DataFrame({
                'tree': tree_data,
                'non_tree': non_tree_data
            }).dropna()

            if len(combined) < 2:
                print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} Insufficient data")
                comparison_num += 1
                continue

            # Paired t-test
            t_stat, p_val = ttest_rel(combined['tree'], combined['non_tree'])

            # Calculate means and stds
            mean1 = combined['tree'].mean()
            mean2 = combined['non_tree'].mean()
            std1 = combined['tree'].std()
            std2 = combined['non_tree'].std()
            n = len(combined)

            sig = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"

            print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} {mean1:<10.5f} {mean2:<10.5f} {t_stat:<8.2f} {sig:<10} {'True' if p_val < 0.05 else 'False'}")

            all_results.append({
                'metric': metric,
                'tree_model': tree_model,
                'non_tree_model': non_tree_model,
                'tree_mean': mean1,
                'non_tree_mean': mean2,
                'tree_std': std1,
                'non_tree_std': std2,
                'n_datasets': n,
                't_statistic': t_stat,
                'p_value': p_val
            })

            comparison_num += 1
        models = other_models.copy()

results_df = pd.DataFrame(all_results)
significant_count = (results_df['p_value'] < 0.05).sum()
total_count = len(results_df)

# Save detailed results
#results_df.to_csv('pairwise_comparison_results.csv', index=False)
import pandas as pd
from scipy.stats import ttest_rel
sig1 = {}
for key in list(DATASET_CATEGORIES.keys()):
  # Load results
  df_1 = fmodel_results[fmodel_results["dataset"].isin(list(DATASET_CATEGORIES[key].keys()))]

  # Define individual models
  tree_models = ["RandomForest", "DecisionTree"]
  non_tree_models = ["KNN", "SVM", "LogisticRegression"]
  #models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
  models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]

  # Store all results
  all_results = []

  # For each metric
  for metric in ["accuracy", "precision", "recall", "f1_score"]:
      comparison_num = 1

      models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
      # Compare each tree model with each non-tree model
      for tree_model in models:
          other_models = models.copy()
          other_models.remove(tree_model)
          if other_models == None:
            break
          for non_tree_model in other_models:

              # Get data for both models across all datasets
              tree_data = df_1[df_1['model'] == tree_model].set_index('dataset')[metric]
              non_tree_data = df_1[df_1['model'] == non_tree_model].set_index('dataset')[metric]

              # Align datasets (inner join - only datasets present for both models)
              combined = pd.DataFrame({
                  'tree': tree_data,
                  'non_tree': non_tree_data
              }).dropna()

              if len(combined) < 2:
                  print(f"{comparison_num:<3} {tree_model:<20} {non_tree_model:<20} Insufficient data")
                  comparison_num += 1
                  continue

              # Paired t-test
              t_stat, p_val = ttest_rel(combined['tree'], combined['non_tree'])

              # Calculate means and stds
              mean1 = combined['tree'].mean()
              mean2 = combined['non_tree'].mean()
              std1 = combined['tree'].std()
              std2 = combined['non_tree'].std()
              n = len(combined)

              sig = "< 0.001" if p_val < 0.001 else f"{p_val:.3f}"


              all_results.append({
                  'metric': metric,
                  'tree_model': tree_model,
                  'non_tree_model': non_tree_model,
                  'tree_mean': mean1,
                  'non_tree_mean': mean2,
                  'tree_std': std1,
                  'non_tree_std': std2,
                  'n_datasets': n,
                  't_statistic': t_stat,
                  'p_value': p_val
              })

              comparison_num += 1
          models = other_models.copy()

  # Summary
  print("\n" + "="*80)
  print("SUMMARY")
  print("="*80)
  results_df4 = pd.DataFrame(all_results)
  sig1[key] = results_df4
  significant_count = (results_df4['p_value'] < 0.05).sum()
  total_count = len(results_df4)
  print(f"\nSignificant comparisons (p < 0.05): {significant_count}/{total_count}")
  print(f"Tree models won in: {(results_df4['tree_mean'] > results_df4['non_tree_mean']).sum()} comparisons")

  # Save detailed results
  #results_df.to_csv('pairwise_comparison_results.csv', index=False)
  print("\nDetailed results saved to: pairwise_comparison_results.csv")
sig1["AllDatasets"] = results_df
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
cats = [f"{key} ({len(values)})" for key, values in DATASET_CATEGORIES.items()]
datasel = cats.copy()
datasel.extend(["AllDatasets (150)"])
def compare_ind(med, game, ed, bank, sci, social, ml, other, models_to_compare=None):
  metri = ["accuracy", "precision", "recall", "f1_score"]
  figs = []
  messages = []
  dropdowns = [med, game, ed, bank, sci, social, ml, other]
  selected_datasets_keys = []

  for cat_name, dropdown_values in zip(cats1, dropdowns):
      if dropdown_values:
          selected_datasets_keys.extend(get_keys(DATASET_CATEGORIES[cat_name], dropdown_values))

  if not models_to_compare:
      messages.append("No models selected. Displaying results for all models.")
      models_to_compare = models

  dataset_id_to_name = {id: name for category_dict in DATASET_CATEGORIES.values() for id, name in category_dict.items()}

  filtered_df_all_metrics = fmodel_results[
      (fmodel_results["dataset"].isin(selected_datasets_keys)) &
      (fmodel_results["model"].isin(models_to_compare))
  ].copy()

  for metric in metri:
      heatmap_data_metric = filtered_df_all_metrics.pivot_table(
          index='dataset', 
          columns='model', 
          values=metric   
      )

      heatmap_data_metric = heatmap_data_metric.rename(index=dataset_id_to_name)
      print(heatmap_data_metric)
      fig = plt.figure(figsize=(12, 8))
      sns.heatmap(heatmap_data_metric, annot=True, cmap="crest", fmt=".3f", cbar=True)
      plt.title(f"{metric} per Dataset and Model ({len(selected_datasets_keys)} datasets)")
      plt.ylabel("Dataset")
      plt.xlabel("Model")
      plt.tight_layout()
      figs.append(fig)



  return figs[0], figs[1], figs[2], figs[3], "\n".join(messages) if messages else "Comparison complete."
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
def compare_groups(data_choice, model1, model2):
  messages = []
  data1 = sig1[data_choice.split(' (')[0]]
  comparison_data = data1[
      ((data1['tree_model'] == model1) & (data1['non_tree_model'] == model2)) |
      ((data1['tree_model'] == model2) & (data1['non_tree_model'] == model1))
  ]

  if comparison_data.empty:
      fig = plt.figure(figsize=(10, 6))
      plt.close(fig)
      return fig, "No comparison data found for the selected models. \n Dont pick the same models."

  plot_data = []
  p_values_text = []

  for index, row in comparison_data.iterrows():
      metric = row['metric']
      if row['tree_model'] == model1:
          plot_data.append({'Metric': metric, 'Model': model1, 'Mean Score': row['tree_mean']})
          plot_data.append({'Metric': metric, 'Model': model2, 'Mean Score': row['non_tree_mean']})
      else: # if: row['tree_model'] == model2
          plot_data.append({'Metric': metric, 'Model': model1, 'Mean Score': row['non_tree_mean']})
          plot_data.append({'Metric': metric, 'Model': model2, 'Mean Score': row['tree_mean']})

      p_values_text.append(f"{metric} p-value: {row['p_value']:.5f} (Significant (cutoff = 0.05): {'Yes' if row['p_value'] < 0.05 else 'No'})")

  df_plot = pd.DataFrame(plot_data)

  fig = plt.figure(figsize=(10, 6))
  sns.barplot(x='Metric', y='Mean Score', hue='Model', data=df_plot)
  plt.title(f'Comparison of {model1} vs {model2} Across Metrics')
  plt.ylabel('Mean Score')
  plt.xlabel('Metric')
  plt.ylim(0, 1)
  plt.legend(title='Model')
  plt.tight_layout()

  return fig, "\n".join(p_values_text)
import gradio as gr
models = ["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression"]
with gr.Blocks() as demo:
  with gr.Tabs() as tabs:
    with gr.TabItem("Compaes groups of datasets"):
      with gr.Column():
        with gr.Row():
          gr.Markdown("# Comparing models")
          model1 = gr.Dropdown(["RandomForest", "DecisionTree", "KNN", "SVM", "LogisticRegression", "AllModels(not for AllData)"], label = "Model 1")
          model2 = gr.Dropdown(models, value = models[1], label = "Model 2")
          dataset = gr.Dropdown(datasel, label = "Datasets")
          compare_button = gr.Button("Compare")
        with gr.Row():
          gr.Markdown("## Results")
          #output_plot = gr.Gallery(label="Results")
          p_value = gr.Textbox(label="Model Output", lines = 4)
        with gr.Row():
          output_plot = gr.Plot(label="Results")
      compare_button.click(
          fn = compare_groups,
          inputs =[dataset, model1, model2],
          outputs = [output_plot, p_value]
      )
    with gr.TabItem("Compare Individual Tabs"):
      with gr.Column():
        with gr.Row():
          gr.Markdown("# Comparing models")
          model_choice = gr.Dropdown(models, label = "Model 1", multiselect = True)
          med = gr.Dropdown(DATASET_CATEGORIES[cats1[0]].values(), label = cats1[0],multiselect=True)
          game = gr.Dropdown(DATASET_CATEGORIES[cats1[1]].values(), label = cats1[1],multiselect=True)
          ed = gr.Dropdown(DATASET_CATEGORIES[cats1[2]].values(), label = cats1[2],multiselect=True)
        with gr.Row():
          bank = gr.Dropdown(DATASET_CATEGORIES[cats1[3]].values(), label = cats1[3],multiselect=True)
          sci = gr.Dropdown(DATASET_CATEGORIES[cats1[4]].values(), label = cats1[4],multiselect=True)
          social = gr.Dropdown(DATASET_CATEGORIES[cats1[5]].values(), label = cats1[5],multiselect=True)
          ml = gr.Dropdown(DATASET_CATEGORIES[cats1[6]].values(), label = cats1[6],multiselect=True)
          other = gr.Dropdown(DATASET_CATEGORIES[cats1[7]].values(), label = cats1[7],multiselect=True)
          compare_button = gr.Button("Compare")
        with gr.Row():
          #output_plot = gr.Gallery(label="Results")
          p1 = gr.Plot(label="Results")
        with gr.Row():
          p2 = gr.Plot(label="Results")
        with gr.Row():
          p3 = gr.Plot(label="Results")
        with gr.Row():
          p4 = gr.Plot(label="Results")
        with gr.Row():
          p_value = gr.Textbox(label="Model Output", lines = 4)
      compare_button.click(
          fn = compare_ind,
          inputs =[med, game, ed, bank, sci, social, ml, other,  model_choice],
          outputs = [p1, p2, p3, p4, p_value]
      )
demo.launch(share=True, show_error=True)