import gradio as gr
import pandas as pd
from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
from credit_models import real_data_credit_model, synthetic_data_credit_model
from visualization import (
plot_feature_distributions,
plot_comparative_credit_score_distribution,
plot_comparison_table,
plot_comparative_confusion_matrices,
plot_comparative_credit_score_distribution_by_actual_class,
plot_evaluation_table,
get_metrics_df,
)
# Fixed ordering of the credit-score classes, used for plot axes and tables.
LABEL_ORDER = ['Good', 'Standard', 'Poor']
# One display color per class (green / amber / red), keyed in LABEL_ORDER order.
COLOR_MAP = dict(zip(LABEL_ORDER, ['#28B463', '#F1C40F', '#E74C3C']))
# Name of the target column in the credit datasets.
TARGET = 'Credit_Score'
# Load and preprocess real data once at startup, so every dashboard
# interaction reuses the same real train/test split instead of re-reading CSVs.
real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('data/processed/v4/real_test_data.csv')
# Split each frame into features (X) and target (y) keyed by TARGET.
# Exact transformations live in data_preprocessing.preprocess_real_data.
X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
real_train, real_test, TARGET
)
# Train real-data model once at startup; its outputs are module-level so
# run_analysis() can compare against them on each click without retraining.
# NOTE(review): presumably real_scores are per-sample predicted credit scores
# and real_classification the predicted class labels on the test set —
# confirm against credit_models.real_data_credit_model.
real_scores, real_classification = real_data_credit_model(
X_real_train, y_real_train, X_real_test
)
def run_analysis():
    """Generate fresh synthetic data, train the synthetic-data model, and
    build every comparison figure shown on the dashboard.

    Returns:
        A 6-tuple of figures in the same order as the Gradio ``outputs``
        list bound to the run button: feature distributions, data-quality
        summary table, score distribution, score distribution by actual
        class, metrics comparison table, and confusion matrices.
    """
    # NOTE(review): this draws a THIRD of the real training-set size, not
    # "the same number of samples" as the old comment claimed — confirm the
    # intended ratio with the data-generation owner.
    synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train) / 3))
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

    # Split columns by dtype so the quality evaluation can apply the right
    # statistical test per column family.
    categorical_cols = [
        col for col in X_real_train.columns
        if X_real_train[col].dtype in ["object", "bool", "uint8"]
    ]
    numeric_cols = [
        col for col in X_real_train.columns
        if X_real_train[col].dtype in ["int64", "float64"]
    ]

    fig_feature_dist = plot_feature_distributions(X_real_train, X_synth_train)

    # Evaluate synthetic-data quality separately for each target class.
    summary_rows = []
    for cls in LABEL_ORDER:
        real_cls = X_real_train[y_real_train == cls]
        synth_cls = X_synth_train[y_synth_train == cls]
        ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff = evaluate_synthetic_data(
            real_cls, synth_cls, categorical_cols, numeric_cols
        )
        summary_rows.append({
            "ks_passed": ks_pass_rate == 1.0,
            "mean_ks_stat": round(mean_ks, 4),
            # chi_pass_rate may be None (e.g. nothing to test); propagate
            # None instead of coercing it to a misleading boolean.
            "chi_passed": chi_pass_rate == 1.0 if chi_pass_rate is not None else None,
            "mean_corr_diff": round(mean_corr_diff, 4),
        })
    summary_df = pd.DataFrame(summary_rows, index=LABEL_ORDER)
    summary = plot_evaluation_table(summary_df)

    # Train the synthetic-data model but score it on the REAL test set, so
    # both models are compared on identical held-out data.
    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test, real_scores, synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test, real_classification, synth_classification,
        labels=LABEL_ORDER,
    )

    # Bug fix: the previous version also returned a 7th value (a rounded
    # metrics DataFrame) that the click handler never bound to any output
    # component, making Gradio fail on the return/outputs count mismatch.
    # Return exactly the six figures the outputs list expects.
    return fig_feature_dist, summary, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm
def _plot_section(description_md):
    """Emit a markdown blurb followed by a full-width empty plot placeholder.

    Returns the created gr.Plot so it can be wired up as a click-handler
    output component.
    """
    gr.Markdown(description_md)
    with gr.Row():
        return gr.Plot(label='')


with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    # Page header and the single action button that drives the whole app.
    gr.Markdown(
        """
# Credit Score Model Dashboard
Compare a **Real-Data Model** vs a **Synthetic-Data Model** trained with CTGAN-generated data.
Click the button to regenerate synthetic data and retrain the synthetic model.
"""
    )
    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    # One section per figure; creation order matches the page layout.
    plot_feature_dist = _plot_section(
        """
## Feature Distribution Comparison
Below are the distributions of the features in the real vs synthetic training datasets.
"""
    )
    plot_summary = _plot_section(
        """
## Generated Data Quality Summary
Below is a summary of the data quality evaluation comparing the synthetic training data to the real training data across multiple metrics.
"""
    )
    plot_metrics = _plot_section(
        """
## Credit Models Metrics
Below are the metrics for the real-data and synthetic-data models.
"""
    )
    plot_score_dist = _plot_section(
        """
## Credit Score Distribution Comparison
Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
"""
    )
    plot_score_by_class = _plot_section(
        """
## Credit Score Distribution by Actual Class
Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
"""
    )
    plot_cm = _plot_section(
        """
## Confusion Matrix Comparison
Below are the confusion matrices for the real-data and synthetic-data models.
"""
    )

    # NOTE: output order is positional and intentionally differs from the
    # on-page creation order — it must match run_analysis's return order.
    run_btn.click(
        fn=run_analysis,
        inputs=[],
        outputs=[
            plot_feature_dist,
            plot_summary,
            plot_score_dist,
            plot_score_by_class,
            plot_metrics,
            plot_cm,
        ],
    )

demo.launch()