import gradio as gr
import pandas as pd

from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
from credit_models import real_data_credit_model, synthetic_data_credit_model
from visualization import (
    plot_feature_distributions,
    plot_comparative_credit_score_distribution,
    plot_comparison_table,
    plot_comparative_confusion_matrices,
    plot_comparative_credit_score_distribution_by_actual_class,
    plot_evaluation_table,
    get_metrics_df,
)

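# Shared plot styling: class colors and a fixed ordering for the three labels.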
COLOR_MAP = {
    'Good': '#28B463',
    'Standard': '#F1C40F',
    'Poor': '#E74C3C',
}

LABEL_ORDER = ['Good', 'Standard', 'Poor']
TARGET = 'Credit_Score'

# Load and preprocess real data once at startup
real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('data/processed/v4/real_test_data.csv')

X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
    real_train, real_test, TARGET
)

# Train real-data model once at startup
real_scores, real_classification = real_data_credit_model(
    X_real_train, y_real_train, X_real_test
)

def run_analysis():
    """Generate new synthetic data, train the synthetic model, and return all comparison plots."""
    synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train)/3)) # Same number of samples as real training data
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

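    # Split columns by dtype for the quality checks: KS statistics are computed
    # over numeric columns and chi-squared tests over categorical ones.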
    categorical_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["object", "bool", "uint8"]]
    numeric_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["int64", "float64"]]

    fig_feature_dist = plot_feature_distributions(
        X_real_train, X_synth_train
    )

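    # Compare real vs. synthetic feature distributions class by class and
    # collect pass/fail summaries for the quality table.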
    summary_rows = []
    for cls in LABEL_ORDER:
        real_cls = X_real_train[y_real_train == cls]
        synth_cls = X_synth_train[y_synth_train == cls]
        ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff = evaluate_synthetic_data(
            real_cls, synth_cls, categorical_cols, numeric_cols
        )
        summary_rows.append({
            "ks_passed": ks_pass_rate == 1.0,
            "mean_ks_stat": round(mean_ks, 4),
            "chi_passed": chi_pass_rate == 1.0 if chi_pass_rate is not None else None,
            "mean_corr_diff": round(mean_corr_diff, 4),
        })
    summary_df = pd.DataFrame(summary_rows, index=LABEL_ORDER)
    summary = plot_evaluation_table(summary_df)

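    # Train the synthetic-data model and score it on the same held-out real
    # test set used by the real-data model, so the comparison is like-for-like.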
    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test, real_scores, synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test, real_classification, synth_classification,
        labels=LABEL_ORDER,
    )

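    # Detailed metrics as a DataFrame, shown in the dashboard's metrics table.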
    metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
    metrics_df = metrics_df.round(4)

    return fig_feature_dist, summary, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm, metrics_df


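# Dashboard layout: one action button, then one section per comparison figure.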
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Credit Score Model Dashboard
        Compare a **Real-Data Model** against a **Synthetic-Data Model** trained on CTGAN-generated data.
        Click the button below to regenerate the synthetic data and retrain the synthetic-data model.
        """
    )

    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    gr.Markdown(
    """
    ## Feature Distribution Comparison

    Below are the feature distributions of the real and synthetic training datasets.
    """
    )

    with gr.Row():
        plot_feature_dist = gr.Plot(label='')

    gr.Markdown(
    """
    ## Generated Data Quality Summary

    Below is a per-class summary of data-quality metrics comparing the synthetic training data to the real training data.
    """
    )

    with gr.Row():
        plot_summary = gr.Plot(label='')

    gr.Markdown(
    """
    ## Credit Model Metrics

    Below are the evaluation metrics for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_metrics = gr.Plot(label='')

    gr.Markdown(
    """
    ## Credit Score Distribution Comparison
    Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_score_dist = gr.Plot(label='')

    gr.Markdown(
    """
    ## Credit Score Distribution by Actual Class
    Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
    """
    )

    with gr.Row():
        plot_score_by_class = gr.Plot(label='')

    gr.Markdown(
    """
    ## Confusion Matrix Comparison
    Below are the confusion matrices for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_cm = gr.Plot(label='')
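
    gr.Markdown(
    """
    ## Detailed Metrics Table
    Below is the full metrics table for both models, rounded to four decimal places.
    """
    )

    with gr.Row():
        # Tabular view of the metrics DataFrame returned by run_analysis.
        metrics_table = gr.Dataframe(label='')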

    # Wire the button: run_analysis returns one value per output component,
    # in the same order as listed here.
    run_btn.click(
        fn=run_analysis,
        inputs=[],
        outputs=[
            plot_feature_dist,
            plot_summary,
            plot_score_dist,
            plot_score_by_class,
            plot_metrics,
            plot_cm,
            metrics_table,
        ],
    )

if __name__ == "__main__":
    demo.launch()