Create app.py
app.py
ADDED
@@ -0,0 +1,164 @@
import gradio as gr
import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split

theme = gr.themes.Monochrome(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
)
model_card = """
## Description

**Gradient boosting** is a machine learning technique that combines many regression trees into a powerful model in an iterative fashion.
**Early stopping** is a technique used in **gradient boosting** to find the smallest number of iterations needed to build a model that generalizes well to new data.
It works by holding out a validation set and evaluating the model on it after each stage of tree building.
Training stops once the model's validation score has not improved for a specified number of stages.
Early stopping can significantly reduce training time, memory usage, and prediction latency while achieving almost the same accuracy as a model built without it using many more estimators.
You can play around with different values of ``number of samples`` and ``number of estimators`` to see the effect.
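
In scikit-learn, this behaviour is controlled by three constructor arguments, shown here in a minimal sketch of the configuration this demo uses internally:

```python
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(
    n_estimators=100,         # upper bound on the number of boosting stages
    validation_fraction=0.2,  # hold out 20% of the training data for validation
    n_iter_no_change=5,       # stop after 5 consecutive stages without improvement
    tol=0.01,                 # minimum score gain that counts as an improvement
)
```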

## Dataset

Iris dataset, Classification dataset, Hastie dataset
"""


def do_train(n_samples, n_estimators, progress=gr.Progress()):

    data_list = [
        datasets.load_iris(return_X_y=True),
        datasets.make_classification(n_samples=n_samples, random_state=0),
        datasets.make_hastie_10_2(n_samples=n_samples, random_state=0),
    ]
    names = ["Iris Data", "Classification Data", "Hastie Data"]
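    # Note: the Iris dataset always has 150 samples, so the ``n_samples``
    # slider only affects the two synthetic datasets.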

    # Per-dataset results: number of fitted estimators, test score, and fit
    # time for the plain model (gb) and the early-stopping model (gbes).
    n_gb = []
    score_gb = []
    time_gb = []
    n_gbes = []
    score_gbes = []
    time_gbes = []

    for X, y in progress.tqdm(data_list):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=0
        )
        # Stop fitting additional stages if the validation score does not
        # improve by at least 0.01 for 5 consecutive stages.
        gbes = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            validation_fraction=0.2,
            n_iter_no_change=5,
            tol=0.01,
            random_state=0,
        )
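        # Baseline with the same estimator budget but no early stopping.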
        gb = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators, random_state=0
        )
        start = time.time()
        gb.fit(X_train, y_train)
        time_gb.append(time.time() - start)

        start = time.time()
        gbes.fit(X_train, y_train)
        time_gbes.append(time.time() - start)

        score_gb.append(gb.score(X_test, y_test))
        score_gbes.append(gbes.score(X_test, y_test))

        # n_estimators_ is the number of stages actually fitted; with early
        # stopping it can be well below the requested n_estimators.
        n_gb.append(gb.n_estimators_)
        n_gbes.append(gbes.n_estimators_)

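    # --- Figure 1: test score per dataset, with and without early stopping ---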
    bar_width = 0.2
    n = len(data_list)
    index = np.arange(n) * bar_width * 2.5

    fig1, axes1 = plt.subplots(figsize=(9, 5))

    bar1 = axes1.bar(
        index, score_gb, bar_width, label="Without early stopping", color="crimson"
    )
    bar2 = axes1.bar(
        index + bar_width, score_gbes, bar_width, label="With early stopping", color="coral"
    )
    axes1.set_xticks(index + bar_width, names)
    axes1.set_yticks(np.arange(0, 1.3, 0.1))

    def autolabel(ax, rects, n_estimators):
        """
        Attach a text label above each bar displaying n_estimators of each model
        """
        for i, rect in enumerate(rects):
            ax.text(
                rect.get_x() + rect.get_width() / 2.0,
                1.05 * rect.get_height(),
                "n_est=%d" % n_estimators[i],
                ha="center",
                va="bottom",
            )

    autolabel(axes1, bar1, n_gb)
    autolabel(axes1, bar2, n_gbes)

    axes1.set_xlabel("Datasets")
    axes1.set_ylabel("Test score")
    axes1.set_ylim([0, 1.3])
    axes1.legend(loc="best")
    axes1.grid(True)

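    # --- Figure 2: fit time per dataset, with and without early stopping ---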
+
fig2, axes2 = plt.subplots(figsize=(9, 5))
|
| 117 |
+
|
| 118 |
+
bar1 = axes2.bar(
|
| 119 |
+
index, time_gb, bar_width, label="Without early stopping", color="crimson"
|
| 120 |
+
)
|
| 121 |
+
bar2 = axes2.bar(
|
| 122 |
+
index + bar_width, time_gbes, bar_width, label="With early stopping", color="coral"
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
max_y = np.amax(np.maximum(time_gb, time_gbes))
|
| 126 |
+
|
| 127 |
+
axes2.set_xticks(index + bar_width, names)
|
| 128 |
+
axes2.set_yticks(np.linspace(0, 1.3 * max_y, 13))
|
| 129 |
+
|
| 130 |
+
autolabel(axes2, bar1, n_gb)
|
| 131 |
+
autolabel(axes2, bar2, n_gbes)
|
| 132 |
+
|
| 133 |
+
axes2.set_ylim([0, 1.3 * max_y])
|
| 134 |
+
axes2.legend(loc="best")
|
| 135 |
+
axes2.grid(True)
|
| 136 |
+
|
| 137 |
+
axes2.set_xlabel("Datasets")
|
| 138 |
+
axes2.set_ylabel("Fit Time")
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
return fig1, fig2
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
|
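# Build the Gradio UI: two sliders drive the training and the two output plots.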
with gr.Blocks(theme=theme) as demo:
    gr.Markdown('''
    <div>
    <h1 style='text-align: center'>Early stopping of Gradient Boosting</h1>
    </div>
    ''')
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the example from <a href=\"https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_early_stopping.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-early-stopping-py\">scikit-learn</a>.")
    n_samples = gr.Slider(minimum=500, maximum=10000, step=500, value=1000, label="Number of samples")
    n_estimators = gr.Slider(minimum=50, maximum=300, step=50, value=100, label="Number of estimators")
    with gr.Row():
        with gr.Column():
            plot1 = gr.Plot(label="Test score")
        with gr.Column():
            plot2 = gr.Plot(label="Running time")

    # Retrain and redraw whenever either slider changes.
    n_samples.change(fn=do_train, inputs=[n_samples, n_estimators], outputs=[plot1, plot2])
    n_estimators.change(fn=do_train, inputs=[n_samples, n_estimators], outputs=[plot1, plot2])

# queue() replaces the deprecated launch(enable_queue=True) argument.
demo.queue().launch()