{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "h4i-iuO3h9Ne"
      },
      "outputs": [],
      "source": [
        "# Download and unpack the UCI Online Shoppers Purchasing Intention dataset.\n",
        "# -o lets unzip overwrite existing files so re-running this cell never blocks on a prompt.\n",
        "!wget -O dataset.zip https://archive.ics.uci.edu/static/public/468/online+shoppers+purchasing+intention+dataset.zip\n",
        "!unzip -o dataset.zip\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "\n",
        "# CSV extracted from dataset.zip by the previous cell.\n",
        "path = 'online_shoppers_intention.csv'\n",
        "df = pd.read_csv(path)\n",
        "\n",
        "df.head()\n"
      ],
      "metadata": {
        "id": "lb0cbEB-iJgL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Report how many exact duplicate rows exist before removing them.\n",
        "print(\"Duplicate rows:\", df.duplicated().sum())\n",
        "\n",
        "# Reassign instead of mutating in place so the cell is safe to re-run.\n",
        "df = df.drop_duplicates()\n",
        "df.head()"
      ],
      "metadata": {
        "id": "yiThYtQJioWN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import LabelEncoder\n",
        "\n",
        "# A LabelEncoder only remembers the classes from its most recent fit, so each column\n",
        "# gets its own encoder; sharing one instance would leave it usable for 'Revenue' only.\n",
        "ENCODED_COLUMNS = ['Month', 'VisitorType', 'Weekend', 'Revenue']\n",
        "\n",
        "# Encode into a copy so that re-running this cell always starts from the raw values.\n",
        "df_encoded = df.copy()\n",
        "label_encoders = {}\n",
        "for column in ENCODED_COLUMNS:\n",
        "    label_encoders[column] = LabelEncoder()\n",
        "    df_encoded[column] = label_encoders[column].fit_transform(df_encoded[column])\n",
        "\n",
        "# Kept for backward compatibility: `le` previously ended up fitted on 'Revenue'.\n",
        "le = label_encoders['Revenue']\n",
        "\n",
        "df_encoded.head()"
      ],
      "metadata": {
        "id": "RmmoJySljbP3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split the encoded frame into the feature matrix and the target column.\n",
        "X = df_encoded.drop('Revenue', axis=1)\n",
        "y = df_encoded['Revenue']\n",
        "\n",
        "print(\"Shape of X:\", X.shape)\n",
        "print(\"Shape of y:\", y.shape)"
      ],
      "metadata": {
        "id": "4n0BTlPVjgTR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
      ],
      "metadata": {
        "id": "t1JGP2rMjjZS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.feature_selection import SelectKBest, chi2\n",
        "\n",
        "# Number of features to keep. chi2 only accepts non-negative feature values.\n",
        "k = 10\n",
        "select_kbest = SelectKBest(chi2, k=k)\n",
        "\n",
        "# Fit the selector on the training split only, then reuse it for the test split.\n",
        "X_train_selected = select_kbest.fit_transform(X_train, y_train)\n",
        "X_test_selected = select_kbest.transform(X_test)\n",
        "\n",
        "selected_feature_indices = select_kbest.get_support(indices=True)\n",
        "selected_feature_names = X.columns[selected_feature_indices]\n",
        "\n",
        "print(\"Original number of features:\", X_train.shape[1])\n",
        "print(\"Number of selected features:\", X_train_selected.shape[1])\n",
        "print(\"Selected feature names:\", selected_feature_names.tolist())"
      ],
      "metadata": {
        "id": "4Oc7G6eXsNwd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Train and evaluate the decision tree"
      ],
      "metadata": {
        "id": "U-OLrWOtpe-f"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.tree import DecisionTreeClassifier\n",
        "from sklearn.metrics import accuracy_score\n",
        "\n",
        "dt_model = DecisionTreeClassifier(random_state=42)\n",
        "\n",
        "dt_model.fit(X_train_selected, y_train)\n",
        "\n",
        "y_pred = dt_model.predict(X_test_selected)\n",
        "\n",
        "accuracy = accuracy_score(y_test, y_pred)\n",
        "print(f\"Accuracy of the Decision Tree model: {accuracy:.4f}\")"
      ],
      "metadata": {
        "id": "rS6EiJcTspiw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score\n",
        "\n",
        "# ROC AUC is a ranking metric: it needs the positive-class probability, not the hard\n",
        "# 0/1 prediction, otherwise the curve collapses to a single operating point.\n",
        "y_score = dt_model.predict_proba(X_test_selected)[:, 1]\n",
        "\n",
        "print(\"Classification Report:\")\n",
        "print(classification_report(y_test, y_pred))\n",
        "\n",
        "print(\"Confusion Matrix:\")\n",
        "print(confusion_matrix(y_test, y_pred))\n",
        "\n",
        "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
        "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n",
        "print(f\"Recall: {recall_score(y_test, y_pred):.4f}\")\n",
        "print(f\"F1 Score: {f1_score(y_test, y_pred):.4f}\")\n",
        "print(f\"ROC AUC Score: {roc_auc_score(y_test, y_score):.4f}\")\n"
      ],
      "metadata": {
        "id": "bXkYkEG7tI63"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay\n",
        "\n",
        "\n",
        "def _finish_figure(save_path=None):\n",
        "    \"\"\"Show the current figure, or save it to `save_path` and close it.\"\"\"\n",
        "    if save_path is None:\n",
        "        plt.show()\n",
        "    else:\n",
        "        plt.savefig(save_path)\n",
        "        plt.close()\n",
        "\n",
        "\n",
        "def plot_evaluation(model, X_eval, y_true, y_hat, y_prob, save_dir=None):\n",
        "    \"\"\"Draw the confusion matrix, ROC curve and metric bar chart for a fitted classifier.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    model : fitted classifier exposing `classes_`\n",
        "    X_eval : features passed to `RocCurveDisplay.from_estimator`\n",
        "    y_true : true labels\n",
        "    y_hat : hard class predictions for `X_eval`\n",
        "    y_prob : positive-class scores for `X_eval`, used for ROC AUC\n",
        "    save_dir : directory to write PNG files into; when None the figures are shown instead\n",
        "    \"\"\"\n",
        "    def target(filename):\n",
        "        return None if save_dir is None else os.path.join(save_dir, filename)\n",
        "\n",
        "    if save_dir is not None:\n",
        "        os.makedirs(save_dir, exist_ok=True)\n",
        "\n",
        "    cm = confusion_matrix(y_true, y_hat, labels=model.classes_)\n",
        "    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_).plot()\n",
        "    plt.title('Confusion Matrix')\n",
        "    _finish_figure(target('confusion_matrix.png'))\n",
        "\n",
        "    RocCurveDisplay.from_estimator(model, X_eval, y_true)\n",
        "    plt.title('ROC Curve')\n",
        "    _finish_figure(target('roc_curve.png'))\n",
        "\n",
        "    metrics = {\n",
        "        'Accuracy': accuracy_score(y_true, y_hat),\n",
        "        'Precision': precision_score(y_true, y_hat),\n",
        "        'Recall': recall_score(y_true, y_hat),\n",
        "        'F1 Score': f1_score(y_true, y_hat),\n",
        "        'ROC AUC Score': roc_auc_score(y_true, y_prob),\n",
        "    }\n",
        "    plt.figure(figsize=(8, 6))\n",
        "    sns.barplot(x=list(metrics.keys()), y=list(metrics.values()))\n",
        "    plt.title('Classification Metrics')\n",
        "    plt.ylabel('Score')\n",
        "    plt.ylim(0, 1)\n",
        "    _finish_figure(target('classification_metrics_bar_plot.png'))\n",
        "\n",
        "\n",
        "plot_evaluation(dt_model, X_test_selected, y_test, y_pred, y_score)"
      ],
      "metadata": {
        "id": "G3y3pR9athJo"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import joblib\n",
        "\n",
        "joblib.dump(dt_model, 'decision_tree_model.pkl')\n",
        "\n",
        "# Stores the full {column name: fitted LabelEncoder} mapping so that every encoded\n",
        "# column can be transformed again at inference time.\n",
        "joblib.dump(label_encoders, 'label_encoder.pkl')\n",
        "\n",
        "joblib.dump(select_kbest, 'selectkbest_transformer.pkl')"
      ],
      "metadata": {
        "id": "9vMfYnua7G5j"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load every artifact back from disk so this cell exercises exactly what was saved.\n",
        "# joblib files are pickles: only load files you created yourself.\n",
        "loaded_model = joblib.load('decision_tree_model.pkl')\n",
        "loaded_encoders = joblib.load('label_encoder.pkl')\n",
        "loaded_selector = joblib.load('selectkbest_transformer.pkl')\n",
        "\n",
        "manual_data = {\n",
        "    'Administrative': [0],\n",
        "    'Administrative_Duration': [0],\n",
        "    'Informational': [0],\n",
        "    'Informational_Duration': [0],\n",
        "    'ProductRelated': [1],\n",
        "    'ProductRelated_Duration': [100],\n",
        "    'BounceRates': [0.0],\n",
        "    'ExitRates': [0.0],\n",
        "    'PageValues': [0.0],\n",
        "    'SpecialDay': [0.0],\n",
        "    'Month': ['Feb'],\n",
        "    'OperatingSystems': [1],\n",
        "    'Browser': [1],\n",
        "    'Region': [1],\n",
        "    'TrafficType': [1],\n",
        "    'VisitorType': ['New_Visitor'],\n",
        "    # Boolean, not the string 'False': the encoder must see the same value type it was fitted on.\n",
        "    'Weekend': [False]\n",
        "}\n",
        "\n",
        "manual_df = pd.DataFrame(manual_data)\n",
        "\n",
        "# Encode each categorical feature with the encoder that was fitted on that same column.\n",
        "for column in ['Month', 'VisitorType', 'Weekend']:\n",
        "    manual_df[column] = loaded_encoders[column].transform(manual_df[column])\n",
        "\n",
        "# Reorder the columns to match training before applying the fitted feature selector.\n",
        "manual_df_selected = loaded_selector.transform(manual_df[X.columns])\n",
        "manual_prediction = loaded_model.predict(manual_df_selected)\n",
        "manual_prediction_proba = loaded_model.predict_proba(manual_df_selected)\n",
        "\n",
        "print(\"Manual data prediction:\", manual_prediction)\n",
        "print(\"Manual data prediction probabilities:\", manual_prediction_proba)"
      ],
      "metadata": {
        "id": "MydKkb8L7NJv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Write the same three evaluation figures to images/ instead of displaying them.\n",
        "plot_evaluation(dt_model, X_test_selected, y_test, y_pred, y_score, save_dir='images')\n"
      ],
      "metadata": {
        "id": "zGSQrq7XDyn5"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}