{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": { "provenance": [] },
  "kernelspec": { "name": "python3", "display_name": "Python 3" },
  "language_info": { "name": "python" }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "2YpCZ5QwOGCL" },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import json\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "1BpsBFfLZbW8" },
   "outputs": [],
   "source": [
    "# Read the dataset\n",
    "df = pd.read_json(\"DATA.json\", encoding=\"utf-8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "lTijWIiEVjVc" },
   "outputs": [],
   "source": [
    "# Shuffle the dataset.\n",
    "# A fixed random_state makes the shuffle (and therefore the train/test\n",
    "# split below) reproducible across kernel restarts.\n",
    "df = shuffle(df, random_state=42).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "RDTKsDCsWGRk" },
   "outputs": [],
   "source": [
    "# Create a training set (80%) and a test set (20%)\n",
    "split = int(len(df) * 0.8)\n",
    "train_data = df[:split]\n",
    "test_data = df[split:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "rta9jPyxVm-I" },
   "outputs": [],
   "source": [
    "# Build the bag-of-words representation from the TRAINING set only.\n",
    "# Fitting the vectorizer on the full dataset would leak test-set\n",
    "# vocabulary into training. CountVectorizer also counts whole tokens,\n",
    "# unlike str.count(), which matches substrings inside other words\n",
    "# (e.g. \"cat\" inside \"catalog\"), and it returns a sparse matrix,\n",
    "# avoiding the dense |docs| x |vocab| Python loop.\n",
    "vectorizer = CountVectorizer()\n",
    "bow_train = vectorizer.fit_transform(train_data[\"content\"].fillna(\"\").astype(str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "Blc4X_qNV72O" },
   "outputs": [],
   "source": [
    "# Transform the test set with the vectorizer fitted on the training set;\n",
    "# words unseen during training are simply ignored.\n",
    "bow_test = vectorizer.transform(test_data[\"content\"].fillna(\"\").astype(str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "pe1Y1uNEWRek" },
   "outputs": [],
   "source": [
    "# Convert the labels to numeric values. Fit the encoder on ALL labels so\n",
    "# transform() cannot raise on a class that only appears in the test split\n",
    "# (this is not leakage: the set of class names is known up front).\n",
    "label_encoder = LabelEncoder()\n",
    "label_encoder.fit(df[\"label\"])\n",
    "train_labels = label_encoder.transform(train_data[\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "pv1ps_IwWV8M" },
   "outputs": [],
   "source": [
    "# Create a Naive Bayes model\n",
    "model = MultinomialNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "JYIiFXYdWY73" },
   "outputs": [],
   "source": [
    "# Train the model (MultinomialNB accepts the sparse matrix directly)\n",
    "model.fit(bow_train, train_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "gDcK0uiWWeKc" },
   "outputs": [],
   "source": [
    "# Test the model\n",
    "test_labels = label_encoder.transform(test_data[\"label\"])\n",
    "predictions = model.predict(bow_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "eBMyVM5hWjiL" },
   "outputs": [],
   "source": [
    "# Calculate the accuracy of the model\n",
    "accuracy = (predictions == test_labels).mean()\n",
    "print(\"Accuracy:\", accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "TFH--bOmWpEU" },
   "outputs": [],
   "source": [
    "def test_model(text):\n",
    "    \"\"\"Classify a single piece of text and return the decoded label.\n",
    "\n",
    "    Uses the same fitted vectorizer as training, so tokenization and\n",
    "    vocabulary are guaranteed to match what the model was trained on.\n",
    "    \"\"\"\n",
    "    bow = vectorizer.transform([text])\n",
    "    prediction = model.predict(bow)[0]\n",
    "    return label_encoder.inverse_transform([prediction])[0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "id": "GDL4OZ_5Wsc7" },
   "outputs": [],
   "source": [
    "# Test the model with some text\n",
    "text = \"Aura Azure Collagen Gummies Advantages, Official Website & Reviews [2023]\"\n",
    "prediction = test_model(text)\n",
    "print(prediction)"
   ]
  }
 ]
}