{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "2YpCZ5QwOGCL"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "from collections import Counter\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from sklearn.naive_bayes import MultinomialNB\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.utils import shuffle\n",
        "\n",
        "# Tunable settings, kept in one place\n",
        "SEED = 42             # fixes the shuffle so the train/test split is reproducible\n",
        "TRAIN_FRACTION = 0.8  # share of the shuffled rows used for training\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Read the dataset\n",
        "df = pd.read_json(\"DATA.json\", encoding=\"utf-8\")"
      ],
      "metadata": {
        "id": "1BpsBFfLZbW8"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Shuffle the dataset (seeded, so a fresh Restart & Run All gives the same row order)\n",
        "df = shuffle(df, random_state=SEED)"
      ],
      "metadata": {
        "id": "lTijWIiEVjVc"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Create a training set and a test set:\n",
        "# the first TRAIN_FRACTION of the shuffled rows train the model, the rest are held out\n",
        "split_at = int(len(df) * TRAIN_FRACTION)\n",
        "train_data = df.iloc[:split_at]\n",
        "test_data = df.iloc[split_at:]"
      ],
      "metadata": {
        "id": "RDTKsDCsWGRk"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Create a vocabulary of all the words in the training set.\n",
        "# Only train_data is scanned so that no test-set words leak into the feature space.\n",
        "vocabulary = set()\n",
        "for d in train_data[\"content\"]:\n",
        "    if isinstance(d, str):  # skip missing / non-text entries\n",
        "        vocabulary.update(d.split())\n",
        "\n",
        "# Set iteration order of strings can differ between kernel sessions,\n",
        "# so the feature columns follow this sorted list instead.\n",
        "feature_words = sorted(vocabulary)"
      ],
      "metadata": {
        "id": "rta9jPyxVm-I"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def bag_of_words(docs, words):\n",
        "    \"\"\"Count whole-token occurrences of each word in each document.\n",
        "\n",
        "    Args:\n",
        "        docs: iterable of documents; each one is converted with str() and\n",
        "            split on whitespace, the same tokenisation used for the vocabulary.\n",
        "        words: ordered sequence of words; one output column per word.\n",
        "\n",
        "    Returns:\n",
        "        Integer array of shape (number of docs, len(words)).\n",
        "    \"\"\"\n",
        "    rows = []\n",
        "    for doc in docs:\n",
        "        # Counter returns 0 for absent words. Counting tokens (rather than\n",
        "        # str.count) avoids matching substrings such as 'a' inside 'banana'.\n",
        "        counts = Counter(str(doc).split())\n",
        "        rows.append([counts[word] for word in words])\n",
        "    # reshape keeps the result 2D even when there are no documents or no words\n",
        "    return np.array(rows, dtype=np.int64).reshape(len(rows), len(words))"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert the list of text documents to a 2D array for the training set\n",
        "bow_train = bag_of_words(train_data[\"content\"].fillna(\"\"), feature_words)\n"
      ],
      "metadata": {
        "id": "owPao6jdVprA"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert the list of text documents to a 2D array for the test set\n",
        "# (same columns as bow_train; words unseen in training are ignored)\n",
        "bow_test = bag_of_words(test_data[\"content\"].fillna(\"\"), feature_words)"
      ],
      "metadata": {
        "id": "Blc4X_qNV72O"
      },
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert the labels to numeric values\n",
        "label_encoder = LabelEncoder()\n",
        "train_labels = label_encoder.fit_transform(train_data[\"label\"])"
      ],
      "metadata": {
        "id": "pe1Y1uNEWRek"
      },
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Create a Naive Bayes model\n",
        "model = MultinomialNB()"
      ],
      "metadata": {
        "id": "pv1ps_IwWV8M"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train the model\n",
        "model.fit(bow_train, train_labels)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 75
        },
        "id": "JYIiFXYdWY73",
        "outputId": "a698f8a6-eeda-49ab-c434-f438c6967ad5"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "MultinomialNB()"
            ],
            "text/html": [
              "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()