{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://www.kaggle.com/datasets/basilb2s/language-detection\n",
    "# https://github.com/basil-b2s/Language-Detector\n",
    "\n",
    "# -n: never overwrite files that were already extracted, so re-running this\n",
    "# cell does not hit unzip's interactive overwrite prompt.\n",
    "!unzip -n archive.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel only needs it run once.\n",
    "import re\n",
    "import warnings\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import sklearn\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.3.0'"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sklearn.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('language_detection.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = data['Text']\n",
    "y = data['Language']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "le = LabelEncoder()\n",
    "# Encode straight from the DataFrame column (not from `y`) so that re-running\n",
    "# this cell never re-fits the encoder on already-encoded integers.\n",
    "y = le.fit_transform(data['Language'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',\n",
       " 'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',\n",
       " 'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Characters treated as noise: punctuation, digits, newlines, square brackets.\n",
    "# `\\n` must stay escaped -- a bare `n` in the class would blank every letter \"n\".\n",
    "# The brackets are escaped so that each one matches on its own, not only as \"[]\".\n",
    "NOISE_PATTERN = re.compile(r'[!@#$(),\\n\"%^*?:;~`0-9\\[\\]]')\n",
    "\n",
    "\n",
    "def clean_text(text):\n",
    "    \"\"\"Return `text` lowercased, with every noise character replaced by a space.\"\"\"\n",
    "    return NOISE_PATTERN.sub(' ', text).lower()\n",
    "\n",
    "\n",
    "# One cleaned string per row of X, in the same order as the labels in `y`.\n",
    "data_list = [clean_text(text) for text in X]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the cleaned corpus (not the raw column) so the cleaning step above\n",
    "# actually reaches the model. The text splits get their own names so the\n",
    "# vectorizer cell below can be re-run without consuming its own output.\n",
    "text_train, text_test, y_train, y_test = train_test_split(\n",
    "    data_list, y, test_size=0.2, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv = CountVectorizer()\n",
    "# Learn the vocabulary from the training split only, then reuse it for the test split.\n",
    "# NOTE(review): .toarray() densifies the count matrix; kept because later cells may\n",
    "# rely on dense arrays -- confirm before dropping it to save memory.\n",
    "X_train = cv.fit_transform(text_train).toarray()\n",
    "X_test = cv.transform(text_test).toarray()"
   ]
  },
  { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()