{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data preparation for the `Tag` classifier\n", "\n", "Reads `train_lr.csv` and `test_lr.csv`, one-hot encodes the feature columns over both files together, label-encodes `Tag`, and holds out 5% of the training rows as a validation split." ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "import random\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import f1_score\n", "from sklearn.preprocessing import LabelEncoder" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# Seed Python's and NumPy's global RNGs; SEED is also passed as random_state to the split below.\n", "SEED = 1\n", "random.seed(SEED)\n", "np.random.seed(SEED)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "# Sort both files by the same keys before stacking them.\n", "SORT_KEYS = ['PoS_word', 'Tag', 'Affix']\n", "train = pd.read_csv('train_lr.csv').sort_values(by=SORT_KEYS)\n", "test = pd.read_csv('test_lr.csv').sort_values(by=SORT_KEYS)\n", "# Train rows are placed first, so the first len(train) rows of df are the training rows.\n", "df = pd.concat([train, test], ignore_index=True)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "FEATURE_COLS = ['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']\n", "X = df[FEATURE_COLS]\n", "# Keep the unencoded target under its own name; y is reserved for the encoded array.\n", "y_raw = df['Tag']" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# One-hot encode train and test rows together so they end up with the same dummy columns.\n", "X_pr = pd.get_dummies(X)\n", "# Encode from y_raw, not from y: re-running this cell then refits the encoder on the\n", "# original tag values instead of on already-encoded integers.\n", "le = LabelEncoder()\n", "y = le.fit_transform(y_raw)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "# df was built as [train, test], so the first n_train rows of X_pr / y are the training rows.\n", "n_train = len(train)\n", "train_X = X_pr.iloc[:n_train]\n", "train_y = y[:n_train]\n", "# Hold out 5% of the training rows for validation.\n", "train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
RandomForestClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=1)