{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tag prediction with a random forest\n",
    "\n",
    "Fits a `RandomForestClassifier` on one-hot encoded `Word`, `Root`, `Affix`, `PoS_root` and `PoS_word` columns, scores it with micro-averaged F1 on a 5% hold-out of `train_lr.csv`, and writes the predicted `Tag` for every row of `test_lr.csv` to `my_submission2.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEED = 1\n",
    "VAL_FRACTION = 0.05  # share of the training rows held out for validation\n",
    "N_ESTIMATORS = 100   # number of trees in the forest\n",
    "\n",
    "random.seed(SEED)\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
    "test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
    "\n",
    "# Train rows come first in the stacked frame, so later cells can split it back by position.\n",
    "df = pd.concat([train, test], ignore_index=True)\n",
    "n_train = train.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Encode features and target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']]\n",
    "y = df['Tag']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encode train and test together so both share the same dummy columns.\n",
    "X_pr = pd.get_dummies(X)\n",
    "\n",
    "# Keep the raw tags in `y` and store the integer codes under a new name: rebinding `y`\n",
    "# would make a re-run of this cell refit `le` on integers, and `le.inverse_transform`\n",
    "# would then return integers instead of the original tags.\n",
    "le = LabelEncoder()\n",
    "y_enc = le.fit_transform(y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hold-out split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "labeled_X = X_pr.iloc[:n_train]\n",
    "labeled_y = y_enc[:n_train]\n",
    "train_X, val_X, train_y, val_y = train_test_split(\n",
    "    labeled_X, labeled_y, test_size=VAL_FRACTION, random_state=SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "RandomForestClassifier(random_state=1)\n",
       "In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.\n",
       "On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.\n"
      ],
      "text/plain": [
       "RandomForestClassifier(random_state=1)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rf = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)\n",
    "rf.fit(train_X, train_y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Validate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_pred = rf.predict(val_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 score: 0.9099025974025974\n"
     ]
    }
   ],
   "source": [
    "# Micro-averaged F1 of the hold-out predictions against the hold-out labels.\n",
    "f1_micro = f1_score(val_y, val_pred, average='micro')\n",
    "print(\"F1 score:\", f1_micro)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Predict on the test set and export"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every row after the first n_train rows of the stacked frame came from `test`.\n",
    "test_X = X_pr.iloc[n_train:]\n",
    "predictions = rf.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assigning a NumPy array is positional, so the predictions line up with the sorted `test` rows.\n",
    "# Building a new frame leaves the `Tag` column loaded into `test` untouched.\n",
    "# NOTE(review): rows are written in that sorted order, not the order of test_lr.csv - confirm the\n",
    "# submission is matched on Word/Root/Affix rather than on row position.\n",
    "submission = test.assign(Tag=le.inverse_transform(predictions))\n",
    "submission[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}