| import random | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.metrics import f1_score | |
| from sklearn.preprocessing import LabelEncoder | |
| SEED = 1 | |
| random.seed(SEED) | |
| np.random.seed(SEED) | |
| train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix']) | |
| test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix']) | |
| df = pd.concat([train, test], ignore_index=True) | |
| X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']] | |
| y = df['Tag'] | |
| X_pr = pd.get_dummies(X) | |
| le = LabelEncoder() | |
| y = le.fit_transform(y) | |
| train_X = X_pr.iloc[:train.shape[0]] | |
| train_y = y[:train.shape[0]] | |
| train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED) | |
| rf = RandomForestClassifier(n_estimators=100, random_state=SEED) | |
| rf.fit(train_X, train_y) | |
| rf_predict_result = rf.predict(val_X) | |
| f1_micro = f1_score(val_y, rf_predict_result, average='micro') | |
| print("F1 score:", f1_micro) | |
| test_X = X_pr.iloc[train.shape[0]:] | |
| predictions = rf.predict(test_X) | |
| test['Tag'] = le.inverse_transform(predictions) | |
| test[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True) |