|
|
|
|
|
import pandas as pd |
|
|
from sklearn.model_selection import train_test_split, GridSearchCV |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.ensemble import RandomForestClassifier |
|
|
from sklearn.metrics import accuracy_score, f1_score |
|
|
import joblib |
|
|
from scipy.sparse import hstack |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from collections import defaultdict |
|
|
|
|
|
def split_train_left_right(data):
    """Split training rows into two disjoint frames by word/tag ambiguity.

    Rows are sorted by ('Tag', 'Affix') and de-duplicated on ('Word', 'Tag').
    The first tag seen for each word goes into the "right" frame; any later
    row whose word was already seen with a *different* tag goes into the
    "left" frame (i.e. the ambiguous word/tag pairs).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Word', 'Tag' and 'Affix' columns.

    Returns
    -------
    tuple of pd.DataFrame
        (right_df, left_df) as described above.
    """
    # NOTE: the original shadowed the builtin `sorted`; renamed for clarity.
    ordered = data.sort_values(['Tag', 'Affix'])
    ordered = ordered.drop_duplicates(subset=['Word', 'Tag'])

    # Tags already assigned to each word, in encounter order.
    seen_tags = defaultdict(list)

    left_rows = []
    right_rows = []

    for _, row in ordered.iterrows():
        word = row['Word']
        tag = row['Tag']

        # A word seen before with a different tag is ambiguous -> left split.
        if seen_tags[word] and tag not in seen_tags[word]:
            left_rows.append(row)
        else:
            right_rows.append(row)

        seen_tags[word].append(tag)

    return pd.DataFrame(right_rows), pd.DataFrame(left_rows)
|
|
|
|
|
# Load the training data and split it into the "right" (first tag per word)
# and "left" (ambiguous word/tag) subsets used for per-side model training.
filepath = "train_fixed.csv"
data = pd.read_csv(filepath)
right_df, left_df = split_train_left_right(data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Training pass: fit one (vectorizer, model) pair per (split side, PoS
# category) combination and persist the artefacts for the inference pass.
for (side, df) in [('right', right_df), ('left', left_df)]:

    categories = df["PoS_word"].unique()

    category_res = {}

    for category in categories:
        print(f"Category: {category}")

        # .copy() prevents pandas SettingWithCopyWarning when the engineered
        # feature columns are added to this per-category subset below.
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)

        # Simple surface features used alongside the char n-gram TF-IDF.
        category_data['text_length'] = category_data['Affix'].apply(len)
        # NOTE(review): 'word_length' is computed but never fed to the model
        # (see the hstack below) — confirm whether that is intentional.
        category_data['word_length'] = category_data['Word'].apply(len)
        # Counts of the Cyrillic vowels 'ү' and 'ө' in the word.
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        X = category_data["Affix"]
        y = category_data["Tag"]

        # Character n-grams (lengths 1..5) of the affix string.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 5))
        X_train_tfidf = vectorizer.fit_transform(X)

        X_train_combined = hstack([X_train_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])

        model = RandomForestClassifier(n_estimators=300)
        model.fit(X_train_combined, y)

        # NOTE(review): the metrics below are computed on the *training*
        # data, so they measure fit quality, not generalization.
        y_pred = model.predict(X_train_combined)

        category_data['pred'] = y_pred
        category_res[category] = category_data

        accuracy = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred, average="weighted")

        print("Accuracy:", accuracy)
        print("F1 Score:", f1)
        print(model)

        # Persist both the fitted model and its vectorizer, keyed by
        # category and side, for the inference pass below.
        model_filepath = f"artefacts/model_{category}_{side}.joblib"
        vectorizer_filepath = f"artefacts/vectorizer_{category}_{side}.joblib"
        joblib.dump(model, model_filepath)
        joblib.dump(vectorizer, vectorizer_filepath)
|
|
|
|
|
|
|
|
# Load the held-out test set; deliberately reuses the module-level
# `filepath`/`data` names from the training phase above.
filepath = "test_fixed.csv"
data = pd.read_csv(filepath)
|
|
|
|
|
|
|
|
def split_test_left_right(data):
    """Split test rows into first-occurrence and repeat-occurrence frames.

    Rows are sorted by 'Affix'; the first row seen for each word goes into
    the "right" frame, every later row for the same word goes into the
    "left" frame.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Word' and 'Affix' columns.

    Returns
    -------
    tuple of pd.DataFrame
        (right_df, left_df) as described above.
    """
    # NOTE: the original shadowed the builtin `sorted` and tracked seen words
    # in a defaultdict of lists (appending the word itself, not a tag) where
    # only "seen before" mattered — a plain set is equivalent and clearer.
    ordered = data.sort_values(['Affix'])

    seen_words = set()

    left_rows = []
    right_rows = []

    for _, row in ordered.iterrows():
        word = row['Word']

        if word in seen_words:
            left_rows.append(row)
        else:
            right_rows.append(row)

        seen_words.add(word)

    return pd.DataFrame(right_rows), pd.DataFrame(left_rows)
|
|
|
|
|
right_df, left_df = split_test_left_right(data) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Inference pass: for each (side, PoS category) load the matching artefacts,
# rebuild the same feature matrix as at training time, predict tags for the
# test rows, and write all predictions to a single CSV.
result_dfs = []
for (side, df) in [('right', right_df), ('left', left_df)]:

    print(side)
    categories = df["PoS_word"].unique()

    for category in categories:
        print(f"Category: {category}, side: {side}")

        # .copy() prevents pandas SettingWithCopyWarning when the engineered
        # feature columns are added to this per-category subset below.
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)

        # Must mirror the feature engineering done during training.
        category_data['text_length'] = category_data['Affix'].apply(len)
        category_data['word_length'] = category_data['Word'].apply(len)
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        X = category_data["Affix"]
        # NOTE: the original also read `y = category_data["Tag"]` here but
        # never used it; removed so a test CSV without a Tag column works.

        # Load the vectorizer/model trained for this category/side pair.
        vectorizer = joblib.load(f"artefacts/vectorizer_{category}_{side}.joblib")
        X_test_tfidf = vectorizer.transform(X)

        model = joblib.load(f"artefacts/model_{category}_{side}.joblib")

        X_test_combined = hstack([X_test_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])

        y_pred = model.predict(X_test_combined)

        category_data['Tag'] = y_pred
        result_dfs.append(category_data)

pd.concat(result_dfs).to_csv('file_pred_12.csv', index=False)
|
|
|
|
|
|