Zarinaaa's picture
Special for morphological analysis
7486641
# %%
import os
from collections import defaultdict

import joblib
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
# Read the data from the CSV file
def split_train_left_right(data):
    """Split training rows into a "right" and a "left" frame per word.

    Rows are ordered by ``Tag`` then ``Affix`` and duplicate ``(Word, Tag)``
    pairs are dropped. In that order, the first row seen for each ``Word``
    goes to the right frame; every later row of the same word goes to the
    left frame. Because ``(Word, Tag)`` pairs are unique after the dedupe,
    each later row necessarily carries a tag not yet seen for that word.

    Args:
        data: DataFrame with at least ``Word``, ``Tag`` and ``Affix`` columns.

    Returns:
        ``(right_df, left_df)``. Both keep the original index labels and all
        columns of ``data``, even when one of them has no rows.
    """
    ordered = data.sort_values(['Tag', 'Affix']).drop_duplicates(subset=['Word', 'Tag'])
    # True for every occurrence of a word after its first one in sorted order.
    is_repeat = ordered.duplicated(subset=['Word'], keep='first')
    # Boolean indexing (rather than rebuilding frames from row lists) keeps the
    # column layout intact when a side is empty, so callers can still index
    # columns such as "PoS_word" without a KeyError.
    right_df = ordered[~is_repeat]
    left_df = ordered[is_repeat]
    return right_df, left_df
# Load the training set and split it into the "right" side (first row per
# word) and the "left" side (later rows of an already-seen word).
filepath = "train_fixed.csv"
data = pd.read_csv(filepath)
right_df, left_df = split_train_left_right(data)
# %%
# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first model is written.
os.makedirs("artefacts", exist_ok=True)

for side, df in [('right', right_df), ('left', left_df)]:
    # Per-category frames (features + predictions) for the side being processed;
    # kept at module level for interactive inspection after the cell runs.
    category_res = {}
    if df.empty:
        # Nothing to train on for this side.
        continue
    # One independent model per part-of-speech category and side.
    for category in df["PoS_word"].unique():
        print(f"Category: {category}")
        # .copy() so the feature columns below are added to an independent
        # frame instead of a slice of `df` (avoids chained-assignment issues).
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)
        category_data['text_length'] = category_data['Affix'].apply(len)
        # word_length is stored on the frame but is not fed to the model.
        category_data['word_length'] = category_data['Word'].apply(len)
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        X = category_data["Affix"]
        y = category_data["Tag"]

        # Character n-gram TF-IDF over the affix, followed by the numeric
        # columns. The column order here defines the saved model's input
        # layout and must be reproduced at prediction time.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 5))
        X_train_tfidf = vectorizer.fit_transform(X)
        extra_features = category_data[['text_length', 'ү_count', 'ө_count']].to_numpy()
        X_train_combined = hstack([X_train_tfidf, extra_features])

        # Fixed seed so repeated runs produce the same forest.
        model = RandomForestClassifier(n_estimators=300, random_state=42)
        model.fit(X_train_combined, y)

        # NOTE: the model is scored on the same rows it was fitted on, so the
        # numbers printed below are training-set scores, not held-out ones.
        y_pred = model.predict(X_train_combined)
        category_data['pred'] = y_pred
        category_res[category] = category_data

        accuracy = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred, average="weighted")
        print("Accuracy:", accuracy)
        print("F1 Score:", f1)
        print(model)

        # Persist the fitted model and its vectorizer, keyed by category and side.
        model_filepath = f"artefacts/model_{category}_{side}.joblib"
        vectorizer_filepath = f"artefacts/vectorizer_{category}_{side}.joblib"
        joblib.dump(model, model_filepath)
        joblib.dump(vectorizer, vectorizer_filepath)
# %%
# Rebind `data` to the evaluation set whose tags will be predicted below.
filepath = "test_fixed.csv"
data = pd.read_csv(filepath_or_buffer=filepath)
def split_test_left_right(data):
    """Split test rows into a "right" and a "left" frame per word.

    Rows are ordered by ``Affix``. In that order, the first row seen for each
    ``Word`` goes to the right frame and every later row of the same word
    goes to the left frame. Unlike the training split, no ``Tag`` column is
    consulted and no rows are dropped.

    Args:
        data: DataFrame with at least ``Word`` and ``Affix`` columns.

    Returns:
        ``(right_df, left_df)``. Both keep the original index labels and all
        columns of ``data``, even when one of them has no rows.
    """
    ordered = data.sort_values(['Affix'])
    # True for every occurrence of a word after its first one in sorted order.
    is_repeat = ordered.duplicated(subset=['Word'], keep='first')
    # Boolean indexing keeps the column layout when a side is empty, so
    # callers can still index columns such as "PoS_word" without a KeyError.
    right_df = ordered[~is_repeat]
    left_df = ordered[is_repeat]
    return right_df, left_df
# Split the test rows the same way the per-side models are keyed: first row
# per word on the "right" side, later rows of the same word on the "left".
right_df, left_df = split_test_left_right(data)
# %%
result_dfs = []
for side, df in [('right', right_df), ('left', left_df)]:
    print(side)
    if df.empty:
        # No rows fell on this side; nothing to predict.
        continue
    for category in df["PoS_word"].unique():
        print(f"Category: {category}, side: {side}")
        # .copy() so the columns below are added to an independent frame
        # instead of a slice of `df` (avoids chained-assignment issues).
        category_data = df[df["PoS_word"] == category].copy()
        print(category_data.shape)
        category_data['text_length'] = category_data['Affix'].apply(len)
        # word_length is written to the output file but is not a model input.
        category_data['word_length'] = category_data['Word'].apply(len)
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        # Load the artefacts written by the training step for this category
        # and side. joblib.load unpickles its input, so only load files that
        # were produced locally by that step.
        vectorizer = joblib.load(f"artefacts/vectorizer_{category}_{side}.joblib")
        model = joblib.load(f"artefacts/model_{category}_{side}.joblib")

        # Rebuild the feature matrix with the same layout used for fitting:
        # affix TF-IDF first, then the numeric columns in the same order.
        affix_tfidf = vectorizer.transform(category_data["Affix"])
        extra_features = category_data[['text_length', 'ү_count', 'ө_count']].to_numpy()
        features = hstack([affix_tfidf, extra_features])

        # Store the predictions in the Tag column (created if it is absent).
        category_data['Tag'] = model.predict(features)
        result_dfs.append(category_data)
# %%
pd.concat(result_dfs).to_csv('file_pred_12.csv', index=False)
# %%