Special for morphological analysis
Browse files- .DS_Store +0 -0
- TAG.docx +0 -0
- bert_model_variant.py +420 -0
- dev.ipynb +182 -0
- dev.py +40 -0
- image_2023-05-13_16-58-05.png +0 -0
- logistic_regression.ipynb +0 -0
- test_fixed.csv +0 -0
- train.py +205 -0
- train_fixed.csv +0 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
TAG.docx
ADDED
|
Binary file (19.4 kB). View file
|
|
|
bert_model_variant.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import BertConfig, BertModel
|
| 2 |
+
import torch
|
| 3 |
+
import re
|
| 4 |
+
from torch.utils.data import DataLoader, Dataset
|
| 5 |
+
from sklearn.model_selection import train_test_split, cross_validate
|
| 6 |
+
import pytorch_lightning as pl
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
|
| 9 |
+
from torch.optim import AdamW
|
| 10 |
+
from sklearn.metrics import f1_score
|
| 11 |
+
|
| 12 |
+
# Maximum encoded sequence length (tokens per example).
MAX_LEN = 96
# Index of the "<pad>" token; must match get_char2idx's special_chars ordering.
PAD_ID = 0

# Small from-scratch BERT configuration for character-level morphological tagging.
# vocab_size=40 assumes the character vocabulary (chars + 3 specials) fits in 40
# ids — NOTE(review): verify against len(get_char2idx(...)) for the actual data.
config = BertConfig(
    vocab_size=40,
    hidden_size=64,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=256,
    max_position_embeddings=MAX_LEN,
    type_vocab_size=4
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class MyDataset(Dataset):
    """Torch Dataset wrapping the feature dicts produced by get_dataset3.

    In train mode each item is a (features, label_index) tuple; otherwise it
    is the bare features dict.
    """

    def __init__(self, df, char2idx, label2idx, is_train=True):
        super().__init__()
        # Debug output of the vocabularies used to encode this split.
        print(char2idx)
        print(label2idx)
        self.is_train = is_train
        # Materialize all encoded examples up front.
        self.dataset = get_dataset3(df, char2idx, label2idx, is_train=is_train)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

    def collate_fn(self, batch):
        """Stack a list of examples into batched tensors.

        In train mode each batch element is (features, label); otherwise it is
        the features dict itself — hence the `x[0] if self.is_train else x`.
        Returns a dict of tensors, or (dict, labels tensor) in train mode.
        """
        collated = {
            "input_ids": torch.IntTensor([(x[0] if self.is_train else x)["input_ids"] for x in batch]),
            "attention_mask": torch.Tensor([(x[0] if self.is_train else x)["attention_mask"] for x in batch]),
            "token_type_ids": torch.IntTensor([(x[0] if self.is_train else x)["token_type_ids"] for x in batch])
        }
        if self.is_train:
            collated = collated, torch.IntTensor([x[1] for x in batch])

        return collated
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def get_preprocessed_dfs(folder):
    """Load train/test CSVs and split each composite Tag into a list of tags.

    Collapses all CAUS_* variants (except CAUS_1) into CAUS_2, lowercases and
    strips every cell, and converts the Tag column from a concatenated string
    into a list of known category names. Returns {"train": df, "test": tdf}
    with lowercase column names; the test frame has no Tag column.
    """
    df = pd.read_csv(f"{folder}/train_data.csv").drop_duplicates()
    # Merge rare causative variants into a single CAUS_2 bucket.
    df.loc[:, "Tag"] = df.Tag.apply(lambda x: "CAUS_2" if x.startswith("CAUS_") and x != "CAUS_1" else x)

    # NOTE(review): 'NEG' appears twice in this list; the duplicate is harmless
    # for the greedy matching below (second pass finds nothing) but looks like
    # a typo for some other tag — verify against the full tag inventory.
    cats = ['FUT_INDF_3PLF', 'FUT_INDF_NEG', 'PST_INDF_PS', 'PCP_FUT_NEG', 'PCP_FUT_DEF', 'PRES_CONT', 'PRES_2SGF', 'POSS_2SGF', 'POSS_2PLF', 'NUM_APPR3', 'NUM_APPR2', 'NUM_APPR1', 'ADVV_CONT', 'ADJECTIVE', 'PST_ITER', 'PST_INDF', 'PST_EVID', 'PRES_PST', 'POSS_3SG', 'POSS_3PL', 'POSS_2SG', 'POSS_2PL', 'POSS_1SG', 'POSS_1PL', 'NUM_COLL', 'FUT_INDF', 'ADVV_SUC', 'ADVV_NEG', 'ADVV_INT', 'ADVV_ACC', 'PST_DEF', 'NUM_ORD', 'NUMERAL', 'IMP_SGF', 'IMP_PLF', 'FUT_DEF', 'PREC_1', 'PCP_PS', 'PCP_PR', 'JUS_SG', 'JUS_PL', 'IMP_SG', 'IMP_PL', 'HOR_SG', 'HOR_PL', 'DESIDE', 'CAUS_2', 'CAUS_1', 'INF_5', 'INF_4', 'INF_3', 'INF_2', 'INF_1', 'VERB', 'REFL', 'RECP', 'PRES', 'PREM', 'PERS', 'PASS', 'COND', 'COMP', '2SGF', '2PLF', 'SUC', 'OPT', 'NOM', 'NEG', 'NEG', 'LOC', 'INT', 'GEN', 'DAT', 'ACT', 'ACC', 'ABL', '3SG', '3PL', '2SG', '2PL', '1SG', '1PL', 'SG', 'PL']
    # Longest-first ordering so longer tags are matched before their substrings
    # (e.g. 'PST_INDF_PS' before 'PST_INDF' before 'PST').
    cats = sorted([x.lower() for x in cats], key=lambda x: (len(x), x), reverse=True)

    # Normalize every cell: strip whitespace, lowercase.
    for col in df.columns:
        df.loc[:, col] = df[col].apply(lambda x: x.strip().lower())

    def tag2list(t):
        # Greedily peel known categories out of the composite tag string.
        res = []
        for c in cats:
            if c in t:
                res.append(c)
                t = t.replace(c, "")
        return res

    df.loc[:, "Tag"] = df.Tag.apply(tag2list)

    tdf = pd.read_csv(f"{folder}/test_data.csv")
    # Test labels (if present) are discarded — predictions fill them later.
    tdf.pop("Tag")
    for col in tdf.columns:
        tdf.loc[:, col] = tdf[col].apply(lambda x: x.strip().lower())

    return {"train": df.rename(columns={x: x.lower() for x in df.columns}), "test": tdf.rename(columns={x: x.lower() for x in tdf.columns})}
|
| 80 |
+
|
| 81 |
+
def get_preprocessed_dfs2(folder):
    """Load train/test CSVs keeping Tag as a single (string) label.

    Like get_preprocessed_dfs but without splitting the Tag column into a
    list — used for the single-label (cross-entropy) training path. Returns
    {"train": df, "test": tdf} with lowercase column names; the test frame
    has no Tag column.
    """
    df = pd.read_csv(f"{folder}/train_data.csv").drop_duplicates()
    # Merge rare causative variants into a single CAUS_2 bucket.
    df.loc[:, "Tag"] = df.Tag.apply(lambda x: "CAUS_2" if x.startswith("CAUS_") and x != "CAUS_1" else x)

    # Normalize every cell: strip whitespace, lowercase.
    for col in df.columns:
        df.loc[:, col] = df[col].apply(lambda x: x.strip().lower())

    tdf = pd.read_csv(f"{folder}/test_data.csv")
    # Test labels (if present) are discarded — predictions fill them later.
    tdf.pop("Tag")
    for col in tdf.columns:
        tdf.loc[:, col] = tdf[col].apply(lambda x: x.strip().lower())

    return {"train": df.rename(columns={x: x.lower() for x in df.columns}), "test": tdf.rename(columns={x: x.lower() for x in tdf.columns})}
|
| 94 |
+
|
| 95 |
+
def get_splits(df, test_size=0.2):
    """Split `df` into train/validation frames, grouped by root.

    The split is performed over the set of unique roots (seeded for
    reproducibility), so no root ever appears in both frames — this avoids
    leaking morphology of a root from train into validation.
    """
    roots = df.root.unique()
    print("unique roots", len(roots))

    train_roots, val_roots = train_test_split(roots, test_size=test_size, random_state=2023)
    print("unique train roots", len(train_roots))
    print("unique validation roots", len(val_roots))

    return df[df.root.isin(train_roots)], df[df.root.isin(val_roots)]
|
| 105 |
+
|
| 106 |
+
def get_char2idx(all_splits, special_chars=("<pad>", "<s>", "</s>")):
    """Build a character-to-index vocabulary over all splits.

    Collects every character appearing in `root + affix` across each
    DataFrame in `all_splits`; special tokens come first (in the given
    order), followed by the characters in sorted order.
    """
    chars = set()
    for frame in all_splits.values():
        for joined in frame.apply(lambda row: row.root + row.affix, axis=1):
            chars.update(joined)
    vocab = list(special_chars) + sorted(chars)
    return {ch: idx for idx, ch in enumerate(vocab)}
|
| 111 |
+
|
| 112 |
+
def get_dataset(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
    """Encode each row of `split` as fixed-length multi-label BERT features.

    Sequence layout: <s>, word-POS id, root-POS id (segment 0), word chars
    (segment 1), root chars (segment 2), affix chars + </s> (segment 3),
    then <pad> (attention 0, segment 3) up to `max_len`.

    Returns a list of feature dicts; when `is_train` is True each entry is
    (features, multi_hot_labels) built from the row's list of tags.
    """
    pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}

    result = []

    for r in split.itertuples():
        input_ids = [char2idx["<s>"], pos2idx[r.pos_word], pos2idx[r.pos_root]]
        attention_mask = [1, 1, 1]
        token_type_ids = [0, 0, 0]

        for c in r.word:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(1)

        for c in r.root:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(2)

        for c in r.affix:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(3)

        input_ids.append(char2idx["</s>"])
        attention_mask.append(1)
        token_type_ids.append(3)

        # BUGFIX: truncate/pad to the max_len PARAMETER. The original used the
        # global MAX_LEN here, silently ignoring the argument.
        input_ids = input_ids[:max_len]
        attention_mask = attention_mask[:max_len]
        token_type_ids = token_type_ids[:max_len]

        pad_count = max_len - len(input_ids)
        input_ids.extend([char2idx["<pad>"]] * pad_count)
        attention_mask.extend([0] * pad_count)
        token_type_ids.extend([3] * pad_count)

        result.append(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        if is_train:
            # Multi-hot label vector: r.tag is a LIST of tag strings here
            # (produced by get_preprocessed_dfs).
            labels = [0 for _ in range(len(label2idx))]
            for tag in r.tag:
                labels[label2idx[tag]] = 1
            result[-1] = (result[-1], labels)

    return result
|
| 168 |
+
|
| 169 |
+
def get_dataset3(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
    """Encode each row of `split` as fixed-length single-label BERT features.

    Sequence layout: <s>, root-POS id (segment 0), root chars (segment 1),
    affix chars + </s> (segment 2), then <pad> (attention 0, segment 2) up
    to `max_len`. Note: unlike get_dataset/get_dataset2 this variant skips
    the surface word entirely.

    Returns a list of feature dicts; when `is_train` is True each entry is
    (features, label_index) where r.tag is a single tag string.
    """
    pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}

    result = []

    for xs, r in enumerate(split.itertuples()):
        input_ids = [char2idx["<s>"], pos2idx[r.pos_root]]
        attention_mask = [1, 1]
        token_type_ids = [0, 0]

        for c in r.root:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(1)

        for c in r.affix:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(2)

        input_ids.append(char2idx["</s>"])
        attention_mask.append(1)
        token_type_ids.append(2)

        # BUGFIX: truncate/pad to the max_len PARAMETER. The original used the
        # global MAX_LEN here, silently ignoring the argument.
        input_ids = input_ids[:max_len]
        attention_mask = attention_mask[:max_len]
        token_type_ids = token_type_ids[:max_len]

        pad_count = max_len - len(input_ids)
        input_ids.extend([char2idx["<pad>"]] * pad_count)
        attention_mask.extend([0] * pad_count)
        token_type_ids.extend([2] * pad_count)

        result.append(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        if is_train:
            result[-1] = (result[-1], label2idx[r.tag])

        # BUGFIX: the original wrote `if xs + 1 % 1000 == 0`, which parses as
        # `xs + (1 % 1000) == 0` and is never true for xs >= 0, so the debug
        # dump never fired. Print every 1000th example as intended.
        if (xs + 1) % 1000 == 0:
            print(input_ids)
            print(attention_mask)
            print(token_type_ids)

    return result
|
| 221 |
+
|
| 222 |
+
def get_dataset2(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
    """Encode each row of `split` as fixed-length single-label BERT features.

    Sequence layout: <s>, word-POS id, root-POS id (segment 0), word chars
    (segment 1), root chars (segment 2), affix chars + </s> (segment 3),
    then <pad> (attention 0, segment 3) up to `max_len`.

    Same layout as get_dataset, but the label is a single index (r.tag is
    one tag string, not a list). Returns feature dicts, or
    (features, label_index) tuples when `is_train` is True.
    """
    pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}

    result = []

    for xs, r in enumerate(split.itertuples()):
        input_ids = [char2idx["<s>"], pos2idx[r.pos_word], pos2idx[r.pos_root]]
        attention_mask = [1, 1, 1]
        token_type_ids = [0, 0, 0]

        for c in r.word:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(1)

        for c in r.root:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(2)

        for c in r.affix:
            input_ids.append(char2idx[c])
            attention_mask.append(1)
            token_type_ids.append(3)

        input_ids.append(char2idx["</s>"])
        attention_mask.append(1)
        token_type_ids.append(3)

        # BUGFIX: truncate/pad to the max_len PARAMETER. The original used the
        # global MAX_LEN here, silently ignoring the argument.
        input_ids = input_ids[:max_len]
        attention_mask = attention_mask[:max_len]
        token_type_ids = token_type_ids[:max_len]

        pad_count = max_len - len(input_ids)
        input_ids.extend([char2idx["<pad>"]] * pad_count)
        attention_mask.extend([0] * pad_count)
        token_type_ids.extend([3] * pad_count)

        result.append(
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
            }
        )

        if is_train:
            result[-1] = (result[-1], label2idx[r.tag])

        # BUGFIX: the original wrote `if xs + 1 % 10000 == 0`, which parses as
        # `xs + (1 % 10000) == 0` and is never true for xs >= 0. Print every
        # 10000th example as intended.
        if (xs + 1) % 10000 == 0:
            print(input_ids)
            print(attention_mask)
            print(token_type_ids)

    return result
|
| 281 |
+
|
| 282 |
+
def train_model(epochs=100, batch_size=400, data_folder="../Downloads/"):
    """Train MyModel2 on the single-label task and restore the best checkpoint.

    Loads and preprocesses the data, splits train/validation by root,
    fits with early stopping on micro-F1, then loads the weights of the
    best-scoring checkpoint back into the model.

    Returns (model, train_df, validation_df, test_df).
    """
    dfs = get_preprocessed_dfs2(data_folder)
    train, val = get_splits(dfs["train"])
    char2idx = get_char2idx(dfs)
    # Single-label mapping over the tags seen in train (insertion order).
    label2idx = {l: i for i, l in enumerate(dfs["train"].tag.unique())}

    model = MyModel2(config, label2idx, char2idx, 0.5)
    checkpoint_callback = ModelCheckpoint(
        dirpath="fmicro_weights",
        save_top_k=3,
        monitor="fmicro",
        mode="max",
        filename="{epoch}-{step}",
    )
    trainer = pl.Trainer(
        deterministic=True,
        max_epochs=epochs,
        callbacks=[EarlyStopping(monitor="fmicro", mode="max"), checkpoint_callback],
        log_every_n_steps=30,
    )

    train_dataset = MyDataset(train, char2idx, label2idx)
    validation_dataset = MyDataset(val, char2idx, label2idx)
    # BUGFIX: honor the batch_size parameter — the original hard-coded 400 in
    # both DataLoaders, silently ignoring the argument.
    trainer.fit(
        model,
        DataLoader(train_dataset, batch_size=batch_size, collate_fn=train_dataset.collate_fn),
        DataLoader(validation_dataset, batch_size=batch_size, collate_fn=validation_dataset.collate_fn),
    )

    # We already hold the checkpoint callback, so there is no need to scan
    # trainer.callbacks for it as the original did.
    model.load_state_dict(torch.load(checkpoint_callback.best_model_path)["state_dict"])

    return model, train, val, dfs["test"]
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
class MyModel(pl.LightningModule):
    """Multi-label morphological tagger: BERT encoder + linear head, BCE loss.

    Labels are multi-hot vectors (one bit per tag), so training/validation use
    BCEWithLogitsLoss over the projected pooled output.
    """

    def __init__(self, config, label2idx, threshold, *args, **kwargs):
        # BUGFIX: the original assigned `self.char2idx = char2idx`, but
        # `char2idx` was never a parameter of this __init__, so constructing
        # MyModel raised NameError. Accept it as an optional keyword argument
        # instead, which stays backward compatible with existing callers.
        char2idx = kwargs.pop("char2idx", None)
        super().__init__(*args, **kwargs)
        self.threshold = threshold  # decision threshold; not applied inside this class
        self.char2idx = char2idx
        self.label2idx = label2idx
        self.idx2label = {i: l for l, i in label2idx.items()}
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.3)
        self.proj = torch.nn.Linear(config.hidden_size, len(label2idx))

    def common_step(self, batch):
        """Run BERT on the features and project the pooled output to logits."""
        X, _ = batch
        hidden = self.bert(**X)[1]  # pooled representation (second output)
        return self.proj(self.dropout(hidden))

    def training_step(self, batch, batch_idx):
        logits = self.common_step(batch)
        # batch[1] is the multi-hot label tensor; BCE expects floats.
        loss = torch.nn.BCEWithLogitsLoss()(logits, batch[1].float())
        self.log("train_loss", loss.mean(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.common_step(batch)
        loss = torch.nn.BCEWithLogitsLoss()(logits, batch[1].float())
        self.log("loss", loss.mean(), prog_bar=True, on_epoch=True)
        return logits, loss

    def test_step(self, batch, batch_idx):
        # Test batches carry no labels; wrap to match common_step's signature.
        return self.common_step((batch, []))

    def forward(self, batch, batch_idx):
        return self.common_step((batch, []))

    def configure_optimizers(self):
        return AdamW(params=self.parameters())
|
| 359 |
+
|
| 360 |
+
class MyModel2(pl.LightningModule):
    """Single-label morphological tagger: BERT encoder + linear head, CE loss.

    Unlike MyModel this variant treats the composite tag as one class
    (cross-entropy over len(label2idx) classes) and tracks micro-F1 on the
    validation set via on_validation_start/on_validation_end hooks.
    """

    def __init__(self, config, label2idx, char2idx, threshold, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.threshold = threshold  # decision threshold; not applied inside this class
        self.char2idx = char2idx
        # Last computed validation micro-F1 (set in on_validation_end).
        self.fscore = 0.0
        self.label2idx = label2idx
        self.idx2label = {i: l for l, i in label2idx.items()}
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.3)
        self.proj = torch.nn.Linear(config.hidden_size, len(label2idx))

    def common_step(self, batch):
        """Run BERT on the features and project the pooled output to logits."""
        X, _ = batch
        hidden = self.bert(**X)[1]  # pooled representation (second output)
        return self.proj(self.dropout(hidden))

    def training_step(self, batch, batch_idx):
        logits = self.common_step(batch)
        loss = torch.nn.CrossEntropyLoss()(logits.view(-1, len(self.label2idx)), batch[1].view(-1).long())
        self.log("train_loss", loss.mean(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.common_step(batch)
        loss = torch.nn.CrossEntropyLoss()(logits.view(-1, len(self.label2idx)), batch[1].view(-1).long())
        # Accumulate predicted/true tag names for the epoch-level F1.
        for p in logits:
            self.predos.append(self.idx2label[p.argmax().cpu().item()])
        for t in batch[1]:
            self.trues.append(self.idx2label[t.cpu().item()])
        self.log("loss", loss.mean(), prog_bar=True, on_epoch=True)
        # NOTE(review): self.fscore is only recomputed in on_validation_end,
        # so the "fmicro" value logged here is the score from the PREVIOUS
        # validation epoch (0.0 on the first). The checkpoint/early-stop
        # monitors therefore lag one epoch behind — verify this is intended.
        self.log("fmicro", self.fscore, prog_bar=True, on_epoch=True)
        return logits, loss

    def on_validation_start(self):
        # Reset the per-epoch accumulators used by validation_step.
        self.predos = []
        self.trues = []

    def on_validation_end(self):
        # Micro-averaged F1 over the whole validation epoch.
        self.fscore = f1_score(self.trues, self.predos, average="micro")

    def test_step(self, batch, batch_idx):
        # Test batches carry no labels; wrap to match common_step's signature.
        return self.common_step((batch, []))

    def forward(self, batch, batch_idx):
        return self.common_step((batch, []))

    def configure_optimizers(self):
        return AdamW(params=self.parameters())

    def predict(self, dataloader):
        # Placeholder — batch prediction was never implemented.
        pass
|
| 420 |
+
|
dev.ipynb
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": []
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"cell_type": "code",
|
| 12 |
+
"execution_count": 35,
|
| 13 |
+
"metadata": {},
|
| 14 |
+
"outputs": [],
|
| 15 |
+
"source": [
|
| 16 |
+
"import random\n",
|
| 17 |
+
"import numpy as np\n",
|
| 18 |
+
"import pandas as pd\n",
|
| 19 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 20 |
+
"from sklearn.ensemble import RandomForestClassifier\n",
|
| 21 |
+
"from sklearn.metrics import f1_score\n",
|
| 22 |
+
"from sklearn.preprocessing import LabelEncoder"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": 36,
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"outputs": [],
|
| 30 |
+
"source": [
|
| 31 |
+
"SEED = 1\n",
|
| 32 |
+
"random.seed(SEED)\n",
|
| 33 |
+
"np.random.seed(SEED)"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": 37,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [],
|
| 41 |
+
"source": [
|
| 42 |
+
"train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
|
| 43 |
+
"test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
|
| 44 |
+
"df = pd.concat([train, test], ignore_index=True)"
|
| 45 |
+
]
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"cell_type": "code",
|
| 49 |
+
"execution_count": 38,
|
| 50 |
+
"metadata": {},
|
| 51 |
+
"outputs": [],
|
| 52 |
+
"source": [
|
| 53 |
+
"X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']]\n",
|
| 54 |
+
"y = df['Tag']"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": 39,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [],
|
| 62 |
+
"source": [
|
| 63 |
+
"X_pr = pd.get_dummies(X)\n",
|
| 64 |
+
"le = LabelEncoder()\n",
|
| 65 |
+
"y = le.fit_transform(y)"
|
| 66 |
+
]
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"cell_type": "code",
|
| 70 |
+
"execution_count": 40,
|
| 71 |
+
"metadata": {},
|
| 72 |
+
"outputs": [],
|
| 73 |
+
"source": [
|
| 74 |
+
"train_X = X_pr.iloc[:train.shape[0]]\n",
|
| 75 |
+
"train_y = y[:train.shape[0]]\n",
|
| 76 |
+
"train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED)"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "code",
|
| 81 |
+
"execution_count": 41,
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"outputs": [
|
| 84 |
+
{
|
| 85 |
+
"data": {
|
| 86 |
+
"text/html": [
|
| 87 |
+
"<style>#sk-container-id-4 {color: black;background-color: white;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=1)</pre></div></div></div></div></div>"
|
| 88 |
+
],
|
| 89 |
+
"text/plain": [
|
| 90 |
+
"RandomForestClassifier(random_state=1)"
|
| 91 |
+
]
|
| 92 |
+
},
|
| 93 |
+
"execution_count": 41,
|
| 94 |
+
"metadata": {},
|
| 95 |
+
"output_type": "execute_result"
|
| 96 |
+
}
|
| 97 |
+
],
|
| 98 |
+
"source": [
|
| 99 |
+
"rf = RandomForestClassifier(n_estimators=100, random_state=SEED)\n",
|
| 100 |
+
"rf.fit(train_X, train_y)"
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"cell_type": "code",
|
| 105 |
+
"execution_count": 42,
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"outputs": [],
|
| 108 |
+
"source": [
|
| 109 |
+
"rf_predict_result = rf.predict(val_X)"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": 43,
|
| 115 |
+
"metadata": {},
|
| 116 |
+
"outputs": [
|
| 117 |
+
{
|
| 118 |
+
"name": "stdout",
|
| 119 |
+
"output_type": "stream",
|
| 120 |
+
"text": [
|
| 121 |
+
"F1 score: 0.9099025974025974\n"
|
| 122 |
+
]
|
| 123 |
+
}
|
| 124 |
+
],
|
| 125 |
+
"source": [
|
| 126 |
+
"f1_micro = f1_score(val_y, rf_predict_result, average='micro')\n",
|
| 127 |
+
"print(\"F1 score:\", f1_micro)"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": 44,
|
| 133 |
+
"metadata": {},
|
| 134 |
+
"outputs": [],
|
| 135 |
+
"source": [
|
| 136 |
+
"test_X = X_pr.iloc[train.shape[0]:]\n",
|
| 137 |
+
"predictions = rf.predict(test_X)"
|
| 138 |
+
]
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"cell_type": "code",
|
| 142 |
+
"execution_count": 45,
|
| 143 |
+
"metadata": {},
|
| 144 |
+
"outputs": [],
|
| 145 |
+
"source": [
|
| 146 |
+
"test['Tag'] = le.inverse_transform(predictions)\n",
|
| 147 |
+
"test[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True)"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "code",
|
| 152 |
+
"execution_count": null,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [],
|
| 155 |
+
"source": [
|
| 156 |
+
" "
|
| 157 |
+
]
|
| 158 |
+
}
|
| 159 |
+
],
|
| 160 |
+
"metadata": {
|
| 161 |
+
"kernelspec": {
|
| 162 |
+
"display_name": "myenv",
|
| 163 |
+
"language": "python",
|
| 164 |
+
"name": "python3"
|
| 165 |
+
},
|
| 166 |
+
"language_info": {
|
| 167 |
+
"codemirror_mode": {
|
| 168 |
+
"name": "ipython",
|
| 169 |
+
"version": 3
|
| 170 |
+
},
|
| 171 |
+
"file_extension": ".py",
|
| 172 |
+
"mimetype": "text/x-python",
|
| 173 |
+
"name": "python",
|
| 174 |
+
"nbconvert_exporter": "python",
|
| 175 |
+
"pygments_lexer": "ipython3",
|
| 176 |
+
"version": "3.11.0"
|
| 177 |
+
},
|
| 178 |
+
"orig_nbformat": 4
|
| 179 |
+
},
|
| 180 |
+
"nbformat": 4,
|
| 181 |
+
"nbformat_minor": 2
|
| 182 |
+
}
|
dev.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Baseline: random-forest tag classifier over one-hot encoded morphology columns.
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# Seed both RNGs for reproducibility.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

# Sorting train/test the same way keeps the concatenation (and therefore the
# dummy-column layout) deterministic across runs.
train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])
test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])
# Concatenate so get_dummies produces a single consistent column set for both.
df = pd.concat([train, test], ignore_index=True)

X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']]
y = df['Tag']

# One-hot encode every (categorical) feature column; integer-encode the target.
X_pr = pd.get_dummies(X)
le = LabelEncoder()
y = le.fit_transform(y)

# First train.shape[0] rows of the concatenation are the train portion.
train_X = X_pr.iloc[:train.shape[0]]
train_y = y[:train.shape[0]]
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED)

rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(train_X, train_y)

rf_predict_result = rf.predict(val_X)

# Held-out micro-F1 as a sanity check before predicting on test.
f1_micro = f1_score(val_y, rf_predict_result, average='micro')
print("F1 score:", f1_micro)

# Remaining rows of the concatenation are the test portion.
test_X = X_pr.iloc[train.shape[0]:]
predictions = rf.predict(test_X)

# Map integer predictions back to tag strings and write the submission file.
test['Tag'] = le.inverse_transform(predictions)
test[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True)
|
image_2023-05-13_16-58-05.png
ADDED
|
logistic_regression.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
test_fixed.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# %%
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 6 |
+
from sklearn.metrics import accuracy_score, f1_score
|
| 7 |
+
import joblib
|
| 8 |
+
from scipy.sparse import hstack
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Read the data from the CSV file
|
| 12 |
+
|
| 13 |
+
from collections import defaultdict
|
| 14 |
+
|
| 15 |
+
def split_train_left_right(data):
    """Split training rows into two disjoint frames by word ambiguity.

    Rows are ordered by ('Tag', 'Affix') and de-duplicated on ('Word', 'Tag').
    The first tag encountered for each word is routed to the *right* frame;
    every further distinct tag for the same word is routed to the *left* frame.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Word', 'Tag' and 'Affix' columns.

    Returns
    -------
    (right_df, left_df) : tuple[pd.DataFrame, pd.DataFrame]
        right_df holds one row per word (its first tag); left_df holds the
        remaining readings of ambiguous words.
    """
    # Fix: the original bound the builtin name `sorted`; use a local name.
    ordered = data.sort_values(['Tag', 'Affix']).drop_duplicates(subset=['Word', 'Tag'])

    seen_tags = defaultdict(list)  # word -> tags already routed
    left = []
    right = []

    for _, row in ordered.iterrows():
        word = row['Word']
        tag = row['Tag']

        # First tag for a word -> right; any additional distinct tag -> left.
        if seen_tags[word] and (tag not in seen_tags[word]):
            left.append(row)
        else:
            right.append(row)

        seen_tags[word].append(tag)

    return pd.DataFrame(right), pd.DataFrame(left)
| 41 |
+
|
| 42 |
+
# --- Training phase: one RandomForest per (PoS category, right/left side) ---
filepath = "train_fixed.csv"
data = pd.read_csv(filepath)

# First reading of each word -> right_df; additional readings -> left_df.
right_df, left_df = split_train_left_right(data)
# right_df = pd.read_csv('right.csv')
# left_df = pd.read_csv('left.csv')


# %%
for (side, df) in [('right', right_df), ('left', left_df)]:
    # Get unique categories from "PoS_word" column
    categories = df["PoS_word"].unique()

    category_res = {}

    for category in categories:
        print(f"Category: {category}")

        # Filter data for the current category.
        # NOTE(review): `category_data` is a slice of `df`; the column
        # assignments below may raise pandas' SettingWithCopyWarning —
        # consider `.copy()` (left unchanged here).
        category_data = df[df["PoS_word"] == category]
        print(category_data.shape)

        # Hand-crafted features: affix length, word length, and counts of the
        # vowels 'ү' and 'ө' in the word.
        category_data['text_length'] = category_data['Affix'].apply(lambda x: len(x))
        # NOTE(review): 'word_length' is computed but NOT included in the
        # feature matrix assembled below.
        category_data['word_length'] = category_data['Word'].apply(lambda x: len(x))
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))

        # Splitting data into train and test
        X = category_data["Affix"]
        y = category_data["Tag"]
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Feature extraction: character n-grams (1..5) of the affix string.
        vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 5))
        X_train_tfidf = vectorizer.fit_transform(X)
        # print(len(vectorizer.vocabulary_))

        # Sparse TF-IDF block + dense numeric feature columns, side by side.
        X_train_combined = hstack([X_train_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])
        # X_test_combined = hstack([X_test_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])

        # X_test_vec = vectorizer.transform(X)

        model = RandomForestClassifier(n_estimators=300)
        model.fit(X_train_combined, y)

        # Save the best model for the category
        # category_models[category] = (model, vectorizer)

        # NOTE(review): this predicts on the SAME matrix the model was fit on,
        # so the accuracy/F1 printed below measure training fit, not
        # generalization.
        y_pred = model.predict(X_train_combined)

        # res_df = pd.DataFrame()
        # res_df['pred'] = y_pred
        # res_df['orig'] = y
        category_data['pred'] = y_pred
        category_res[category] = category_data


        # Calculate accuracy and F1 score (on the training data, see above).
        accuracy = accuracy_score(y, y_pred)
        f1 = f1_score(y, y_pred, average="weighted")



        print("Accuracy:", accuracy)
        print("F1 Score:", f1)
        print(model)

        # Persist the fitted model and its vectorizer for the inference phase.
        # for category, (model, vectorizer) in category_models.items():
        model_filepath = f"artefacts/model_{category}_{side}.joblib"
        vectorizer_filepath = f"artefacts/vectorizer_{category}_{side}.joblib"
        joblib.dump(model, model_filepath)
        joblib.dump(vectorizer, vectorizer_filepath)

| 117 |
+
# %%
# --- Inference phase: load the held-out test set; the per-category models
# saved under artefacts/ are applied to it below. ---
filepath = "test_fixed.csv"
data = pd.read_csv(filepath)
|
| 121 |
+
|
| 122 |
+
def split_test_left_right(data):
    """Split test rows into two frames by word occurrence order.

    Rows are ordered by 'Affix'; the first row seen for each word goes to the
    *right* frame, every later row for the same word goes to the *left* frame.
    (Mirrors ``split_train_left_right`` but without tag information.)

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Word' and 'Affix' columns.

    Returns
    -------
    (right_df, left_df) : tuple[pd.DataFrame, pd.DataFrame]

    Fixes over the original: no longer shadows the builtin ``sorted``, and
    uses a plain set of seen words instead of a defaultdict whose lists were
    (confusingly) filled with the word itself — only membership was ever used,
    so behavior is unchanged.
    """
    ordered = data.sort_values(['Affix'])

    seen_words = set()  # words already routed to `right`
    left = []
    right = []

    for _, row in ordered.iterrows():
        word = row['Word']

        if word in seen_words:
            left.append(row)
        else:
            right.append(row)
        seen_words.add(word)

    return pd.DataFrame(right), pd.DataFrame(left)
| 146 |
+
|
| 147 |
+
# First occurrence of each word -> right_df; repeats -> left_df.
right_df, left_df = split_test_left_right(data)
# right_df = pd.read_csv('right.csv')
# left_df = pd.read_csv('left.csv')
# left_df[left_df['Word'] == 'божомолдчу']

# %%
# Apply the per-(category, side) model saved under artefacts/ to each slice
# of the test data, then concatenate all predictions into one CSV.
result_dfs = []
for (side, df) in [('right', right_df), ('left', left_df)]:
    # Get unique categories from "PoS_word" column
    print(side)
    categories = df["PoS_word"].unique()

    # category_models = {}

    for category in categories:
        print(f"Category: {category}, side: {side}")

        # Filter data for the current category.
        # NOTE(review): slice of `df`; the assignments below may trigger
        # pandas' SettingWithCopyWarning — consider `.copy()`.
        category_data = df[df["PoS_word"] == category]
        print(category_data.shape)


        # Same hand-crafted features as at training time; 'word_length' is
        # computed but not part of the feature matrix below.
        category_data['text_length'] = category_data['Affix'].apply(lambda x: len(x))
        category_data['word_length'] = category_data['Word'].apply(lambda x: len(x))
        category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
        category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))


        # Splitting data into train and test
        X = category_data["Affix"]
        y = category_data["Tag"]  # read but unused during inference
        # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



        # Feature extraction with the vectorizer fitted at training time.
        # NOTE(review): a (category, side) pair present in test but absent at
        # training time would make this load fail — confirm the category sets
        # match between the two CSVs.
        vectorizer = joblib.load(f"artefacts/vectorizer_{category}_{side}.joblib")
        X_train_tfidf = vectorizer.transform(X)


        # X_test_vec = vectorizer.transform(X)

        model = joblib.load(f"artefacts/model_{category}_{side}.joblib")

        # Save the best model for the category
        # category_models[category] = (model, vectorizer)

        # Sparse TF-IDF block + dense numeric feature columns, as in training.
        X_train_combined = hstack([X_train_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])
        # X
        # Predict on the test data using the best model
        y_pred = model.predict(X_train_combined)

        # Overwrite 'Tag' with the model's prediction for this slice.
        category_data['Tag'] = y_pred
        result_dfs.append(category_data)
# %%

pd.concat(result_dfs).to_csv('file_pred_12.csv', index=False)

# %%
train_fixed.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|