Zarinaaa committed on
Commit
7486641
·
1 Parent(s): 410f530

Special for morphological analysis

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
TAG.docx ADDED
Binary file (19.4 kB). View file
 
bert_model_variant.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertConfig, BertModel
2
+ import torch
3
+ import re
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from sklearn.model_selection import train_test_split, cross_validate
6
+ import pytorch_lightning as pl
7
+ import pandas as pd
8
+ from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
9
+ from torch.optim import AdamW
10
+ from sklearn.metrics import f1_score
11
+
12
+ MAX_LEN = 96
13
+ PAD_ID = 0
14
+
15
+ config = BertConfig(
16
+ vocab_size=40,
17
+ hidden_size=64,
18
+ num_hidden_layers=4,
19
+ num_attention_heads=4,
20
+ intermediate_size=256,
21
+ max_position_embeddings=MAX_LEN,
22
+ type_vocab_size=4
23
+ )
24
+
25
+
26
+
27
+ class MyDataset(Dataset):
28
+ def __init__(self, df, char2idx, label2idx, is_train=True):
29
+ super().__init__()
30
+ print(char2idx)
31
+ print(label2idx)
32
+ self.is_train = is_train
33
+ self.dataset = get_dataset3(df, char2idx, label2idx, is_train=is_train)
34
+
35
+ def __len__(self):
36
+ return len(self.dataset)
37
+
38
+ def __getitem__(self, idx):
39
+ return self.dataset[idx]
40
+
41
+
42
+ def collate_fn(self, batch):
43
+ collated = {
44
+ "input_ids": torch.IntTensor([(x[0] if self.is_train else x)["input_ids"] for x in batch]),
45
+ "attention_mask": torch.Tensor([(x[0] if self.is_train else x)["attention_mask"] for x in batch]),
46
+ "token_type_ids": torch.IntTensor([(x[0] if self.is_train else x)["token_type_ids"] for x in batch])
47
+ }
48
+ if self.is_train:
49
+ collated = collated, torch.IntTensor([x[1] for x in batch])
50
+
51
+ return collated
52
+
53
+
54
+ def get_preprocessed_dfs(folder):
55
+ df = pd.read_csv(f"{folder}/train_data.csv").drop_duplicates()
56
+ df.loc[:, "Tag"] = df.Tag.apply(lambda x: "CAUS_2" if x.startswith("CAUS_") and x != "CAUS_1" else x)
57
+
58
+ cats = ['FUT_INDF_3PLF', 'FUT_INDF_NEG', 'PST_INDF_PS', 'PCP_FUT_NEG', 'PCP_FUT_DEF', 'PRES_CONT', 'PRES_2SGF', 'POSS_2SGF', 'POSS_2PLF', 'NUM_APPR3', 'NUM_APPR2', 'NUM_APPR1', 'ADVV_CONT', 'ADJECTIVE', 'PST_ITER', 'PST_INDF', 'PST_EVID', 'PRES_PST', 'POSS_3SG', 'POSS_3PL', 'POSS_2SG', 'POSS_2PL', 'POSS_1SG', 'POSS_1PL', 'NUM_COLL', 'FUT_INDF', 'ADVV_SUC', 'ADVV_NEG', 'ADVV_INT', 'ADVV_ACC', 'PST_DEF', 'NUM_ORD', 'NUMERAL', 'IMP_SGF', 'IMP_PLF', 'FUT_DEF', 'PREC_1', 'PCP_PS', 'PCP_PR', 'JUS_SG', 'JUS_PL', 'IMP_SG', 'IMP_PL', 'HOR_SG', 'HOR_PL', 'DESIDE', 'CAUS_2', 'CAUS_1', 'INF_5', 'INF_4', 'INF_3', 'INF_2', 'INF_1', 'VERB', 'REFL', 'RECP', 'PRES', 'PREM', 'PERS', 'PASS', 'COND', 'COMP', '2SGF', '2PLF', 'SUC', 'OPT', 'NOM', 'NEG', 'NEG', 'LOC', 'INT', 'GEN', 'DAT', 'ACT', 'ACC', 'ABL', '3SG', '3PL', '2SG', '2PL', '1SG', '1PL', 'SG', 'PL']
59
+ cats = sorted([x.lower() for x in cats], key=lambda x: (len(x), x), reverse=True)
60
+
61
+ for col in df.columns:
62
+ df.loc[:, col] = df[col].apply(lambda x: x.strip().lower())
63
+
64
+ def tag2list(t):
65
+ res = []
66
+ for c in cats:
67
+ if c in t:
68
+ res.append(c)
69
+ t = t.replace(c, "")
70
+ return res
71
+
72
+ df.loc[:, "Tag"] = df.Tag.apply(tag2list)
73
+
74
+ tdf = pd.read_csv(f"{folder}/test_data.csv")
75
+ tdf.pop("Tag")
76
+ for col in tdf.columns:
77
+ tdf.loc[:, col] = tdf[col].apply(lambda x: x.strip().lower())
78
+
79
+ return {"train": df.rename(columns={x: x.lower() for x in df.columns}), "test": tdf.rename(columns={x: x.lower() for x in tdf.columns})}
80
+
81
+ def get_preprocessed_dfs2(folder):
82
+ df = pd.read_csv(f"{folder}/train_data.csv").drop_duplicates()
83
+ df.loc[:, "Tag"] = df.Tag.apply(lambda x: "CAUS_2" if x.startswith("CAUS_") and x != "CAUS_1" else x)
84
+
85
+ for col in df.columns:
86
+ df.loc[:, col] = df[col].apply(lambda x: x.strip().lower())
87
+
88
+ tdf = pd.read_csv(f"{folder}/test_data.csv")
89
+ tdf.pop("Tag")
90
+ for col in tdf.columns:
91
+ tdf.loc[:, col] = tdf[col].apply(lambda x: x.strip().lower())
92
+
93
+ return {"train": df.rename(columns={x: x.lower() for x in df.columns}), "test": tdf.rename(columns={x: x.lower() for x in tdf.columns})}
94
+
95
+ def get_splits(df, test_size=0.2):
96
+ unique_roots = df.root.unique()
97
+ print("unique roots", len(unique_roots))
98
+ train, validation = train_test_split(unique_roots, test_size=test_size, random_state=2023)
99
+ print("unique train roots", len(train))
100
+ print("unique validation roots", len(validation))
101
+ train_df = df[df.root.isin(train)]
102
+ validation_df = df[df.root.isin(validation)]
103
+
104
+ return train_df, validation_df
105
+
106
+ def get_char2idx(all_splits, special_chars=("<pad>", "<s>", "</s>")):
107
+ charset = set()
108
+ for split, df in all_splits.items():
109
+ charset = charset.union("".join(df.apply(lambda r: r.root + r.affix, axis=1)))
110
+ return {x: i for i, x in enumerate(list(special_chars) + sorted(charset))}
111
+
112
+ def get_dataset(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
113
+ pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}
114
+
115
+ result = []
116
+
117
+ for r in split.itertuples():
118
+
119
+ input_ids = [char2idx["<s>"], pos2idx[r.pos_word], pos2idx[r.pos_root]]
120
+ attention_mask = [1, 1, 1]
121
+ token_type_ids = [0, 0, 0]
122
+
123
+ # print(r.word, r.root, r.affix)
124
+ for c in r.word:
125
+ input_ids.append(char2idx[c])
126
+ attention_mask.append(1)
127
+ token_type_ids.append(1)
128
+
129
+ for c in r.root:
130
+ input_ids.append(char2idx[c])
131
+ attention_mask.append(1)
132
+ token_type_ids.append(2)
133
+
134
+ for c in r.affix:
135
+ input_ids.append(char2idx[c])
136
+ attention_mask.append(1)
137
+ token_type_ids.append(3)
138
+
139
+ input_ids.append(char2idx["</s>"])
140
+ attention_mask.append(1)
141
+ token_type_ids.append(3)
142
+
143
+ input_ids = input_ids[:MAX_LEN]
144
+ attention_mask = attention_mask[:MAX_LEN]
145
+ token_type_ids = token_type_ids[:MAX_LEN]
146
+
147
+
148
+ for _ in range(MAX_LEN - len(input_ids)):
149
+ input_ids.append(char2idx["<pad>"])
150
+ attention_mask.append(0)
151
+ token_type_ids.append(3)
152
+
153
+ result.append(
154
+ {
155
+ "input_ids": input_ids,
156
+ "attention_mask": attention_mask,
157
+ "token_type_ids": token_type_ids,
158
+ }
159
+ )
160
+
161
+ if is_train:
162
+ result[-1] = (result[-1], [0 for _ in range(len(label2idx))])
163
+ for tag in r.tag:
164
+ result[-1][-1][label2idx[tag]] = 1
165
+
166
+
167
+ return result
168
+
169
+ def get_dataset3(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
170
+ pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}
171
+
172
+ result = []
173
+
174
+ for xs, r in enumerate(split.itertuples()):
175
+
176
+ input_ids = [char2idx["<s>"], pos2idx[r.pos_root]]
177
+ attention_mask = [1, 1]
178
+ token_type_ids = [0, 0]
179
+
180
+ for c in r.root:
181
+ input_ids.append(char2idx[c])
182
+ attention_mask.append(1)
183
+ token_type_ids.append(1)
184
+
185
+ for c in r.affix:
186
+ input_ids.append(char2idx[c])
187
+ attention_mask.append(1)
188
+ token_type_ids.append(2)
189
+
190
+ input_ids.append(char2idx["</s>"])
191
+ attention_mask.append(1)
192
+ token_type_ids.append(2)
193
+
194
+ input_ids = input_ids[:MAX_LEN]
195
+ attention_mask = attention_mask[:MAX_LEN]
196
+ token_type_ids = token_type_ids[:MAX_LEN]
197
+
198
+
199
+ for _ in range(MAX_LEN - len(input_ids)):
200
+ input_ids.append(char2idx["<pad>"])
201
+ attention_mask.append(0)
202
+ token_type_ids.append(2)
203
+
204
+ result.append(
205
+ {
206
+ "input_ids": input_ids,
207
+ "attention_mask": attention_mask,
208
+ "token_type_ids": token_type_ids,
209
+ }
210
+ )
211
+
212
+ if is_train:
213
+ result[-1] = (result[-1], label2idx[r.tag])
214
+
215
+ if xs + 1 % 1000 == 0:
216
+ print(input_ids)
217
+ print(attention_mask)
218
+ print(token_type_ids)
219
+
220
+ return result
221
+
222
+ def get_dataset2(split, char2idx, label2idx, max_len=MAX_LEN, is_train=True):
223
+ pos2idx = {x: i for i, x in enumerate(["noun", "verb", "num", "adjective"])}
224
+
225
+ result = []
226
+
227
+ for xs, r in enumerate(split.itertuples()):
228
+
229
+ input_ids = [char2idx["<s>"], pos2idx[r.pos_word], pos2idx[r.pos_root]]
230
+ attention_mask = [1, 1, 1]
231
+ token_type_ids = [0, 0, 0]
232
+
233
+ # print(r.word, r.root, r.affix)
234
+ for c in r.word:
235
+ input_ids.append(char2idx[c])
236
+ attention_mask.append(1)
237
+ token_type_ids.append(1)
238
+
239
+ for c in r.root:
240
+ input_ids.append(char2idx[c])
241
+ attention_mask.append(1)
242
+ token_type_ids.append(2)
243
+
244
+ for c in r.affix:
245
+ input_ids.append(char2idx[c])
246
+ attention_mask.append(1)
247
+ token_type_ids.append(3)
248
+
249
+ input_ids.append(char2idx["</s>"])
250
+ attention_mask.append(1)
251
+ token_type_ids.append(3)
252
+
253
+ input_ids = input_ids[:MAX_LEN]
254
+ attention_mask = attention_mask[:MAX_LEN]
255
+ token_type_ids = token_type_ids[:MAX_LEN]
256
+
257
+
258
+ for _ in range(MAX_LEN - len(input_ids)):
259
+ input_ids.append(char2idx["<pad>"])
260
+ attention_mask.append(0)
261
+ token_type_ids.append(3)
262
+
263
+ result.append(
264
+ {
265
+ "input_ids": input_ids,
266
+ "attention_mask": attention_mask,
267
+ "token_type_ids": token_type_ids,
268
+ }
269
+ )
270
+
271
+ if is_train:
272
+ result[-1] = (result[-1], label2idx[r.tag])
273
+
274
+ if xs + 1 % 10000 == 0:
275
+ print(input_ids)
276
+ print(attention_mask)
277
+ print(token_type_ids)
278
+
279
+
280
+ return result
281
+
282
+ def train_model(epochs=100, batch_size=400, data_folder="../Downloads/"):
283
+ dfs = get_preprocessed_dfs2(data_folder)
284
+ train, val = get_splits(dfs["train"])
285
+ char2idx = get_char2idx(dfs)
286
+ # label2idx = {j: i for i, j in enumerate(sorted(set([x for y in dfs["train"].tag for x in y])))}
287
+ label2idx = {l: i for i, l in enumerate(dfs["train"].tag.unique())}
288
+
289
+ model = MyModel2(config, label2idx, char2idx, 0.5)
290
+ checkpoint_callback = ModelCheckpoint(
291
+ dirpath="fmicro_weights",
292
+ save_top_k=3,
293
+ monitor="fmicro",
294
+ mode="max",
295
+ filename="{epoch}-{step}",
296
+ )
297
+ trainer = pl.Trainer(
298
+ deterministic=True,
299
+ max_epochs=epochs,
300
+ callbacks=[EarlyStopping(monitor="fmicro", mode="max"), checkpoint_callback],
301
+ log_every_n_steps=30,
302
+ )
303
+
304
+ train_dataset = MyDataset(train, char2idx, label2idx)
305
+ validation_dataset = MyDataset(val, char2idx, label2idx)
306
+ trainer.fit(model, DataLoader(train_dataset, batch_size=400, collate_fn=train_dataset.collate_fn), DataLoader(validation_dataset, batch_size=400, collate_fn=validation_dataset.collate_fn))
307
+
308
+ best_model_path = [c for c in trainer.callbacks if isinstance(c, ModelCheckpoint)][0].best_model_path
309
+
310
+ model.load_state_dict(torch.load(best_model_path)["state_dict"])
311
+
312
+ return model, train, val, dfs["test"]
313
+
314
+
315
+ class MyModel(pl.LightningModule):
316
+ def __init__(self, config, label2idx, threshold, *args, **kwargs):
317
+ super().__init__(*args, **kwargs)
318
+ self.threshold = threshold
319
+ self.char2idx = char2idx
320
+ self.label2idx = label2idx
321
+ self.idx2label = {i: l for l, i in label2idx.items()}
322
+ self.bert = BertModel(config)
323
+ self.dropout = torch.nn.Dropout(0.3)
324
+ self.proj = torch.nn.Linear(config.hidden_size, len(label2idx))
325
+
326
+
327
+ def common_step(self, batch):
328
+ X, _ = batch
329
+ hidden = self.bert(**X)[1]
330
+ return self.proj(self.dropout(hidden))
331
+
332
+ def training_step(self, batch, batch_idx):
333
+ # print(batch)
334
+ logits = self.common_step(batch)
335
+ loss = torch.nn.BCEWithLogitsLoss()(logits, batch[1].float())
336
+ self.log("train_loss", loss.mean(), on_step=True, on_epoch=True, prog_bar=True)
337
+
338
+ return loss
339
+
340
+ def validation_step(self, batch, batch_idx):
341
+ # print(batch[0]["input_ids"])
342
+ # print(batch[0]["token_type_ids"])
343
+ logits = self.common_step(batch)
344
+ # print(logits)
345
+ # print(batch[1])
346
+ loss = torch.nn.BCEWithLogitsLoss()(logits, batch[1].float())
347
+ self.log("loss", loss.mean(), prog_bar=True, on_epoch=True)
348
+
349
+ return logits, loss
350
+
351
+ def test_step(self, batch, batch_idx):
352
+ return self.common_step((batch, []))
353
+
354
+ def forward(self, batch, batch_idx):
355
+ return self.common_step((batch, []))
356
+
357
+ def configure_optimizers(self):
358
+ return AdamW(params=self.parameters())
359
+
360
+ class MyModel2(pl.LightningModule):
361
+ def __init__(self, config, label2idx, char2idx, threshold, *args, **kwargs):
362
+ super().__init__(*args, **kwargs)
363
+ self.threshold = threshold
364
+ self.char2idx = char2idx
365
+ self.fscore = 0.0
366
+ self.label2idx = label2idx
367
+ self.idx2label = {i: l for l, i in label2idx.items()}
368
+ self.bert = BertModel(config)
369
+ self.dropout = torch.nn.Dropout(0.3)
370
+ self.proj = torch.nn.Linear(config.hidden_size, len(label2idx))
371
+
372
+
373
+ def common_step(self, batch):
374
+ X, _ = batch
375
+ hidden = self.bert(**X)[1]
376
+ return self.proj(self.dropout(hidden))
377
+
378
+ def training_step(self, batch, batch_idx):
379
+ # print(batch)
380
+ logits = self.common_step(batch)
381
+ loss = torch.nn.CrossEntropyLoss()(logits.view(-1, len(self.label2idx)), batch[1].view(-1).long())
382
+ self.log("train_loss", loss.mean(), on_step=True, on_epoch=True, prog_bar=True)
383
+
384
+ return loss
385
+
386
+ def validation_step(self, batch, batch_idx):
387
+ # print(batch[0]["input_ids"])
388
+ # print(batch[0]["token_type_ids"])
389
+ logits = self.common_step(batch)
390
+ # print(logits)
391
+ # print(batch[1])
392
+ loss = torch.nn.CrossEntropyLoss()(logits.view(-1, len(self.label2idx)), batch[1].view(-1).long())
393
+ for p in logits:
394
+ self.predos.append(self.idx2label[p.argmax().cpu().item()])
395
+ for t in batch[1]:
396
+ self.trues.append(self.idx2label[t.cpu().item()])
397
+ self.log("loss", loss.mean(), prog_bar=True, on_epoch=True)
398
+ self.log("fmicro", self.fscore, prog_bar=True, on_epoch=True)
399
+
400
+ return logits, loss
401
+
402
+ def on_validation_start(self):
403
+ self.predos = []
404
+ self.trues = []
405
+
406
+ def on_validation_end(self):
407
+ self.fscore = f1_score(self.trues, self.predos, average="micro")
408
+
409
+ def test_step(self, batch, batch_idx):
410
+ return self.common_step((batch, []))
411
+
412
+ def forward(self, batch, batch_idx):
413
+ return self.common_step((batch, []))
414
+
415
+ def configure_optimizers(self):
416
+ return AdamW(params=self.parameters())
417
+
418
+ def predict(self, dataloader):
419
+ pass
420
+
dev.ipynb ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 35,
13
+ "metadata": {},
14
+ "outputs": [],
15
+ "source": [
16
+ "import random\n",
17
+ "import numpy as np\n",
18
+ "import pandas as pd\n",
19
+ "from sklearn.model_selection import train_test_split\n",
20
+ "from sklearn.ensemble import RandomForestClassifier\n",
21
+ "from sklearn.metrics import f1_score\n",
22
+ "from sklearn.preprocessing import LabelEncoder"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 36,
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "SEED = 1\n",
32
+ "random.seed(SEED)\n",
33
+ "np.random.seed(SEED)"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 37,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
43
+ "test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])\n",
44
+ "df = pd.concat([train, test], ignore_index=True)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 38,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']]\n",
54
+ "y = df['Tag']"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 39,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "X_pr = pd.get_dummies(X)\n",
64
+ "le = LabelEncoder()\n",
65
+ "y = le.fit_transform(y)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 40,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "train_X = X_pr.iloc[:train.shape[0]]\n",
75
+ "train_y = y[:train.shape[0]]\n",
76
+ "train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 41,
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "text/html": [
87
+ "<style>#sk-container-id-4 {color: black;background-color: white;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=1)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">RandomForestClassifier</label><div class=\"sk-toggleable__content\"><pre>RandomForestClassifier(random_state=1)</pre></div></div></div></div></div>"
88
+ ],
89
+ "text/plain": [
90
+ "RandomForestClassifier(random_state=1)"
91
+ ]
92
+ },
93
+ "execution_count": 41,
94
+ "metadata": {},
95
+ "output_type": "execute_result"
96
+ }
97
+ ],
98
+ "source": [
99
+ "rf = RandomForestClassifier(n_estimators=100, random_state=SEED)\n",
100
+ "rf.fit(train_X, train_y)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": 42,
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "rf_predict_result = rf.predict(val_X)"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 43,
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "name": "stdout",
119
+ "output_type": "stream",
120
+ "text": [
121
+ "F1 score: 0.9099025974025974\n"
122
+ ]
123
+ }
124
+ ],
125
+ "source": [
126
+ "f1_micro = f1_score(val_y, rf_predict_result, average='micro')\n",
127
+ "print(\"F1 score:\", f1_micro)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 44,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "test_X = X_pr.iloc[train.shape[0]:]\n",
137
+ "predictions = rf.predict(test_X)"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 45,
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "test['Tag'] = le.inverse_transform(predictions)\n",
147
+ "test[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True)"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": null,
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": [
156
+ " "
157
+ ]
158
+ }
159
+ ],
160
+ "metadata": {
161
+ "kernelspec": {
162
+ "display_name": "myenv",
163
+ "language": "python",
164
+ "name": "python3"
165
+ },
166
+ "language_info": {
167
+ "codemirror_mode": {
168
+ "name": "ipython",
169
+ "version": 3
170
+ },
171
+ "file_extension": ".py",
172
+ "mimetype": "text/x-python",
173
+ "name": "python",
174
+ "nbconvert_exporter": "python",
175
+ "pygments_lexer": "ipython3",
176
+ "version": "3.11.0"
177
+ },
178
+ "orig_nbformat": 4
179
+ },
180
+ "nbformat": 4,
181
+ "nbformat_minor": 2
182
+ }
dev.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.metrics import f1_score
7
+ from sklearn.preprocessing import LabelEncoder
8
+
9
+ SEED = 1
10
+ random.seed(SEED)
11
+ np.random.seed(SEED)
12
+
13
+ train = pd.read_csv('train_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])
14
+ test = pd.read_csv('test_lr.csv').sort_values(by=['PoS_word', 'Tag', 'Affix'])
15
+ df = pd.concat([train, test], ignore_index=True)
16
+
17
+ X = df[['Word', 'Root', 'Affix', 'PoS_root', 'PoS_word']]
18
+ y = df['Tag']
19
+
20
+ X_pr = pd.get_dummies(X)
21
+ le = LabelEncoder()
22
+ y = le.fit_transform(y)
23
+
24
+ train_X = X_pr.iloc[:train.shape[0]]
25
+ train_y = y[:train.shape[0]]
26
+ train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.05, random_state=SEED)
27
+
28
+ rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
29
+ rf.fit(train_X, train_y)
30
+
31
+ rf_predict_result = rf.predict(val_X)
32
+
33
+ f1_micro = f1_score(val_y, rf_predict_result, average='micro')
34
+ print("F1 score:", f1_micro)
35
+
36
+ test_X = X_pr.iloc[train.shape[0]:]
37
+ predictions = rf.predict(test_X)
38
+
39
+ test['Tag'] = le.inverse_transform(predictions)
40
+ test[['Word', 'Root', 'Affix', 'Tag']].to_csv('my_submission2.csv', index=False, header=True)
image_2023-05-13_16-58-05.png ADDED
logistic_regression.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
test_fixed.csv ADDED
The diff for this file is too large to render. See raw diff
 
train.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split, GridSearchCV
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.metrics import accuracy_score, f1_score
7
+ import joblib
8
+ from scipy.sparse import hstack
9
+
10
+
11
+ # Read the data from the CSV file
12
+
13
+ from collections import defaultdict
14
+
15
+ def split_train_left_right(data):
16
+ sorted = data.sort_values(['Tag', 'Affix'])
17
+ sorted = sorted.drop_duplicates(subset=['Word', 'Tag'])
18
+
19
+ tags = defaultdict(list)
20
+
21
+ left = []
22
+ right = []
23
+
24
+ for i, row in sorted.iterrows():
25
+ # word = f"{row['Word']}{row['Affix']}"
26
+ word = row['Word']
27
+ tag = row['Tag']
28
+
29
+ if tags[word] and (tag not in tags[word]):
30
+ # print(tag not in tags['word'])
31
+ left.append(row)
32
+ else:
33
+ right.append(row)
34
+
35
+ tags[word].append(tag)
36
+
37
+ right_df = pd.DataFrame(right)
38
+ left_df = pd.DataFrame(left)
39
+
40
+ return right_df, left_df
41
+
42
+ filepath = "train_fixed.csv"
43
+ data = pd.read_csv(filepath)
44
+
45
+ right_df, left_df = split_train_left_right(data)
46
+ # right_df = pd.read_csv('right.csv')
47
+ # left_df = pd.read_csv('left.csv')
48
+
49
+
50
+ # %%
51
+ for (side, df) in [('right', right_df), ('left', left_df)]:
52
+ # Get unique categories from "PoS_word" column
53
+ categories = df["PoS_word"].unique()
54
+
55
+ category_res = {}
56
+
57
+ for category in categories:
58
+ print(f"Category: {category}")
59
+
60
+ # Filter data for the current category
61
+ category_data = df[df["PoS_word"] == category]
62
+ print(category_data.shape)
63
+
64
+ category_data['text_length'] = category_data['Affix'].apply(lambda x: len(x))
65
+ category_data['word_length'] = category_data['Word'].apply(lambda x: len(x))
66
+ category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
67
+ category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))
68
+
69
+ # Splitting data into train and test
70
+ X = category_data["Affix"]
71
+ y = category_data["Tag"]
72
+ # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
73
+
74
+ # Feature extraction
75
+ vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(1, 5))
76
+ X_train_tfidf = vectorizer.fit_transform(X)
77
+ # print(len(vectorizer.vocabulary_))
78
+
79
+ X_train_combined = hstack([X_train_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])
80
+ # X_test_combined = hstack([X_test_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])
81
+
82
+ # X_test_vec = vectorizer.transform(X)
83
+
84
+ model = RandomForestClassifier(n_estimators=300)
85
+ model.fit(X_train_combined, y)
86
+
87
+ # Save the best model for the category
88
+ # category_models[category] = (model, vectorizer)
89
+
90
+ # Predict on the test data using the best model
91
+ y_pred = model.predict(X_train_combined)
92
+
93
+ # res_df = pd.DataFrame()
94
+ # res_df['pred'] = y_pred
95
+ # res_df['orig'] = y
96
+ category_data['pred'] = y_pred
97
+ category_res[category] = category_data
98
+
99
+
100
+ # Calculate accuracy and F1 score
101
+ accuracy = accuracy_score(y, y_pred)
102
+ f1 = f1_score(y, y_pred, average="weighted")
103
+
104
+
105
+
106
+ print("Accuracy:", accuracy)
107
+ print("F1 Score:", f1)
108
+ print(model)
109
+
110
+ # Save the models and vectorizers
111
+ # for category, (model, vectorizer) in category_models.items():
112
+ model_filepath = f"artefacts/model_{category}_{side}.joblib"
113
+ vectorizer_filepath = f"artefacts/vectorizer_{category}_{side}.joblib"
114
+ joblib.dump(model, model_filepath)
115
+ joblib.dump(vectorizer, vectorizer_filepath)
116
+
117
+ # %%
118
+ filepath = "test_fixed.csv"
119
+ data = pd.read_csv(filepath)
120
+
121
+
122
+ def split_test_left_right(data):
123
+ sorted = data.sort_values(['Affix'])
124
+ # sorted = sorted.drop_duplicates(subset=['Word', 'Tag'])
125
+
126
+ tags = defaultdict(list)
127
+
128
+ left = []
129
+ right = []
130
+
131
+ for i, row in sorted.iterrows():
132
+ word = row['Word']
133
+
134
+ if tags[word]:
135
+ # print(tag not in tags['word'])
136
+ left.append(row)
137
+ else:
138
+ right.append(row)
139
+ tags[word].append(word)
140
+
141
+
142
+ right_df = pd.DataFrame(right)
143
+ left_df = pd.DataFrame(left)
144
+
145
+ return right_df, left_df
146
+
147
+ right_df, left_df = split_test_left_right(data)
148
+ # right_df = pd.read_csv('right.csv')
149
+ # left_df = pd.read_csv('left.csv')
150
+ # left_df[left_df['Word'] == 'божомолдчу']
151
+
152
+ # %%
153
+ result_dfs = []
154
+ for (side, df) in [('right', right_df), ('left', left_df)]:
155
+ # Get unique categories from "PoS_word" column
156
+ print(side)
157
+ categories = df["PoS_word"].unique()
158
+
159
+ # category_models = {}
160
+
161
+ for category in categories:
162
+ print(f"Category: {category}, side: {side}")
163
+
164
+ # Filter data for the current category
165
+ category_data = df[df["PoS_word"] == category]
166
+ print(category_data.shape)
167
+
168
+
169
+ category_data['text_length'] = category_data['Affix'].apply(lambda x: len(x))
170
+ category_data['word_length'] = category_data['Word'].apply(lambda x: len(x))
171
+ category_data['ү_count'] = category_data['Word'].apply(lambda x: x.count('ү'))
172
+ category_data['ө_count'] = category_data['Word'].apply(lambda x: x.count('ө'))
173
+
174
+
175
+ # Splitting data into train and test
176
+ X = category_data["Affix"]
177
+ y = category_data["Tag"]
178
+ # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
179
+
180
+
181
+
182
+ # Feature extraction
183
+ vectorizer = joblib.load(f"artefacts/vectorizer_{category}_{side}.joblib")
184
+ X_train_tfidf = vectorizer.transform(X)
185
+
186
+
187
+ # X_test_vec = vectorizer.transform(X)
188
+
189
+ model = joblib.load(f"artefacts/model_{category}_{side}.joblib")
190
+
191
+ # Save the best model for the category
192
+ # category_models[category] = (model, vectorizer)
193
+
194
+ X_train_combined = hstack([X_train_tfidf, category_data[['text_length', 'ү_count', 'ө_count']]])
195
+ # X
196
+ # Predict on the test data using the best model
197
+ y_pred = model.predict(X_train_combined)
198
+
199
+ category_data['Tag'] = y_pred
200
+ result_dfs.append(category_data)
201
+ # %%
202
+
203
+ pd.concat(result_dfs).to_csv('file_pred_12.csv', index=False)
204
+
205
+ # %%
train_fixed.csv ADDED
The diff for this file is too large to render. See raw diff