LCA-PORVID committed on
Commit
ebdb5af
·
verified ·
1 Parent(s): 0832b34

Upload 34 files

Browse files
Files changed (32) hide show
  1. .gitignore +195 -0
  2. pt_variety_identifier/.gitignore +1 -0
  3. pt_variety_identifier/__init__.py +0 -0
  4. pt_variety_identifier/src/__init__.py +0 -0
  5. pt_variety_identifier/src/bert/.gitignore +3 -0
  6. pt_variety_identifier/src/bert/data.py +53 -0
  7. pt_variety_identifier/src/bert/in/.gitkeep +0 -0
  8. pt_variety_identifier/src/bert/main.py +177 -0
  9. pt_variety_identifier/src/bert/model.py +51 -0
  10. pt_variety_identifier/src/bert/out/.gitkeep +0 -0
  11. pt_variety_identifier/src/bert/results.py +35 -0
  12. pt_variety_identifier/src/bert/tester.py +166 -0
  13. pt_variety_identifier/src/bert/trainer.py +108 -0
  14. pt_variety_identifier/src/data.py +106 -0
  15. pt_variety_identifier/src/delexicalizer.py +38 -0
  16. pt_variety_identifier/src/n_grams/.gitignore +2 -0
  17. pt_variety_identifier/src/n_grams/__init__.py +0 -0
  18. pt_variety_identifier/src/n_grams/data.py +20 -0
  19. pt_variety_identifier/src/n_grams/in/.gitkeep +0 -0
  20. pt_variety_identifier/src/n_grams/in/best_params.json +79 -0
  21. pt_variety_identifier/src/n_grams/in/params.json +45 -0
  22. pt_variety_identifier/src/n_grams/in/params1.json +47 -0
  23. pt_variety_identifier/src/n_grams/main.py +121 -0
  24. pt_variety_identifier/src/n_grams/model.py +99 -0
  25. pt_variety_identifier/src/n_grams/out/.gitkeep +0 -0
  26. pt_variety_identifier/src/n_grams/results.py +56 -0
  27. pt_variety_identifier/src/n_grams/tester.py +57 -0
  28. pt_variety_identifier/src/n_grams/trainer.py +73 -0
  29. pt_variety_identifier/src/results.py +41 -0
  30. pt_variety_identifier/src/tunning.py +47 -0
  31. pt_variety_identifier/src/utils.py +19 -0
  32. setup.py +24 -0
.gitignore ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ ### VisualStudioCode ###
177
+ .vscode/*
178
+ !.vscode/settings.json
179
+ !.vscode/tasks.json
180
+ !.vscode/launch.json
181
+ !.vscode/extensions.json
182
+ !.vscode/*.code-snippets
183
+
184
+ # Local History for Visual Studio Code
185
+ .history/
186
+
187
+ # Built Visual Studio Code Extensions
188
+ *.vsix
189
+
190
+ ### VisualStudioCode Patch ###
191
+ # Ignore all local history of files
192
+ .history
193
+ .ionide
194
+
195
+ # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
pt_variety_identifier/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ thrash/*
pt_variety_identifier/__init__.py ADDED
File without changes
pt_variety_identifier/src/__init__.py ADDED
File without changes
pt_variety_identifier/src/bert/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.txt
2
+ *.pt
3
+ *.json
pt_variety_identifier/src/bert/data.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import BertTokenizerFast
3
+ from pt_variety_identifier.src.data import Data as BaseData
4
+ from torch.utils.data import DataLoader
5
+
6
+
7
class Data(BaseData):
    """BERT-specific data access: tokenizes the base datasets and wraps them in DataLoaders."""

    def __init__(self, dataset_name, tokenizer_name, batch_size, test_set_list):
        super().__init__(dataset_name=dataset_name, test_set_list=test_set_list)

        self.tokenizer_name = tokenizer_name
        self.tokenizer = BertTokenizerFast.from_pretrained(self.tokenizer_name)
        self.batch_size = batch_size

    def _tokenize(self, example):
        # Fixed-length (512) encoding so every batch stacks cleanly.
        return self.tokenizer(example['text'], padding='max_length', truncation=True, max_length=512)

    def _adapt_dataset(self, dataset):
        """Tokenize *dataset* and return a shuffling DataLoader yielding torch tensors."""
        tokenized = dataset.map(self._tokenize, batched=True)

        # Only the model inputs and the label are materialized as tensors.
        tokenized.set_format(type='torch', columns=[
            'input_ids', 'attention_mask', 'label'])

        return DataLoader(tokenized, batch_size=self.batch_size, shuffle=True)

    def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
        """Load one training domain via the base class, then adapt it for BERT."""
        raw = super().load_domain(domain=domain, balance=balance,
                                  pos_prob=pos_prob, ner_prob=ner_prob, sample_size=sample_size)

        return self._adapt_dataset(raw)

    def load_validation_set(self):
        """Return {domain: DataLoader} for every validation domain."""
        return {domain: self._adapt_dataset(ds)
                for domain, ds in super().load_validation_set().items()}

    def load_test_set(self, filter_label_2=False):
        """Return the adapted external test sets merged with the validation loaders."""
        adapted = {name: self._adapt_dataset(ds)
                   for name, ds in super().load_test_set(filter_label_2).items()}

        # Validation domains are evaluated alongside the external test sets.
        adapted.update(self.load_validation_set())

        return adapted
pt_variety_identifier/src/bert/in/.gitkeep ADDED
File without changes
pt_variety_identifier/src/bert/main.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ import time
4
+ from pt_variety_identifier.src.utils import setup_logger, create_output_dir
5
+ from pt_variety_identifier.src.bert.data import Data
6
+ from tqdm import tqdm
7
+ from pt_variety_identifier.src.tunning import Tunning
8
+ from pt_variety_identifier.src.bert.trainer import Trainer
9
+ from pt_variety_identifier.src.bert.tester import Tester
10
+ from pt_variety_identifier.src.bert.results import Results
11
+ from pt_variety_identifier.src.bert.model import EnsembleIdentfier, LanguageIdentfier
12
+ import torch.multiprocessing as mp
13
+ from threading import Thread
14
+ import logging
15
+ import numpy as np
16
+
17
class Run:
    """End-to-end driver: hyper-parameter tuning, per-domain training, and testing
    of the BERT-based Portuguese-variety identifiers.

    GPU work is fanned out over threads; a semaphore with one slot per visible
    GPU plus the `gpus_free` list of idle device indices does the scheduling.
    """

    def __init__(self, dataset_name, tokenizer_name, model_name, batch_size, test_set_list) -> None:
        self.CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
        self.CURRENT_TIME = int(time.time())

        # One semaphore slot per visible GPU; gpus_free holds idle device indices.
        self.num_gpus = torch.cuda.device_count()
        self.sem = mp.Semaphore(self.num_gpus)
        self.gpus_free = [i for i in range(self.num_gpus)]

        self.test_set_list = test_set_list

        create_output_dir(self.CURRENT_PATH, self.CURRENT_TIME)

        setup_logger(self.CURRENT_PATH, self.CURRENT_TIME)

        self.data = Data(
            dataset_name, tokenizer_name=tokenizer_name, batch_size=batch_size, test_set_list=test_set_list)

        self._DOMAINS = ['literature', 'legal', 'politics', 'web', 'social_media', 'journalistic']

        self.model_name = model_name

        # Enables progress_apply on pandas objects (used during delexicalization).
        tqdm.pandas()

    def tune_with_gpu(self):
        """Grid-search delexicalization probabilities, one (pos, ner) pair per GPU."""
        threads = []

        # BUGFIX: the original wrapped np.arange in range(), which raises
        # TypeError (range() needs an int) -- iterate the arange directly.
        for pos_prob in tqdm(np.arange(0.0, 1.0, 0.1)):
            for ner_prob in tqdm(np.arange(0.0, 1.0, 0.2)):

                # Guard against float-accumulation noise from arange.
                pos_prob = round(pos_prob, 2)
                ner_prob = round(ner_prob, 2)

                # Block until a GPU slot frees up, then claim an idle device.
                self.sem.acquire()

                gpu_in_use = self.gpus_free.pop()

                tuner = Tunning(self.data, self._DOMAINS,
                                Results, Trainer, Tester, 5_000,
                                self.CURRENT_PATH, self.CURRENT_TIME,
                                params={
                                    'epochs': 30,
                                    'early_stoping': 5,
                                    'model_name': self.model_name,
                                    'device': f"cuda:{gpu_in_use}",
                                    'sem': self.sem,
                                    'gpus_free': self.gpus_free,
                                })

                thread = Thread(target=tuner.run, args=(
                    pos_prob, pos_prob, ner_prob, ner_prob), daemon=True
                )

                threads.append(thread)

                # BUGFIX: threads were created but never started, so the
                # join() below raised RuntimeError and no tuning ever ran.
                thread.start()

        for t in threads:
            t.join()

    def tune_with_cpu(self):
        """Single-process fallback tuning when no GPU is available."""
        tuner = Tunning(self.data, self._DOMAINS,
                        Results, Trainer, Tester, 5_000,
                        self.CURRENT_PATH, self.CURRENT_TIME,
                        params={
                            'epochs': 30,
                            'early_stoping': 5,
                            'model_name': self.model_name,
                            'device': 'cpu',
                        })

        tuner.run()

    def tune(self):
        """Dispatch to GPU tuning when CUDA is available, CPU otherwise."""
        if torch.cuda.is_available():
            return self.tune_with_gpu()

        return self.tune_with_cpu()

    def _train_domain(self, domain, gpu):
        """Train one domain on device string *gpu* (e.g. 'cuda:0'), then release the slot."""
        logging.info(f"Training {domain} domain")

        data = self.data.load_domain(domain, balance=True, pos_prob=None, ner_prob=None)

        validation_dataset_dict = self.data.load_validation_set()

        trainer = Trainer(data, params={
            'epochs': 30,
            'early_stoping': 5,
            'model_name': self.model_name,
            'device': gpu,
            'CURRENT_PATH': self.CURRENT_PATH,
            'CURRENT_TIME': self.CURRENT_TIME,
            'training_domain': domain,
        }, validation_dataset_dict=validation_dataset_dict)

        best_results = trainer.train()

        logging.info(f"Best results for {domain} domain: {best_results}")

        # BUGFIX: the original appended gpu[-1] (a single character string) to
        # gpus_free, which holds ints and is wrong for device indices >= 10.
        gpu_index = int(gpu.split(':', 1)[1])

        logging.info(f"Freeing cuda:{gpu_index}")

        self.gpus_free.append(gpu_index)

        return self.sem.release()

    def train(self):
        """Train the combined 'all'-domains model (one thread per claimed GPU)."""
        threads = []

        for domain in ['all']:
            self.sem.acquire()

            gpu_in_use = self.gpus_free.pop()

            thread = Thread(target=self._train_domain, args=(domain, f"cuda:{gpu_in_use}"), daemon=True)

            threads.append(thread)

            thread.start()

        for t in threads:
            t.join()

    def test(self):
        """Evaluate the saved 'all' checkpoint on the external test sets."""
        model = LanguageIdentfier(self.model_name)

        logging.info(f"Loading model from {os.path.join(self.CURRENT_PATH, 'out', str(self.CURRENT_TIME), 'models', 'all.pt')}")

        model.load_state_dict(torch.load(os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models", "all.pt")))

        model.eval()
        model.to('cuda')

        data = self.data.load_test_set(filter_label_2=True)

        tester = Tester(data, model, None)

        results = tester.validate()

        logging.info(f"Results for all: {results}")

    def test_ensemble(self):
        """Evaluate the bagged ensemble of every saved checkpoint on the test sets."""
        data = self.data.load_test_set(filter_label_2=True)

        ensemble = EnsembleIdentfier(os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models"), self.model_name)

        tester = Tester(data, ensemble, None)

        results = tester.test()

        logging.info(f"Results for ensemble: {results}")
pt_variety_identifier/src/bert/model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import BertModel
3
+ import os
4
+
5
class EnsembleIdentfier(torch.nn.Module):
    """Bagging ensemble: loads every *.pt checkpoint in a directory as one
    LanguageIdentfier member and stacks their per-example outputs."""

    def __init__(self, models_path, model_name):
        super().__init__()
        self.model_name = model_name

        self.models = torch.nn.ModuleList()

        # Each *.pt checkpoint under models_path becomes one ensemble member.
        for checkpoint in os.listdir(models_path):
            if not checkpoint.endswith(".pt"):
                continue
            member = LanguageIdentfier(self.model_name)
            member.load_state_dict(torch.load(os.path.join(models_path, checkpoint)))
            member.eval()
            self.models.append(member)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, input_ids, attention_mask):
        """Return a (num_members, batch) tensor of per-member outputs."""
        stacked = torch.zeros(len(self.models), input_ids.shape[0]).to(self.device)

        # Members are shuttled to the device one at a time to bound memory use.
        for idx, member in enumerate(self.models):
            member.to(self.device)
            stacked[idx] = member(input_ids, attention_mask=attention_mask).squeeze(dim=1)
            member.cpu()

        return stacked
31
+
32
+
33
class LanguageIdentfier(torch.nn.Module):
    """BERT encoder + dropout + single-unit sigmoid head for binary variety classification."""

    def __init__(self, model_name):
        super().__init__()
        self.model = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(self.model.config.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid probabilities of shape (batch, 1)."""
        encoder_out = self.model(input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is the pooled representation.
        pooled = self.dropout(encoder_out[1])
        return self.sigmoid(self.linear(pooled))
pt_variety_identifier/src/bert/out/.gitkeep ADDED
File without changes
pt_variety_identifier/src/bert/results.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.results import Results as BaseResults
2
+ import logging
3
+
4
class Results(BaseResults):
    """BERT flavour of Results: tracks the best cross-domain F1 per training domain."""

    def __init__(self, filepath, DOMAINS) -> None:
        super().__init__(filepath, DOMAINS)

    def process(self, cross_domain_f1, train_domain, test_results, train_results, balance, pos_prob, ner_prob):
        """Record this run if *cross_domain_f1* beats the best seen for *train_domain*."""
        best = self.best_f1_scores[train_domain]

        # Guard clause: nothing to do unless this run is a new best.
        if cross_domain_f1 <= best["cross_domain_f1"]:
            return

        logging.info(f"New best f1 score for {train_domain}")

        best["cross_domain_f1"] = cross_domain_f1
        best["test_results"] = test_results
        best["balance"] = balance
        best["pos_prob"] = pos_prob
        best["ner_prob"] = ner_prob

        logging.info(
            f"Saving best cross_domain_f1 scores to file")

        self.best_final_results()

        #TODO: Save PyTorch model

        self.best_intermediate_results({
            "domain": train_domain,
            "balance": balance,
            "pos_prob": pos_prob,
            "ner_prob": ner_prob,
            "train": train_results,
            "test": {
                'all': test_results,
                'cross_domain_f1': cross_domain_f1
            }
        })
pt_variety_identifier/src/bert/tester.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import evaluate
3
+ from tqdm import tqdm
4
+ import logging
5
+
6
class Tester:
    """Evaluates a variety-identifier model over a dict of name -> DataLoader.

    `validate` averages metrics across the non-training domains (used during
    training); `test` additionally bags per-member logits before thresholding
    (ensemble evaluation).
    """

    def __init__(self, test_dataset_dict, model, train_domain) -> None:
        self.test_dataset_dict = test_dataset_dict
        self.model = model
        self.train_domain = train_domain

        self.accuracy = evaluate.load("accuracy")
        self.f1 = evaluate.load("f1")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        # Model outputs sigmoid probabilities, hence BCELoss (not BCEWithLogits).
        self.loss_fn = torch.nn.BCELoss()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def _evaluate(self, test_dataset, bagging=False):
        """Shared metric loop for one DataLoader.

        Returns (accuracy, f1, precision, recall, mean_loss). When *bagging*
        is True the model output is averaged over its first axis (ensemble
        members) before loss/thresholding.

        REFACTOR: the original `_validate` and `_test` were near-identical
        copies of this loop differing only in the bagging step.
        """
        with torch.no_grad():
            total_loss = 0

            for batch in tqdm(test_dataset):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                logits = self.model(input_ids, attention_mask=attention_mask).squeeze(dim=1)

                if bagging:
                    logits = self._bagging(logits)

                loss = self.loss_fn(logits, labels.float())

                # Probabilities above 0.5 are class 1; move off-GPU for metrics.
                predictions = (logits > 0.5).long().cpu()
                labels = labels.cpu()

                # add_batch mutates the metric accumulators in place and
                # returns None (the original uselessly bound its result).
                self.accuracy.add_batch(predictions=predictions, references=labels)
                self.f1.add_batch(predictions=predictions, references=labels)
                self.precision.add_batch(predictions=predictions, references=labels)
                self.recall.add_batch(predictions=predictions, references=labels)

                total_loss += loss.item()

        accuracy = self.accuracy.compute()['accuracy']
        f1 = self.f1.compute()['f1']
        precision = self.precision.compute()['precision']
        recall = self.recall.compute()['recall']

        return accuracy, f1, precision, recall, total_loss / len(test_dataset)

    def _validate(self, test_dataset):
        # Kept for backward compatibility: plain (non-bagged) evaluation.
        return self._evaluate(test_dataset, bagging=False)

    def validate(self):
        """Evaluate every domain, drop the training domain, and average the rest.

        NOTE(review): returns a bare `results` dict when nothing remains after
        dropping the training domain, but a (results, average_results) tuple
        otherwise -- callers must handle both shapes; verify against Trainer,
        which appears to index a single dict.
        """
        self.model.eval()
        self.model.to(self.device)

        results = {}
        average_results = {}

        for domain in self.test_dataset_dict.keys():
            logging.info(f"Testing {domain} domain...")
            accuracy, f1, precision, recall, total_loss = self._validate(self.test_dataset_dict[domain])

            results[domain] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'loss': total_loss
            }

        # The training domain does not count towards cross-domain metrics.
        if self.train_domain in results.keys():
            results.pop(self.train_domain)

        if len(results.keys()) == 0:
            logging.info("Only one domain to test, returning results")
            return results

        # Average each metric over the remaining (cross) domains.
        for metric in ['accuracy', 'f1', 'precision', 'recall', 'loss']:
            average_results[metric] = sum([results[domain][metric] for domain in results.keys()]) / len(results.keys())

        return results, average_results

    # Migrate this method to Model
    def _bagging(self, logits):
        """Average per-member logits over the model axis (dim 0)."""
        return torch.mean(logits, dim=0)

    def _test(self, test_dataset):
        # Kept for backward compatibility: bagged (ensemble) evaluation.
        return self._evaluate(test_dataset, bagging=True)

    def test(self):
        """Run bagged evaluation over every configured test set, logging each result."""
        results = {}

        for test_set in self.test_dataset_dict.keys():
            logging.info(f"Testing {test_set} dataset")
            accuracy, f1, precision, recall, total_loss = self._test(self.test_dataset_dict[test_set])

            results[test_set] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'loss': total_loss
            }

            logging.info(f"Results for {test_set} dataset: {results[test_set]}")

        return results
pt_variety_identifier/src/bert/trainer.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from tqdm import tqdm
3
+ import logging
4
+ from pt_variety_identifier.src.bert.model import LanguageIdentfier
5
+ from pt_variety_identifier.src.bert.tester import Tester
6
+ import math
7
+ import os
8
+
9
+
10
class Trainer:
    """Fine-tunes a LanguageIdentfier, checkpointing on validation metrics.

    Expected keys in ``params``: 'model_name', 'epochs', 'early_stoping',
    'device', 'CURRENT_PATH', 'CURRENT_TIME', and optionally
    'training_domain' (defaults to 'all').
    """

    def __init__(self, train_dataset, params, validation_dataset_dict=None) -> None:
        self.train_dataset = train_dataset

        self.model = LanguageIdentfier(params['model_name'])

        self.epochs = params['epochs']
        self.lr = 1e-5
        # The model already applies a sigmoid, hence BCELoss (not BCEWithLogitsLoss).
        self.loss_fn = torch.nn.BCELoss()
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr)
        self.early_stoping = params['early_stoping']

        # LR is reduced when training loss plateaus; patience is half the
        # early-stopping budget.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, patience=self.early_stoping//2, verbose=True)

        self.device = params['device']
        self.CURRENT_PATH = params['CURRENT_PATH']
        self.CURRENT_TIME = params['CURRENT_TIME']
        self.training_domain = params['training_domain'] if 'training_domain' in params else 'all'

        self.validator = None

        print(f"Using {self.device} device")

        # Validation is optional; without it train() never saves a checkpoint.
        if validation_dataset_dict:
            self.validator = Tester(
                test_dataset_dict=validation_dataset_dict,
                model=self.model,
                train_domain=self.training_domain,
            )

    def _epoch_iter(self):
        """Run one training epoch and return the mean per-batch loss."""
        self.model.train()
        self.model.to(self.device)
        self.optimizer.zero_grad()

        with torch.enable_grad():
            total_loss = 0

            for batch in tqdm(self.train_dataset):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device, dtype=torch.float)

                # Model emits (batch, 1) probabilities; squeeze to match labels.
                outputs = self.model(
                    input_ids, attention_mask=attention_mask).squeeze(dim=1)
                loss = self.loss_fn(outputs, labels)

                loss.backward()

                self.optimizer.step()
                self.optimizer.zero_grad()

                total_loss += loss.item()

        # Note: the scheduler steps on the *summed* epoch loss, while the
        # method returns the *mean* -- intentional or not, they differ by a
        # constant factor so the plateau behaviour is the same.
        self.scheduler.step(total_loss)

        return total_loss / len(self.train_dataset)

    def train(self):
        """Train up to self.epochs, keeping the checkpoint with the best validation metrics.

        Returns the best validation metrics seen (sentinel +/-inf values if no
        validator was configured).
        """
        logging.info(f"Training model in {self.device}...")

        best_results = {
            'f1': -math.inf,
            'accuracy': -math.inf,
            'precision': -math.inf,
            'recall': -math.inf,
            'loss': math.inf
        }

        for epoch in tqdm(range(self.epochs)):
            training_loss = self._epoch_iter()

            if self.validator:
                # NOTE(review): Tester.validate() returns a
                # (per-domain, averaged) tuple when several validation
                # domains remain, in which case the ['loss'] indexing below
                # would raise -- confirm the intended return shape.
                results = self.validator.validate()

                logging.info(f"Results for {self.training_domain} domain: {results} Epoch: {epoch}")

                # Checkpoint only when BOTH loss and F1 improve simultaneously.
                if results['loss'] < best_results['loss'] and results['f1'] > best_results['f1']:
                    logging.info(
                        f"Saving best model... Domain:{self.training_domain} F1:{results['f1']} and Test Loss:{results['loss']}")

                    best_results['loss'] = results['loss']
                    best_results['accuracy'] = results['accuracy']
                    best_results['f1'] = results['f1']
                    best_results['recall'] = results['recall']
                    best_results['precision'] = results['precision']

                    torch.save(self.model.state_dict(), os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models", f'{self.training_domain}.pt'))
                else:
                    logging.info(f"Not saving model... F1:{results['f1']} and Test Loss:{results['loss']}")

            logging.info(f"Epoch {epoch} Training Loss: {training_loss}")

            # Heuristic hard stop once the training loss is very small.
            if training_loss < 0.1:
                logging.info(f"Training Loss is too low, stoping training...")
                break

        return best_results
pt_variety_identifier/src/data.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset
2
+ import logging
3
+ from imblearn.under_sampling import RandomUnderSampler
4
+ import pandas as pd
5
+ from pt_variety_identifier.src.delexicalizer import Delexicalizer
6
+ import pandas as pd
7
+
8
+
9
+ class Data:
10
+ def __init__(self, dataset_name, test_set_list) -> None:
11
+ self._DOMAINS = ['journalistic', 'literature',
12
+ 'legal', 'politics', 'web', 'social_media']
13
+
14
+ self.dataset_name = dataset_name
15
+ self.test_set_list = test_set_list
16
+
17
+ def balance_dataset(self, dataset):
18
+ df_dataset = pd.DataFrame(
19
+ {'text': dataset['text'], 'label': dataset['label']})
20
+
21
+ logging.info(
22
+ f"Classe Balance Before Undersampling: {df_dataset['label'].value_counts()}")
23
+
24
+ rus = RandomUnderSampler(random_state=42)
25
+
26
+ X_res, y_res = rus.fit_resample(
27
+ df_dataset['text'].to_numpy().reshape(-1, 1), df_dataset['label'].to_numpy())
28
+
29
+ df_dataset = pd.DataFrame({'text': X_res.reshape(-1), 'label': y_res})
30
+
31
+ logging.info(
32
+ f"Classe Balance After Undersampling: {df_dataset['label'].value_counts()}")
33
+
34
+ return Dataset.from_pandas(df_dataset)
35
+
36
+ def _load_domain_all(self, balance):
37
+ dataset_return = None
38
+
39
+ for domain in self._DOMAINS:
40
+ dataset = load_dataset(self.dataset_name, domain, split='train')
41
+
42
+ if balance:
43
+ logging.info(f"Balancing Training Dataset {domain}")
44
+ dataset = self.balance_dataset(dataset)
45
+
46
+ if dataset_return is None:
47
+ dataset_return = dataset
48
+ else:
49
+ dataset_return = concatenate_datasets(
50
+ [dataset_return, dataset])
51
+
52
+ return dataset_return
53
+
54
+ def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
55
+
56
+ logging.info(f"Loading {domain} dataset")
57
+
58
+ if domain == 'all':
59
+ dataset = self._load_domain_all(balance)
60
+ else:
61
+ dataset = load_dataset(self.dataset_name, domain, split='train')
62
+
63
+ dataset = dataset.shuffle(seed=42)
64
+
65
+ if balance:
66
+ logging.info("Balancing Training Dataset")
67
+ dataset = self.balance_dataset(dataset)
68
+
69
+ if sample_size != None:
70
+ logging.info("Sampling Training Dataset")
71
+ dataset = dataset.shuffle(
72
+ seed=42).select(range(sample_size))
73
+
74
+ df_train = dataset.to_pandas()
75
+
76
+ if pos_prob and ner_prob:
77
+ delexicalizer = Delexicalizer(pos_prob, ner_prob)
78
+
79
+ logging.info("Delexicalizing Training Dataset")
80
+
81
+ df_train['text'] = df_train['text'].progress_apply(
82
+ delexicalizer.delexicalize)
83
+
84
+ return Dataset.from_pandas(df_train)
85
+
86
+ def load_validation_set(self):
87
+ dataset_return = {}
88
+
89
+ for domain in self._DOMAINS:
90
+ dataset_return[domain] = load_dataset(
91
+ "LCA-PORVID/portuguese_vid", domain, split='test').shuffle(seed=42)
92
+
93
+ return dataset_return
94
+
95
+ def load_test_set(self, filter_label_2=False):
96
+ dataset_return = {}
97
+
98
+ for test_set in self.test_set_list:
99
+ dataset_return[test_set] = load_dataset(test_set, split='test')
100
+
101
+ if filter_label_2:
102
+ logging.info("Filtering label 2 from test set")
103
+ dataset_return[test_set] = dataset_return[test_set].filter(
104
+ lambda example: example['label'] != 2)
105
+
106
+ return dataset_return
pt_variety_identifier/src/delexicalizer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import random
3
+
4
+
5
class Delexicalizer:
    """Stochastically replaces tokens with their POS or NER tag to delexicalize text."""

    def __init__(self, prob_pos_tag, prob_ner_tag, spacy_model="pt_core_news_sm") -> None:
        """
        prob_pos_tag: probability of replacing a (non-entity) token by its POS tag.
        prob_ner_tag: probability of replacing an entity token by its NER label.
        spacy_model: spaCy pipeline name; downloaded on first use if missing.

        Raises ValueError if either probability lies outside [0, 1].
        """
        # BUGFIX: validate the probabilities *before* downloading/loading the
        # spaCy model, so bad arguments fail fast instead of after an
        # expensive (possibly network-bound) model download.
        if prob_pos_tag < 0 or prob_pos_tag > 1:
            raise ValueError("prob_pos_tag must be between 0 and 1")

        if prob_ner_tag < 0 or prob_ner_tag > 1:
            raise ValueError("prob_ner_tag must be between 0 and 1")

        if not spacy_model in spacy.util.get_installed_models():
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model, enable=["parser", "tagger", "ner"])

        self.prob_pos_tag = prob_pos_tag
        self.prob_ner_tag = prob_ner_tag

    def delexicalize(self, text):
        """Return *text* with tokens stochastically replaced by NER/POS tags.

        Entity tokens are replaced with probability prob_ner_tag; other tokens
        with probability prob_pos_tag. The result is whitespace-joined, so the
        original spacing/punctuation layout is not preserved.
        """
        doc = self.nlp(text)

        list_tokens = []

        for token in doc:

            # ent_type > 0 means spaCy tagged this token as part of an entity.
            if token.ent_type > 0 and random.uniform(0, 1) < self.prob_ner_tag:
                list_tokens.append(token.ent_type_)

            # Entity tokens that failed the NER draw can still be POS-replaced here.
            elif random.uniform(0, 1) < self.prob_pos_tag:
                list_tokens.append(token.pos_)

            else:
                list_tokens.append(token.text)

        return ' '.join(list_tokens)
pt_variety_identifier/src/n_grams/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ out/*
2
+ !out/.gitkeep
pt_variety_identifier/src/n_grams/__init__.py ADDED
File without changes
pt_variety_identifier/src/n_grams/data.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.data import Data as DataBase
2
+
3
+
4
class Data(DataBase):
    """Data loader for the n-gram models.

    Extends the base loader so that the per-domain validation sets are
    returned alongside the configured external test sets.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        self._DOMAINS = ['journalistic', 'literature',
                         'legal', 'politics', 'web', 'social_media']

        self.dataset_name = dataset_name
        self.test_set_list = test_set_list

    def load_test_set(self, filter_label_2=False):
        """Return the base test sets merged with the validation sets.

        Validation entries overwrite test entries sharing the same key,
        matching the base dict-assignment behaviour.
        """
        combined = super().load_test_set(filter_label_2)
        combined.update(self.load_validation_set())
        return combined
pt_variety_identifier/src/n_grams/in/.gitkeep ADDED
File without changes
pt_variety_identifier/src/n_grams/in/best_params.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "literature": {
3
+ "tfidf": {
4
+ "tfidf__ngram_range": [
5
+ 1,
6
+ 4
7
+ ],
8
+ "tfidf__max_features": 50000,
9
+ "tfidf__lowercase": false,
10
+ "tfidf__analyzer": "char"
11
+ }
12
+ },
13
+ "legal": {
14
+ "tfidf": {
15
+ "tfidf__ngram_range": [
16
+ 1,
17
+ 3
18
+ ],
19
+ "tfidf__max_features": 50000,
20
+ "tfidf__lowercase": false,
21
+ "tfidf__analyzer": "word"
22
+ }
23
+ },
24
+ "politics": {
25
+ "tfidf": {
26
+ "tfidf__ngram_range": [
27
+ 1,
28
+ 1
29
+ ],
30
+ "tfidf__max_features": 50000,
31
+ "tfidf__lowercase": true,
32
+ "tfidf__analyzer": "word"
33
+ }
34
+ },
35
+ "web": {
36
+ "tfidf": {
37
+ "tfidf__ngram_range": [
38
+ 1,
39
+ 1
40
+ ],
41
+ "tfidf__max_features": 10000,
42
+ "tfidf__lowercase": true,
43
+ "tfidf__analyzer": "word"
44
+ }
45
+ },
46
+ "social_media": {
47
+ "tfidf": {
48
+ "tfidf__ngram_range": [
49
+ 1,
50
+ 1
51
+ ],
52
+ "tfidf__max_features": 500,
53
+ "tfidf__lowercase": false,
54
+ "tfidf__analyzer": "word"
55
+ }
56
+ },
57
+ "journalistic": {
58
+ "tfidf": {
59
+ "tfidf__ngram_range": [
60
+ 1,
61
+ 2
62
+ ],
63
+ "tfidf__max_features": 10000,
64
+ "tfidf__lowercase": false,
65
+ "tfidf__analyzer": "word"
66
+ }
67
+ },
68
+ "all": {
69
+ "tfidf": {
70
+ "tfidf__ngram_range": [
71
+ 1,
72
+ 3
73
+ ],
74
+ "tfidf__max_features": 50000,
75
+ "tfidf__lowercase": false,
76
+ "tfidf__analyzer": "word"
77
+ }
78
+ }
79
+ }
pt_variety_identifier/src/n_grams/in/params.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tfidf__max_features": [
3
+ 100,
4
+ 500,
5
+ 1000,
6
+ 5000,
7
+ 10000,
8
+ 50000,
9
+ 100000
10
+ ],
11
+ "tfidf__ngram_range": [
12
+ [
13
+ 1,
14
+ 1
15
+ ],
16
+ [
17
+ 1,
18
+ 2
19
+ ],
20
+ [
21
+ 1,
22
+ 3
23
+ ],
24
+ [
25
+ 1,
26
+ 4
27
+ ],
28
+ [
29
+ 1,
30
+ 5
31
+ ],
32
+ [
33
+ 1,
34
+ 10
35
+ ]
36
+ ],
37
+ "tfidf__lowercase": [
38
+ true,
39
+ false
40
+ ],
41
+ "tfidf__analyzer": [
42
+ "word",
43
+ "char"
44
+ ]
45
+ }
pt_variety_identifier/src/n_grams/in/params1.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+
3
+ "tfidf__max_features": [
4
+ 100,
5
+ 500,
6
+ 1000,
7
+ 5000,
8
+ 10000,
9
+ 50000,
10
+ 100000
11
+ ],
12
+ "tfidf__ngram_range": [
13
+ [
14
+ 1,
15
+ 1
16
+ ],
17
+ [
18
+ 1,
19
+ 2
20
+ ],
21
+ [
22
+ 1,
23
+ 3
24
+ ],
25
+ [
26
+ 1,
27
+ 4
28
+ ],
29
+ [
30
+ 1,
31
+ 5
32
+ ],
33
+ [
34
+ 1,
35
+ 10
36
+ ]
37
+ ],
38
+ "tfidf__lowercase": [
39
+ true,
40
+ false
41
+ ],
42
+ "tfidf__analyzer": [
43
+ "word",
44
+ "char",
45
+ "char_wb"
46
+ ]
47
+ }
pt_variety_identifier/src/n_grams/main.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from time import time
3
+ import json
4
+ from pt_variety_identifier.src.n_grams.data import Data
5
+ from pt_variety_identifier.src.n_grams.results import Results
6
+ from pt_variety_identifier.src.n_grams.trainer import Trainer
7
+ from pt_variety_identifier.src.n_grams.tester import Tester
8
+ from tqdm import tqdm
9
+ from pt_variety_identifier.src.utils import setup_logger, create_output_dir
10
+ from pt_variety_identifier.src.tunning import Tunning
11
+ import logging
12
+ from joblib import dump, load
13
+ from pt_variety_identifier.src.n_grams.model import EnsembleIdentfier, LanguageIdentifier
14
+
15
+
16
class Run:
    """Orchestrates tuning, training and testing of the n-gram identifiers.

    All artefacts (logs, models, result files) are written under
    ``out/<timestamp>/`` next to this module.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        self.CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
        self.CURRENT_TIME = str(int(time()))
        self.params = self.load_params()

        create_output_dir(self.CURRENT_PATH, self.CURRENT_TIME)
        setup_logger(self.CURRENT_PATH, self.CURRENT_TIME)

        self.data = Data(dataset_name, test_set_list)

        self._DOMAINS = ['literature', 'journalistic',
                         'legal', 'politics', 'web', 'social_media']

        # Enable progress bar for pandas
        tqdm.pandas()

        self.tuner = Tunning(self.data, self._DOMAINS, Results, Trainer, Tester, sample_size=5_000,
                             CURRENT_PATH=self.CURRENT_PATH, CURRENT_TIME=self.CURRENT_TIME, params=self.params)

    def load_params(self):
        """Load the hyper-parameter grid from ``in/params.json``.

        ``tfidf__ngram_range`` entries are cast from JSON lists to the tuples
        scikit-learn expects. ``FileNotFoundError`` propagates from ``open``
        when the file is missing.
        """
        # Context manager guarantees the handle is closed even if json.load
        # raises; ``open`` itself raises FileNotFoundError if the file is
        # absent (it never returns None, so no explicit check is needed).
        with open(os.path.join(self.CURRENT_PATH, "in", "params.json"),
                  "r", encoding="utf-8") as f:
            dict_obj = json.load(f)

        if 'tfidf__ngram_range' in dict_obj:
            # Cast tfidf__ngram_range to tuple
            for idx, elem in enumerate(dict_obj['tfidf__ngram_range']):
                dict_obj['tfidf__ngram_range'][idx] = tuple(elem)

        return dict_obj

    def tune(self):
        """Run the delexicalization-probability sweep via the tuner."""
        return self.tuner.run()

    def train(self):
        """Train the combined-domain model with the stored best parameters.

        Evaluates on the per-domain validation sets and saves the fitted
        pipeline to ``out/<timestamp>/models/all_model.joblib``.
        """
        with open(os.path.join(self.CURRENT_PATH, "in", "best_params.json"), "r", encoding="utf-8") as f:
            best_params = json.load(f)

        for domain in ['all']:
            logging.info(f"Training {domain} domain")

            data = self.data.load_domain(
                domain, balance=True, pos_prob=None, ner_prob=None)

            validation_dataset_dict = self.data.load_validation_set()

            trainer = Trainer(
                train_dataset=data,
                params=best_params[domain]["tfidf"]
            )

            best_pipeline = trainer.train()

            tester = Tester(
                test_dataset_dict=validation_dataset_dict,
                pipeline=best_pipeline,
                train_domain=domain
            )

            results = tester.test()

            logging.info(f"Results for {domain} domain: {results}")

            logging.info(f"Save Model for {domain} domain")

            dump(best_pipeline, os.path.join(
                self.CURRENT_PATH, "out", self.CURRENT_TIME, "models", f"{domain}_model.joblib"))

    def test(self):
        """Evaluate the saved combined-domain model on the external test sets."""
        test_data = self.data.load_test_set(filter_label_2=True)

        pipeline = load(os.path.join(
            self.CURRENT_PATH, "out", self.CURRENT_TIME, "models", "all_model.joblib"))

        tester = Tester(test_data, pipeline, None)

        results = tester.test()

        logging.info(f"Results for test set: {results}")

    def test_ensemble(self):
        """Evaluate an ensemble over every saved model on the test sets."""
        test_data = self.data.load_test_set(filter_label_2=True)

        ensemble = EnsembleIdentfier(os.path.join(
            self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models"))

        tester = Tester(test_data, ensemble, None)

        results = tester.test()

        logging.info(f"Results for ensemble: {results}")
pt_variety_identifier/src/n_grams/model.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.tokenize import word_tokenize
3
+ from sklearn.pipeline import Pipeline
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.naive_bayes import BernoulliNB
6
+ from sklearn.base import BaseEstimator
7
+ from joblib import load
8
+ import os
9
+ import math
10
+ from tqdm import tqdm
11
+ import logging
12
+
13
+
14
class EnsembleIdentfier(BaseEstimator):
    """Ensemble over per-domain pipelines saved as ``.joblib`` files.

    Every ``.joblib`` file in *models_path* is loaded as one voter; for each
    sample the ensemble returns the label for which any single model is most
    confident.
    """

    def __init__(self, models_path) -> None:
        super().__init__()
        self.models = []

        for filename in os.listdir(models_path):
            if filename.endswith(".joblib"):
                # Fixed: previously logged a placeholder instead of the name.
                logging.info(f"Loading model {filename}")
                model = load(os.path.join(models_path, filename))
                self.models.append(model)

    def _bagging(self, predictions_proba):
        """Return the label (0 or 1) with the highest probability overall.

        *predictions_proba* is one ``predict_proba`` result per model, each
        shaped ``[[p_label0, p_label1]]`` for a single sample. Ties keep the
        earlier candidate (strict ``>``), matching the original behaviour.
        """
        best_prediction = None
        best_proba = -math.inf

        for prediction_proba in predictions_proba:
            for label in (0, 1):
                proba = prediction_proba[0][label]
                if proba > best_proba:
                    best_prediction = label
                    best_proba = proba

        return best_prediction

    def predict(self, X):
        # NOTE(review): delegates to predict_proba, which (despite its name)
        # already returns hard labels — see below.
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return hard label predictions (not probabilities) for each sample.

        Kept under this name for backward compatibility with existing callers.
        """
        final_predictions = []

        for i in tqdm(range(len(X))):
            per_model = [model.predict_proba([X[i]]) for model in self.models]
            final_predictions.append(self._bagging(per_model))

        return final_predictions
59
+
60
+
61
def _tokenize_portuguese(text):
    """Tokenize *text* with NLTK's Portuguese word tokenizer.

    Defined at module level (instead of a lambda inside the pipeline) so a
    fitted pipeline stays picklable and can be persisted with ``joblib.dump``
    — lambdas cannot be pickled, which previously broke model saving.
    """
    return word_tokenize(text, language='portuguese')


class LanguageIdentifier(BaseEstimator):
    """TF-IDF + Bernoulli naive Bayes pipeline for variety identification.

    *params* supplies the vectorizer settings under the keys
    ``tfidf__ngram_range``, ``tfidf__max_features``, ``tfidf__analyzer`` and
    ``tfidf__lowercase``.
    """

    def __init__(self, params: dict) -> None:
        nltk.download("stopwords")
        nltk.download("punkt")

        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                # Picklable named function; the tokenizer is only consulted
                # when analyzer='word'.
                tokenizer=_tokenize_portuguese,
                stop_words=nltk.corpus.stopwords.words('portuguese'),
                ngram_range=(params['tfidf__ngram_range'][0],
                             params['tfidf__ngram_range'][1]),
                max_features=params['tfidf__max_features'],
                analyzer=params['tfidf__analyzer'],
                lowercase=params['tfidf__lowercase']
            )),
            ('clf', BernoulliNB())
        ])

    def fit(self, X, y):
        """Fit the underlying pipeline and return the fitted pipeline."""
        return self.pipeline.fit(X, y)

    def predict(self, X):
        """Predict hard labels for *X*."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for *X*."""
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        """Return the pipeline's mean accuracy on *X* against labels *y*."""
        return self.pipeline.score(X, y)

    def get_params(self, deep=True):
        return self.pipeline.get_params(deep)

    def set_params(self, **params):
        return self.pipeline.set_params(**params)

    def __str__(self) -> str:
        return self.pipeline.__str__()
pt_variety_identifier/src/n_grams/out/.gitkeep ADDED
File without changes
pt_variety_identifier/src/n_grams/results.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.results import Results as BaseResults
2
+ import logging
3
+ import os
4
+ from joblib import dump
5
+ import pandas as pd
6
+
7
+
8
class Results(BaseResults):
    """Result tracker for the n-gram grid searches.

    Whenever a new best cross-domain F1 is observed for a training domain,
    the best estimator and the associated metrics are persisted; every run is
    additionally appended to the catch-all results file.
    """

    def __init__(self, filepath, DOMAINS) -> None:
        super().__init__(filepath, DOMAINS)

    def process(self, cross_domain_f1, train_domain, test_results, grid_results, balance, pos_prob, ner_prob):
        """Record *grid_results* and persist them if they beat the best so far."""
        if cross_domain_f1 > self.best_f1_scores[train_domain]["cross_domain_f1"]:
            logging.info(f"New best f1 score for {train_domain}")

            self.best_f1_scores[train_domain].update({
                "cross_domain_f1": cross_domain_f1,
                "test_results": test_results,
                "params": grid_results.best_params_,
                "balance": balance,
                "pos_prob": pos_prob,
                "ner_prob": ner_prob,
            })

            logging.info(
                f"Saving best cross_domain_f1 scores to file")

            self.best_final_results()

            # Persist the winning estimator for later ensembling/testing.
            with open(os.path.join(self.filepath, "models", f"{train_domain}.joblib"), "wb") as f:
                dump(grid_results.best_estimator_, f)

            self.best_intermediate_results({
                "domain": train_domain,
                "balance": balance,
                "pos_prob": pos_prob,
                "ner_prob": ner_prob,
                "train": {
                    "best_score": grid_results.best_score_,
                },
                "test": {
                    'all': test_results,
                    'cross_domain_f1': cross_domain_f1
                },
                "best_params": grid_results.best_params_
            })

        # Every run (best or not) is appended to the catch-all file.
        self.other_results({
            "domain": train_domain,
            "balance": balance,
            "pos_prob": pos_prob,
            "ner_prob": ner_prob,
            "train": {
                "cv_results": pd.DataFrame(grid_results.cv_results_).to_json()
            },
            "test": test_results,
        })
pt_variety_identifier/src/n_grams/tester.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import evaluate
2
+ import logging
3
+
4
+
5
class Tester:
    """Evaluate a fitted pipeline on a dict of test datasets."""

    def __init__(self, test_dataset_dict, pipeline, train_domain) -> None:
        self.test_dataset_dict = test_dataset_dict
        self.accuracy = evaluate.load("accuracy")
        self.f1 = evaluate.load("f1")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.pipeline = pipeline
        self.train_domain = train_domain

    def _test(self, test_dataset):
        """Return (accuracy, f1, precision, recall) for one dataset."""
        predictions = self.pipeline.predict(test_dataset['text'])
        references = test_dataset['label']

        scores = []
        for metric, key in ((self.accuracy, 'accuracy'), (self.f1, 'f1'),
                            (self.precision, 'precision'), (self.recall, 'recall')):
            scores.append(metric.compute(
                references=references, predictions=predictions)[key])

        return tuple(scores)

    def test(self):
        """Evaluate every dataset in the dict.

        Returns the per-domain metrics dict alone when there is a single
        domain; otherwise returns ``(results, average_f1)`` where the average
        F1 excludes the training domain.
        """
        results = {}

        for domain, dataset in self.test_dataset_dict.items():
            logging.info(f"Testing {domain} domain")

            accuracy, f1, precision, recall = self._test(dataset)

            results[domain] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall
            }

        if len(results) == 1:
            logging.info("Only one domain to test")
            return results

        # Average F1 over every domain except the one trained on.
        cross_domain_scores = [scores['f1'] for name, scores in results.items()
                               if name != self.train_domain]
        average_f1 = sum(cross_domain_scores) / (len(results) - 1)

        return results, average_f1
pt_variety_identifier/src/n_grams/trainer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.naive_bayes import MultinomialNB, BernoulliNB
2
+ from sklearn.model_selection import RandomizedSearchCV
3
+ from sklearn.model_selection import StratifiedKFold
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ import nltk
7
+ from nltk.tokenize import word_tokenize
8
+ import numpy as np
9
+ import logging
10
+ from pt_variety_identifier.src.n_grams.model import LanguageIdentifier
11
+
12
+
13
class Trainer:
    """Fit a LanguageIdentifier on a single training dataset."""

    def __init__(self, train_dataset, params) -> None:
        self.train_dataset = train_dataset
        self.model = LanguageIdentifier(params)

    def train(self):
        """Fit the model on the dataset's 'text'/'label' columns.

        Returns the fitted pipeline object produced by ``model.fit``.
        """
        logging.info("Training model...")

        texts = np.array(self.train_dataset['text'])
        labels = np.array(self.train_dataset['label'])
        fitted_model = self.model.fit(texts, labels)

        logging.info("Training finished!")

        return fitted_model
27
+
28
+
29
+ """
30
+
31
+ class Trainer:
32
+ def __init__(self, train_dataset, params, n_iter=500) -> None:
33
+
34
+ nltk.download("stopwords")
35
+ nltk.download("punkt")
36
+
37
+ self.pipeline = Pipeline([
38
+ ('tfidf', TfidfVectorizer(
39
+ tokenizer=lambda text: word_tokenize(
40
+ text, language='portuguese'),
41
+ stop_words=nltk.corpus.stopwords.words('portuguese')
42
+ )),
43
+ ('clf', BernoulliNB())
44
+ ])
45
+
46
+ self.params = params
47
+ self.n_iter = n_iter
48
+
49
+ self.cv = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
50
+
51
+ self.search = RandomizedSearchCV(
52
+ self.pipeline,
53
+ self.params,
54
+ scoring='f1_macro',
55
+ n_jobs=-1,
56
+ n_iter=self.n_iter,
57
+ cv=self.cv,
58
+ error_score='raise'
59
+ )
60
+
61
+ self.train_dataset = train_dataset
62
+
63
+ def train(self):
64
+ logging.info("Training model...")
65
+
66
+ results = self.search.fit(
67
+ np.array(self.train_dataset['text']), np.array(self.train_dataset['label']))
68
+
69
+ logging.info("Training finished!")
70
+
71
+ return results, results.best_estimator_
72
+
73
+ """
pt_variety_identifier/src/results.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+
5
+
6
class Results:
    """Accumulates tuning results and persists them as JSON under *filepath*.

    The output filenames previously read ``*_self.json`` — an artefact of a
    mechanical rename that mangled "results" into "self"; they are corrected
    to ``*_results.json`` here.
    """

    def __init__(self, filepath, DOMAINS) -> None:
        self.filepath = filepath
        self.best_intermediate_results_list = []
        self.other_results_list = []
        self.DOMAINS = DOMAINS

        # Best-score record per domain; -inf so the first real score wins.
        self.best_f1_scores = {
            domain: {
                "cross_domain_f1": -math.inf,
                "params": {},
                "balance": None,
                "pos_prob": None,
                "ner_prob": None
            }
            for domain in self.DOMAINS
        }

    def best_intermediate_results(self, result):
        """Append *result* and rewrite the intermediate-results JSON file."""
        self.best_intermediate_results_list.append(result)

        with open(os.path.join(self.filepath, 'best_intermediate_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.best_intermediate_results_list, f, ensure_ascii=False,
                      indent=4)

    def best_final_results(self):
        """Rewrite the JSON file holding the best scores per domain."""
        with open(os.path.join(self.filepath, 'best_final_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.best_f1_scores, f, ensure_ascii=False, indent=4)

    def other_results(self, result):
        """Append *result* and rewrite the catch-all results JSON file."""
        self.other_results_list.append(result)

        with open(os.path.join(self.filepath, 'other_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.other_results_list, f, ensure_ascii=False,
                      indent=4)
pt_variety_identifier/src/tunning.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import logging
3
+ import os
4
+
5
+
6
class Tunning:
    """Sweep POS/NER delexicalization probabilities across training domains.

    For every (pos_prob, ner_prob, domain) combination the sweep loads a
    balanced, delexicalized sample, trains via the injected Trainer class,
    evaluates with the injected Tester class, and hands the outcome to the
    injected Results tracker.
    """

    def __init__(self, data, domains, Results, Trainer, Tester, sample_size, CURRENT_PATH, CURRENT_TIME, params=None) -> None:
        # Collaborators are injected as classes (not instances) so different
        # model families can reuse the same sweep loop.
        self.data = data
        self.Trainer = Trainer
        self.Tester = Tester
        self._DOMAINS = domains
        self.sample_size = sample_size
        self.CURRENT_PATH = CURRENT_PATH
        self.CURRENT_TIME = CURRENT_TIME

        # Result artefacts are written under out/<timestamp>/.
        self.results = Results(os.path.join(
            self.CURRENT_PATH, "out", str(CURRENT_TIME)), self._DOMAINS)

        self.params = params

    def run(self, start_pos_prob=0.0, stop_pos_prob=1.0, start_ner_prob=0.0, stop_ner_prob=1.0):
        """Run the sweep over both probability ranges in 0.1 steps (inclusive)."""

        logging.info(f"Start pos_prob={start_pos_prob}, stop_pos_prob={stop_pos_prob}")

        test_dataset = self.data.load_test_set()

        # ``stop + 0.1`` makes np.arange include the stop value itself.
        for pos_prob in np.arange(start_pos_prob, stop_pos_prob + 0.1, 0.1):
            for ner_prob in np.arange(start_ner_prob, stop_ner_prob + 0.1, 0.1):
                for domain in self._DOMAINS:
                    logging.info(
                        f"Running {domain} pos_prob={pos_prob}, ner_prob={ner_prob}")

                    dataset = self.data.load_domain(
                        domain, balance=True, pos_prob=pos_prob, ner_prob=ner_prob, sample_size=self.sample_size)

                    trainer = self.Trainer(dataset, self.params)

                    # Trainer is expected to return (search_results, best_model).
                    results, best_model = trainer.train()

                    # NOTE(review): the visible n-grams Tester exposes test(),
                    # not validate() — confirm which Tester class this sweep
                    # is meant to receive.
                    validation_results = self.Tester(
                        test_dataset, best_model, train_domain=domain).validate()

                    logging.info(
                        f"Cross domain f1 score: {validation_results['f1']} | test_results: {validation_results}")

                    self.results.process(validation_results['f1'], domain, validation_results,
                                         results, balance=True, pos_prob=pos_prob, ner_prob=ner_prob)
pt_variety_identifier/src/utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+
4
+
5
+ def setup_logger(CURRENT_PATH, CURRENT_TIME):
6
+ print(
7
+ f"Logging to {os.path.join(CURRENT_PATH, 'out', str(CURRENT_TIME), 'logs', 'log.txt')}")
8
+ logging.basicConfig(filename=os.path.join(CURRENT_PATH, "out", str(CURRENT_TIME), "logs", "log.txt"), filemode='w',
9
+ format='%(asctime)s - %(levelname)s - %(message)s',
10
+ level=logging.INFO)
11
+
12
+
13
+ def create_output_dir(CURRENT_PATH, CURRENT_TIME):
14
+ os.mkdir(os.path.join(CURRENT_PATH,
15
+ "out", str(CURRENT_TIME)))
16
+ os.mkdir(os.path.join(CURRENT_PATH, "out",
17
+ str(CURRENT_TIME), "logs"))
18
+ os.mkdir(os.path.join(CURRENT_PATH, "out",
19
+ str(CURRENT_TIME), "models"))
setup.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
# Runtime dependencies for the pt_variety_identifier package.
INSTALL_REQUIRES = [
    'pandas',
    'datasets',
    'zstandard',
    'clean-text[gpl]',
    'fasttext-langdetect',
    'numpy',
    'tqdm',
    'imbalanced-learn',
    'spacy[cuda11x]',
    'evaluate',
    'nltk',
    'transformers',
    'torch',
]

setup(
    name='pt_variety_identifier',
    version='0.0.1',
    description='Identify the variety of Portuguese used in a text',
    install_requires=INSTALL_REQUIRES,
    packages=find_packages(),
    author='John Doe',
)