LCA-PORVID committed on
Commit
ebdb5af
·
verified ·
1 Parent(s): 0832b34

Upload 34 files

Browse files
Files changed (32) hide show
  1. .gitignore +195 -0
  2. pt_variety_identifier/.gitignore +1 -0
  3. pt_variety_identifier/__init__.py +0 -0
  4. pt_variety_identifier/src/__init__.py +0 -0
  5. pt_variety_identifier/src/bert/.gitignore +3 -0
  6. pt_variety_identifier/src/bert/data.py +53 -0
  7. pt_variety_identifier/src/bert/in/.gitkeep +0 -0
  8. pt_variety_identifier/src/bert/main.py +177 -0
  9. pt_variety_identifier/src/bert/model.py +51 -0
  10. pt_variety_identifier/src/bert/out/.gitkeep +0 -0
  11. pt_variety_identifier/src/bert/results.py +35 -0
  12. pt_variety_identifier/src/bert/tester.py +166 -0
  13. pt_variety_identifier/src/bert/trainer.py +108 -0
  14. pt_variety_identifier/src/data.py +106 -0
  15. pt_variety_identifier/src/delexicalizer.py +38 -0
  16. pt_variety_identifier/src/n_grams/.gitignore +2 -0
  17. pt_variety_identifier/src/n_grams/__init__.py +0 -0
  18. pt_variety_identifier/src/n_grams/data.py +20 -0
  19. pt_variety_identifier/src/n_grams/in/.gitkeep +0 -0
  20. pt_variety_identifier/src/n_grams/in/best_params.json +79 -0
  21. pt_variety_identifier/src/n_grams/in/params.json +45 -0
  22. pt_variety_identifier/src/n_grams/in/params1.json +47 -0
  23. pt_variety_identifier/src/n_grams/main.py +121 -0
  24. pt_variety_identifier/src/n_grams/model.py +99 -0
  25. pt_variety_identifier/src/n_grams/out/.gitkeep +0 -0
  26. pt_variety_identifier/src/n_grams/results.py +56 -0
  27. pt_variety_identifier/src/n_grams/tester.py +57 -0
  28. pt_variety_identifier/src/n_grams/trainer.py +73 -0
  29. pt_variety_identifier/src/results.py +41 -0
  30. pt_variety_identifier/src/tunning.py +47 -0
  31. pt_variety_identifier/src/utils.py +19 -0
  32. setup.py +24 -0
.gitignore ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ ### VisualStudioCode ###
177
+ .vscode/*
178
+ !.vscode/settings.json
179
+ !.vscode/tasks.json
180
+ !.vscode/launch.json
181
+ !.vscode/extensions.json
182
+ !.vscode/*.code-snippets
183
+
184
+ # Local History for Visual Studio Code
185
+ .history/
186
+
187
+ # Built Visual Studio Code Extensions
188
+ *.vsix
189
+
190
+ ### VisualStudioCode Patch ###
191
+ # Ignore all local history of files
192
+ .history
193
+ .ionide
194
+
195
+ # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode
pt_variety_identifier/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ thrash/*
pt_variety_identifier/__init__.py ADDED
File without changes
pt_variety_identifier/src/__init__.py ADDED
File without changes
pt_variety_identifier/src/bert/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.txt
2
+ *.pt
3
+ *.json
pt_variety_identifier/src/bert/data.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import BertTokenizerFast
3
+ from pt_variety_identifier.src.data import Data as BaseData
4
+ from torch.utils.data import DataLoader
5
+
6
+
7
class Data(BaseData):
    """BERT-specific data access: tokenizes the base datasets and wraps them in DataLoaders."""

    def __init__(self, dataset_name, tokenizer_name, batch_size, test_set_list):
        super().__init__(dataset_name=dataset_name, test_set_list=test_set_list)

        self.tokenizer_name = tokenizer_name
        self.tokenizer = BertTokenizerFast.from_pretrained(self.tokenizer_name)
        self.batch_size = batch_size

    def _tokenize(self, example):
        # Fixed-length (512) encoding so every batch stacks cleanly.
        return self.tokenizer(example['text'], padding='max_length', truncation=True, max_length=512)

    def _adapt_dataset(self, dataset):
        """Tokenize *dataset* and return a shuffling DataLoader yielding torch tensors."""
        tokenized = dataset.map(self._tokenize, batched=True)

        # Only the model inputs and the label are materialized as tensors.
        tokenized.set_format(type='torch', columns=[
            'input_ids', 'attention_mask', 'label'])

        return DataLoader(tokenized, batch_size=self.batch_size, shuffle=True)

    def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
        """Load one training domain via the base class, then adapt it for BERT."""
        raw = super().load_domain(domain=domain, balance=balance,
                                  pos_prob=pos_prob, ner_prob=ner_prob, sample_size=sample_size)

        return self._adapt_dataset(raw)

    def load_validation_set(self):
        """Return {domain: DataLoader} for every validation domain."""
        return {domain: self._adapt_dataset(ds)
                for domain, ds in super().load_validation_set().items()}

    def load_test_set(self, filter_label_2=False):
        """Return the adapted external test sets merged with the validation loaders."""
        adapted = {name: self._adapt_dataset(ds)
                   for name, ds in super().load_test_set(filter_label_2).items()}

        # Validation domains are evaluated alongside the external test sets.
        adapted.update(self.load_validation_set())

        return adapted
pt_variety_identifier/src/bert/in/.gitkeep ADDED
File without changes
pt_variety_identifier/src/bert/main.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import os
3
+ import time
4
+ from pt_variety_identifier.src.utils import setup_logger, create_output_dir
5
+ from pt_variety_identifier.src.bert.data import Data
6
+ from tqdm import tqdm
7
+ from pt_variety_identifier.src.tunning import Tunning
8
+ from pt_variety_identifier.src.bert.trainer import Trainer
9
+ from pt_variety_identifier.src.bert.tester import Tester
10
+ from pt_variety_identifier.src.bert.results import Results
11
+ from pt_variety_identifier.src.bert.model import EnsembleIdentfier, LanguageIdentfier
12
+ import torch.multiprocessing as mp
13
+ from threading import Thread
14
+ import logging
15
+ import numpy as np
16
+
17
class Run:
    """End-to-end driver: hyper-parameter tuning, per-domain training, and testing
    of the BERT-based Portuguese-variety identifiers.

    GPU work is fanned out over threads; a semaphore with one slot per visible
    GPU plus the `gpus_free` list of idle device indices does the scheduling.
    """

    def __init__(self, dataset_name, tokenizer_name, model_name, batch_size, test_set_list) -> None:
        self.CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
        self.CURRENT_TIME = int(time.time())

        # One semaphore slot per visible GPU; gpus_free holds idle device indices.
        self.num_gpus = torch.cuda.device_count()
        self.sem = mp.Semaphore(self.num_gpus)
        self.gpus_free = [i for i in range(self.num_gpus)]

        self.test_set_list = test_set_list

        create_output_dir(self.CURRENT_PATH, self.CURRENT_TIME)

        setup_logger(self.CURRENT_PATH, self.CURRENT_TIME)

        self.data = Data(
            dataset_name, tokenizer_name=tokenizer_name, batch_size=batch_size, test_set_list=test_set_list)

        self._DOMAINS = ['literature', 'legal', 'politics', 'web', 'social_media', 'journalistic']

        self.model_name = model_name

        # Enables progress_apply on pandas objects (used during delexicalization).
        tqdm.pandas()

    def tune_with_gpu(self):
        """Grid-search delexicalization probabilities, one (pos, ner) pair per GPU."""
        threads = []

        # BUGFIX: the original wrapped np.arange in range(), which raises
        # TypeError (range() needs an int) -- iterate the arange directly.
        for pos_prob in tqdm(np.arange(0.0, 1.0, 0.1)):
            for ner_prob in tqdm(np.arange(0.0, 1.0, 0.2)):

                # Guard against float-accumulation noise from arange.
                pos_prob = round(pos_prob, 2)
                ner_prob = round(ner_prob, 2)

                # Block until a GPU slot frees up, then claim an idle device.
                self.sem.acquire()

                gpu_in_use = self.gpus_free.pop()

                tuner = Tunning(self.data, self._DOMAINS,
                                Results, Trainer, Tester, 5_000,
                                self.CURRENT_PATH, self.CURRENT_TIME,
                                params={
                                    'epochs': 30,
                                    'early_stoping': 5,
                                    'model_name': self.model_name,
                                    'device': f"cuda:{gpu_in_use}",
                                    'sem': self.sem,
                                    'gpus_free': self.gpus_free,
                                })

                thread = Thread(target=tuner.run, args=(
                    pos_prob, pos_prob, ner_prob, ner_prob), daemon=True
                )

                threads.append(thread)

                # BUGFIX: threads were created but never started, so the
                # join() below raised RuntimeError and no tuning ever ran.
                thread.start()

        for t in threads:
            t.join()

    def tune_with_cpu(self):
        """Single-process fallback tuning when no GPU is available."""
        tuner = Tunning(self.data, self._DOMAINS,
                        Results, Trainer, Tester, 5_000,
                        self.CURRENT_PATH, self.CURRENT_TIME,
                        params={
                            'epochs': 30,
                            'early_stoping': 5,
                            'model_name': self.model_name,
                            'device': 'cpu',
                        })

        tuner.run()

    def tune(self):
        """Dispatch to GPU tuning when CUDA is available, CPU otherwise."""
        if torch.cuda.is_available():
            return self.tune_with_gpu()

        return self.tune_with_cpu()

    def _train_domain(self, domain, gpu):
        """Train one domain on device string *gpu* (e.g. 'cuda:0'), then release the slot."""
        logging.info(f"Training {domain} domain")

        data = self.data.load_domain(domain, balance=True, pos_prob=None, ner_prob=None)

        validation_dataset_dict = self.data.load_validation_set()

        trainer = Trainer(data, params={
            'epochs': 30,
            'early_stoping': 5,
            'model_name': self.model_name,
            'device': gpu,
            'CURRENT_PATH': self.CURRENT_PATH,
            'CURRENT_TIME': self.CURRENT_TIME,
            'training_domain': domain,
        }, validation_dataset_dict=validation_dataset_dict)

        best_results = trainer.train()

        logging.info(f"Best results for {domain} domain: {best_results}")

        # BUGFIX: the original appended gpu[-1] (a single character string) to
        # gpus_free, which holds ints and is wrong for device indices >= 10.
        gpu_index = int(gpu.split(':', 1)[1])

        logging.info(f"Freeing cuda:{gpu_index}")

        self.gpus_free.append(gpu_index)

        return self.sem.release()

    def train(self):
        """Train the combined 'all'-domains model (one thread per claimed GPU)."""
        threads = []

        for domain in ['all']:
            self.sem.acquire()

            gpu_in_use = self.gpus_free.pop()

            thread = Thread(target=self._train_domain, args=(domain, f"cuda:{gpu_in_use}"), daemon=True)

            threads.append(thread)

            thread.start()

        for t in threads:
            t.join()

    def test(self):
        """Evaluate the saved 'all' checkpoint on the external test sets."""
        model = LanguageIdentfier(self.model_name)

        logging.info(f"Loading model from {os.path.join(self.CURRENT_PATH, 'out', str(self.CURRENT_TIME), 'models', 'all.pt')}")

        model.load_state_dict(torch.load(os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models", "all.pt")))

        model.eval()
        model.to('cuda')

        data = self.data.load_test_set(filter_label_2=True)

        tester = Tester(data, model, None)

        results = tester.validate()

        logging.info(f"Results for all: {results}")

    def test_ensemble(self):
        """Evaluate the bagged ensemble of every saved checkpoint on the test sets."""
        data = self.data.load_test_set(filter_label_2=True)

        ensemble = EnsembleIdentfier(os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models"), self.model_name)

        tester = Tester(data, ensemble, None)

        results = tester.test()

        logging.info(f"Results for ensemble: {results}")
pt_variety_identifier/src/bert/model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import BertModel
3
+ import os
4
+
5
class EnsembleIdentfier(torch.nn.Module):
    """Bagging ensemble: loads every *.pt checkpoint in a directory as one
    LanguageIdentfier member and stacks their per-example outputs."""

    def __init__(self, models_path, model_name):
        super().__init__()
        self.model_name = model_name

        self.models = torch.nn.ModuleList()

        # Each *.pt checkpoint under models_path becomes one ensemble member.
        for checkpoint in os.listdir(models_path):
            if not checkpoint.endswith(".pt"):
                continue
            member = LanguageIdentfier(self.model_name)
            member.load_state_dict(torch.load(os.path.join(models_path, checkpoint)))
            member.eval()
            self.models.append(member)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, input_ids, attention_mask):
        """Return a (num_members, batch) tensor of per-member outputs."""
        stacked = torch.zeros(len(self.models), input_ids.shape[0]).to(self.device)

        # Members are shuttled to the device one at a time to bound memory use.
        for idx, member in enumerate(self.models):
            member.to(self.device)
            stacked[idx] = member(input_ids, attention_mask=attention_mask).squeeze(dim=1)
            member.cpu()

        return stacked
31
+
32
+
33
class LanguageIdentfier(torch.nn.Module):
    """BERT encoder + dropout + single-unit sigmoid head for binary variety classification."""

    def __init__(self, model_name):
        super().__init__()
        self.model = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(self.model.config.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid probabilities of shape (batch, 1)."""
        encoder_out = self.model(input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is the pooled representation.
        pooled = self.dropout(encoder_out[1])
        return self.sigmoid(self.linear(pooled))
pt_variety_identifier/src/bert/out/.gitkeep ADDED
File without changes
pt_variety_identifier/src/bert/results.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.results import Results as BaseResults
2
+ import logging
3
+
4
class Results(BaseResults):
    """BERT flavour of Results: tracks the best cross-domain F1 per training domain."""

    def __init__(self, filepath, DOMAINS) -> None:
        super().__init__(filepath, DOMAINS)

    def process(self, cross_domain_f1, train_domain, test_results, train_results, balance, pos_prob, ner_prob):
        """Record this run if *cross_domain_f1* beats the best seen for *train_domain*."""
        best = self.best_f1_scores[train_domain]

        # Guard clause: nothing to do unless this run is a new best.
        if cross_domain_f1 <= best["cross_domain_f1"]:
            return

        logging.info(f"New best f1 score for {train_domain}")

        best["cross_domain_f1"] = cross_domain_f1
        best["test_results"] = test_results
        best["balance"] = balance
        best["pos_prob"] = pos_prob
        best["ner_prob"] = ner_prob

        logging.info(
            f"Saving best cross_domain_f1 scores to file")

        self.best_final_results()

        #TODO: Save PyTorch model

        self.best_intermediate_results({
            "domain": train_domain,
            "balance": balance,
            "pos_prob": pos_prob,
            "ner_prob": ner_prob,
            "train": train_results,
            "test": {
                'all': test_results,
                'cross_domain_f1': cross_domain_f1
            }
        })
pt_variety_identifier/src/bert/tester.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import evaluate
3
+ from tqdm import tqdm
4
+ import logging
5
+
6
class Tester:
    """Evaluates a variety-identifier model over a dict of name -> DataLoader.

    `validate` averages metrics across the non-training domains (used during
    training); `test` additionally bags per-member logits before thresholding
    (ensemble evaluation).
    """

    def __init__(self, test_dataset_dict, model, train_domain) -> None:
        self.test_dataset_dict = test_dataset_dict
        self.model = model
        self.train_domain = train_domain

        self.accuracy = evaluate.load("accuracy")
        self.f1 = evaluate.load("f1")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        # Model outputs sigmoid probabilities, hence BCELoss (not BCEWithLogits).
        self.loss_fn = torch.nn.BCELoss()

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

    def _evaluate(self, test_dataset, bagging=False):
        """Shared metric loop for one DataLoader.

        Returns (accuracy, f1, precision, recall, mean_loss). When *bagging*
        is True the model output is averaged over its first axis (ensemble
        members) before loss/thresholding.

        REFACTOR: the original `_validate` and `_test` were near-identical
        copies of this loop differing only in the bagging step.
        """
        with torch.no_grad():
            total_loss = 0

            for batch in tqdm(test_dataset):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                logits = self.model(input_ids, attention_mask=attention_mask).squeeze(dim=1)

                if bagging:
                    logits = self._bagging(logits)

                loss = self.loss_fn(logits, labels.float())

                # Probabilities above 0.5 are class 1; move off-GPU for metrics.
                predictions = (logits > 0.5).long().cpu()
                labels = labels.cpu()

                # add_batch mutates the metric accumulators in place and
                # returns None (the original uselessly bound its result).
                self.accuracy.add_batch(predictions=predictions, references=labels)
                self.f1.add_batch(predictions=predictions, references=labels)
                self.precision.add_batch(predictions=predictions, references=labels)
                self.recall.add_batch(predictions=predictions, references=labels)

                total_loss += loss.item()

        accuracy = self.accuracy.compute()['accuracy']
        f1 = self.f1.compute()['f1']
        precision = self.precision.compute()['precision']
        recall = self.recall.compute()['recall']

        return accuracy, f1, precision, recall, total_loss / len(test_dataset)

    def _validate(self, test_dataset):
        # Kept for backward compatibility: plain (non-bagged) evaluation.
        return self._evaluate(test_dataset, bagging=False)

    def validate(self):
        """Evaluate every domain, drop the training domain, and average the rest.

        NOTE(review): returns a bare `results` dict when nothing remains after
        dropping the training domain, but a (results, average_results) tuple
        otherwise -- callers must handle both shapes; verify against Trainer,
        which appears to index a single dict.
        """
        self.model.eval()
        self.model.to(self.device)

        results = {}
        average_results = {}

        for domain in self.test_dataset_dict.keys():
            logging.info(f"Testing {domain} domain...")
            accuracy, f1, precision, recall, total_loss = self._validate(self.test_dataset_dict[domain])

            results[domain] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'loss': total_loss
            }

        # The training domain does not count towards cross-domain metrics.
        if self.train_domain in results.keys():
            results.pop(self.train_domain)

        if len(results.keys()) == 0:
            logging.info("Only one domain to test, returning results")
            return results

        # Average each metric over the remaining (cross) domains.
        for metric in ['accuracy', 'f1', 'precision', 'recall', 'loss']:
            average_results[metric] = sum([results[domain][metric] for domain in results.keys()]) / len(results.keys())

        return results, average_results

    # Migrate this method to Model
    def _bagging(self, logits):
        """Average per-member logits over the model axis (dim 0)."""
        return torch.mean(logits, dim=0)

    def _test(self, test_dataset):
        # Kept for backward compatibility: bagged (ensemble) evaluation.
        return self._evaluate(test_dataset, bagging=True)

    def test(self):
        """Run bagged evaluation over every configured test set, logging each result."""
        results = {}

        for test_set in self.test_dataset_dict.keys():
            logging.info(f"Testing {test_set} dataset")
            accuracy, f1, precision, recall, total_loss = self._test(self.test_dataset_dict[test_set])

            results[test_set] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall,
                'loss': total_loss
            }

            logging.info(f"Results for {test_set} dataset: {results[test_set]}")

        return results
pt_variety_identifier/src/bert/trainer.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from tqdm import tqdm
3
+ import logging
4
+ from pt_variety_identifier.src.bert.model import LanguageIdentfier
5
+ from pt_variety_identifier.src.bert.tester import Tester
6
+ import math
7
+ import os
8
+
9
+
10
class Trainer:
    """Fine-tunes a LanguageIdentfier, checkpointing on validation metrics.

    Expected keys in ``params``: 'model_name', 'epochs', 'early_stoping',
    'device', 'CURRENT_PATH', 'CURRENT_TIME', and optionally
    'training_domain' (defaults to 'all').
    """

    def __init__(self, train_dataset, params, validation_dataset_dict=None) -> None:
        self.train_dataset = train_dataset

        self.model = LanguageIdentfier(params['model_name'])

        self.epochs = params['epochs']
        self.lr = 1e-5
        # The model already applies a sigmoid, hence BCELoss (not BCEWithLogitsLoss).
        self.loss_fn = torch.nn.BCELoss()
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr)
        self.early_stoping = params['early_stoping']

        # LR is reduced when training loss plateaus; patience is half the
        # early-stopping budget.
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, patience=self.early_stoping//2, verbose=True)

        self.device = params['device']
        self.CURRENT_PATH = params['CURRENT_PATH']
        self.CURRENT_TIME = params['CURRENT_TIME']
        self.training_domain = params['training_domain'] if 'training_domain' in params else 'all'

        self.validator = None

        print(f"Using {self.device} device")

        # Validation is optional; without it train() never saves a checkpoint.
        if validation_dataset_dict:
            self.validator = Tester(
                test_dataset_dict=validation_dataset_dict,
                model=self.model,
                train_domain=self.training_domain,
            )

    def _epoch_iter(self):
        """Run one training epoch and return the mean per-batch loss."""
        self.model.train()
        self.model.to(self.device)
        self.optimizer.zero_grad()

        with torch.enable_grad():
            total_loss = 0

            for batch in tqdm(self.train_dataset):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device, dtype=torch.float)

                # Model emits (batch, 1) probabilities; squeeze to match labels.
                outputs = self.model(
                    input_ids, attention_mask=attention_mask).squeeze(dim=1)
                loss = self.loss_fn(outputs, labels)

                loss.backward()

                self.optimizer.step()
                self.optimizer.zero_grad()

                total_loss += loss.item()

        # Note: the scheduler steps on the *summed* epoch loss, while the
        # method returns the *mean* -- intentional or not, they differ by a
        # constant factor so the plateau behaviour is the same.
        self.scheduler.step(total_loss)

        return total_loss / len(self.train_dataset)

    def train(self):
        """Train up to self.epochs, keeping the checkpoint with the best validation metrics.

        Returns the best validation metrics seen (sentinel +/-inf values if no
        validator was configured).
        """
        logging.info(f"Training model in {self.device}...")

        best_results = {
            'f1': -math.inf,
            'accuracy': -math.inf,
            'precision': -math.inf,
            'recall': -math.inf,
            'loss': math.inf
        }

        for epoch in tqdm(range(self.epochs)):
            training_loss = self._epoch_iter()

            if self.validator:
                # NOTE(review): Tester.validate() returns a
                # (per-domain, averaged) tuple when several validation
                # domains remain, in which case the ['loss'] indexing below
                # would raise -- confirm the intended return shape.
                results = self.validator.validate()

                logging.info(f"Results for {self.training_domain} domain: {results} Epoch: {epoch}")

                # Checkpoint only when BOTH loss and F1 improve simultaneously.
                if results['loss'] < best_results['loss'] and results['f1'] > best_results['f1']:
                    logging.info(
                        f"Saving best model... Domain:{self.training_domain} F1:{results['f1']} and Test Loss:{results['loss']}")

                    best_results['loss'] = results['loss']
                    best_results['accuracy'] = results['accuracy']
                    best_results['f1'] = results['f1']
                    best_results['recall'] = results['recall']
                    best_results['precision'] = results['precision']

                    torch.save(self.model.state_dict(), os.path.join(self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models", f'{self.training_domain}.pt'))
                else:
                    logging.info(f"Not saving model... F1:{results['f1']} and Test Loss:{results['loss']}")

            logging.info(f"Epoch {epoch} Training Loss: {training_loss}")

            # Heuristic hard stop once the training loss is very small.
            if training_loss < 0.1:
                logging.info(f"Training Loss is too low, stoping training...")
                break

        return best_results
pt_variety_identifier/src/data.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset
2
+ import logging
3
+ from imblearn.under_sampling import RandomUnderSampler
4
+ import pandas as pd
5
+ from pt_variety_identifier.src.delexicalizer import Delexicalizer
6
+ import pandas as pd
7
+
8
+
9
+ class Data:
10
+ def __init__(self, dataset_name, test_set_list) -> None:
11
+ self._DOMAINS = ['journalistic', 'literature',
12
+ 'legal', 'politics', 'web', 'social_media']
13
+
14
+ self.dataset_name = dataset_name
15
+ self.test_set_list = test_set_list
16
+
17
+ def balance_dataset(self, dataset):
18
+ df_dataset = pd.DataFrame(
19
+ {'text': dataset['text'], 'label': dataset['label']})
20
+
21
+ logging.info(
22
+ f"Classe Balance Before Undersampling: {df_dataset['label'].value_counts()}")
23
+
24
+ rus = RandomUnderSampler(random_state=42)
25
+
26
+ X_res, y_res = rus.fit_resample(
27
+ df_dataset['text'].to_numpy().reshape(-1, 1), df_dataset['label'].to_numpy())
28
+
29
+ df_dataset = pd.DataFrame({'text': X_res.reshape(-1), 'label': y_res})
30
+
31
+ logging.info(
32
+ f"Classe Balance After Undersampling: {df_dataset['label'].value_counts()}")
33
+
34
+ return Dataset.from_pandas(df_dataset)
35
+
36
+ def _load_domain_all(self, balance):
37
+ dataset_return = None
38
+
39
+ for domain in self._DOMAINS:
40
+ dataset = load_dataset(self.dataset_name, domain, split='train')
41
+
42
+ if balance:
43
+ logging.info(f"Balancing Training Dataset {domain}")
44
+ dataset = self.balance_dataset(dataset)
45
+
46
+ if dataset_return is None:
47
+ dataset_return = dataset
48
+ else:
49
+ dataset_return = concatenate_datasets(
50
+ [dataset_return, dataset])
51
+
52
+ return dataset_return
53
+
54
+ def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
55
+
56
+ logging.info(f"Loading {domain} dataset")
57
+
58
+ if domain == 'all':
59
+ dataset = self._load_domain_all(balance)
60
+ else:
61
+ dataset = load_dataset(self.dataset_name, domain, split='train')
62
+
63
+ dataset = dataset.shuffle(seed=42)
64
+
65
+ if balance:
66
+ logging.info("Balancing Training Dataset")
67
+ dataset = self.balance_dataset(dataset)
68
+
69
+ if sample_size != None:
70
+ logging.info("Sampling Training Dataset")
71
+ dataset = dataset.shuffle(
72
+ seed=42).select(range(sample_size))
73
+
74
+ df_train = dataset.to_pandas()
75
+
76
+ if pos_prob and ner_prob:
77
+ delexicalizer = Delexicalizer(pos_prob, ner_prob)
78
+
79
+ logging.info("Delexicalizing Training Dataset")
80
+
81
+ df_train['text'] = df_train['text'].progress_apply(
82
+ delexicalizer.delexicalize)
83
+
84
+ return Dataset.from_pandas(df_train)
85
+
86
+ def load_validation_set(self):
87
+ dataset_return = {}
88
+
89
+ for domain in self._DOMAINS:
90
+ dataset_return[domain] = load_dataset(
91
+ "LCA-PORVID/portuguese_vid", domain, split='test').shuffle(seed=42)
92
+
93
+ return dataset_return
94
+
95
+ def load_test_set(self, filter_label_2=False):
96
+ dataset_return = {}
97
+
98
+ for test_set in self.test_set_list:
99
+ dataset_return[test_set] = load_dataset(test_set, split='test')
100
+
101
+ if filter_label_2:
102
+ logging.info("Filtering label 2 from test set")
103
+ dataset_return[test_set] = dataset_return[test_set].filter(
104
+ lambda example: example['label'] != 2)
105
+
106
+ return dataset_return
pt_variety_identifier/src/delexicalizer.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import random
3
+
4
+
5
class Delexicalizer:
    """Stochastically replaces tokens with their POS or NER tag to delexicalize text."""

    def __init__(self, prob_pos_tag, prob_ner_tag, spacy_model="pt_core_news_sm") -> None:
        """
        prob_pos_tag: probability of replacing a (non-entity) token by its POS tag.
        prob_ner_tag: probability of replacing an entity token by its NER label.
        spacy_model: spaCy pipeline name; downloaded on first use if missing.

        Raises ValueError if either probability lies outside [0, 1].
        """
        # BUGFIX: validate the probabilities *before* downloading/loading the
        # spaCy model, so bad arguments fail fast instead of after an
        # expensive (possibly network-bound) model download.
        if prob_pos_tag < 0 or prob_pos_tag > 1:
            raise ValueError("prob_pos_tag must be between 0 and 1")

        if prob_ner_tag < 0 or prob_ner_tag > 1:
            raise ValueError("prob_ner_tag must be between 0 and 1")

        if not spacy_model in spacy.util.get_installed_models():
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model, enable=["parser", "tagger", "ner"])

        self.prob_pos_tag = prob_pos_tag
        self.prob_ner_tag = prob_ner_tag

    def delexicalize(self, text):
        """Return *text* with tokens stochastically replaced by NER/POS tags.

        Entity tokens are replaced with probability prob_ner_tag; other tokens
        with probability prob_pos_tag. The result is whitespace-joined, so the
        original spacing/punctuation layout is not preserved.
        """
        doc = self.nlp(text)

        list_tokens = []

        for token in doc:

            # ent_type > 0 means spaCy tagged this token as part of an entity.
            if token.ent_type > 0 and random.uniform(0, 1) < self.prob_ner_tag:
                list_tokens.append(token.ent_type_)

            # Entity tokens that failed the NER draw can still be POS-replaced here.
            elif random.uniform(0, 1) < self.prob_pos_tag:
                list_tokens.append(token.pos_)

            else:
                list_tokens.append(token.text)

        return ' '.join(list_tokens)
pt_variety_identifier/src/n_grams/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ out/*
2
+ !out/.gitkeep
pt_variety_identifier/src/n_grams/__init__.py ADDED
File without changes
pt_variety_identifier/src/n_grams/data.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.data import Data as DataBase
2
+
3
+
4
class Data(DataBase):
    """Data loader for the n-gram models.

    Extends the base loader so that the per-domain validation sets are
    returned alongside the configured external test sets.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        self._DOMAINS = ['journalistic', 'literature',
                         'legal', 'politics', 'web', 'social_media']

        self.dataset_name = dataset_name
        self.test_set_list = test_set_list

    def load_test_set(self, filter_label_2=False):
        """Return the base test sets merged with the validation sets.

        Validation entries overwrite test entries sharing the same key,
        matching the base dict-assignment behaviour.
        """
        combined = super().load_test_set(filter_label_2)
        combined.update(self.load_validation_set())
        return combined
pt_variety_identifier/src/n_grams/in/.gitkeep ADDED
File without changes
pt_variety_identifier/src/n_grams/in/best_params.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "literature": {
3
+ "tfidf": {
4
+ "tfidf__ngram_range": [
5
+ 1,
6
+ 4
7
+ ],
8
+ "tfidf__max_features": 50000,
9
+ "tfidf__lowercase": false,
10
+ "tfidf__analyzer": "char"
11
+ }
12
+ },
13
+ "legal": {
14
+ "tfidf": {
15
+ "tfidf__ngram_range": [
16
+ 1,
17
+ 3
18
+ ],
19
+ "tfidf__max_features": 50000,
20
+ "tfidf__lowercase": false,
21
+ "tfidf__analyzer": "word"
22
+ }
23
+ },
24
+ "politics": {
25
+ "tfidf": {
26
+ "tfidf__ngram_range": [
27
+ 1,
28
+ 1
29
+ ],
30
+ "tfidf__max_features": 50000,
31
+ "tfidf__lowercase": true,
32
+ "tfidf__analyzer": "word"
33
+ }
34
+ },
35
+ "web": {
36
+ "tfidf": {
37
+ "tfidf__ngram_range": [
38
+ 1,
39
+ 1
40
+ ],
41
+ "tfidf__max_features": 10000,
42
+ "tfidf__lowercase": true,
43
+ "tfidf__analyzer": "word"
44
+ }
45
+ },
46
+ "social_media": {
47
+ "tfidf": {
48
+ "tfidf__ngram_range": [
49
+ 1,
50
+ 1
51
+ ],
52
+ "tfidf__max_features": 500,
53
+ "tfidf__lowercase": false,
54
+ "tfidf__analyzer": "word"
55
+ }
56
+ },
57
+ "journalistic": {
58
+ "tfidf": {
59
+ "tfidf__ngram_range": [
60
+ 1,
61
+ 2
62
+ ],
63
+ "tfidf__max_features": 10000,
64
+ "tfidf__lowercase": false,
65
+ "tfidf__analyzer": "word"
66
+ }
67
+ },
68
+ "all": {
69
+ "tfidf": {
70
+ "tfidf__ngram_range": [
71
+ 1,
72
+ 3
73
+ ],
74
+ "tfidf__max_features": 50000,
75
+ "tfidf__lowercase": false,
76
+ "tfidf__analyzer": "word"
77
+ }
78
+ }
79
+ }
pt_variety_identifier/src/n_grams/in/params.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tfidf__max_features": [
3
+ 100,
4
+ 500,
5
+ 1000,
6
+ 5000,
7
+ 10000,
8
+ 50000,
9
+ 100000
10
+ ],
11
+ "tfidf__ngram_range": [
12
+ [
13
+ 1,
14
+ 1
15
+ ],
16
+ [
17
+ 1,
18
+ 2
19
+ ],
20
+ [
21
+ 1,
22
+ 3
23
+ ],
24
+ [
25
+ 1,
26
+ 4
27
+ ],
28
+ [
29
+ 1,
30
+ 5
31
+ ],
32
+ [
33
+ 1,
34
+ 10
35
+ ]
36
+ ],
37
+ "tfidf__lowercase": [
38
+ true,
39
+ false
40
+ ],
41
+ "tfidf__analyzer": [
42
+ "word",
43
+ "char"
44
+ ]
45
+ }
pt_variety_identifier/src/n_grams/in/params1.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+
3
+ "tfidf__max_features": [
4
+ 100,
5
+ 500,
6
+ 1000,
7
+ 5000,
8
+ 10000,
9
+ 50000,
10
+ 100000
11
+ ],
12
+ "tfidf__ngram_range": [
13
+ [
14
+ 1,
15
+ 1
16
+ ],
17
+ [
18
+ 1,
19
+ 2
20
+ ],
21
+ [
22
+ 1,
23
+ 3
24
+ ],
25
+ [
26
+ 1,
27
+ 4
28
+ ],
29
+ [
30
+ 1,
31
+ 5
32
+ ],
33
+ [
34
+ 1,
35
+ 10
36
+ ]
37
+ ],
38
+ "tfidf__lowercase": [
39
+ true,
40
+ false
41
+ ],
42
+ "tfidf__analyzer": [
43
+ "word",
44
+ "char",
45
+ "char_wb"
46
+ ]
47
+ }
pt_variety_identifier/src/n_grams/main.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from time import time
3
+ import json
4
+ from pt_variety_identifier.src.n_grams.data import Data
5
+ from pt_variety_identifier.src.n_grams.results import Results
6
+ from pt_variety_identifier.src.n_grams.trainer import Trainer
7
+ from pt_variety_identifier.src.n_grams.tester import Tester
8
+ from tqdm import tqdm
9
+ from pt_variety_identifier.src.utils import setup_logger, create_output_dir
10
+ from pt_variety_identifier.src.tunning import Tunning
11
+ import logging
12
+ from joblib import dump, load
13
+ from pt_variety_identifier.src.n_grams.model import EnsembleIdentfier, LanguageIdentifier
14
+
15
+
16
class Run:
    """Orchestrates tuning, training and testing of the n-gram identifiers.

    All artefacts (logs, models, result files) are written under
    ``out/<timestamp>/`` next to this module.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        self.CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
        self.CURRENT_TIME = str(int(time()))
        self.params = self.load_params()

        create_output_dir(self.CURRENT_PATH, self.CURRENT_TIME)
        setup_logger(self.CURRENT_PATH, self.CURRENT_TIME)

        self.data = Data(dataset_name, test_set_list)

        self._DOMAINS = ['literature', 'journalistic',
                         'legal', 'politics', 'web', 'social_media']

        # Enable progress bar for pandas
        tqdm.pandas()

        self.tuner = Tunning(self.data, self._DOMAINS, Results, Trainer, Tester, sample_size=5_000,
                             CURRENT_PATH=self.CURRENT_PATH, CURRENT_TIME=self.CURRENT_TIME, params=self.params)

    def load_params(self):
        """Load the hyper-parameter grid from ``in/params.json``.

        ``tfidf__ngram_range`` entries are cast from JSON lists to the tuples
        scikit-learn expects. ``FileNotFoundError`` propagates from ``open``
        when the file is missing.
        """
        # Context manager guarantees the handle is closed even if json.load
        # raises; ``open`` itself raises FileNotFoundError if the file is
        # absent (it never returns None, so no explicit check is needed).
        with open(os.path.join(self.CURRENT_PATH, "in", "params.json"),
                  "r", encoding="utf-8") as f:
            dict_obj = json.load(f)

        if 'tfidf__ngram_range' in dict_obj:
            # Cast tfidf__ngram_range to tuple
            for idx, elem in enumerate(dict_obj['tfidf__ngram_range']):
                dict_obj['tfidf__ngram_range'][idx] = tuple(elem)

        return dict_obj

    def tune(self):
        """Run the delexicalization-probability sweep via the tuner."""
        return self.tuner.run()

    def train(self):
        """Train the combined-domain model with the stored best parameters.

        Evaluates on the per-domain validation sets and saves the fitted
        pipeline to ``out/<timestamp>/models/all_model.joblib``.
        """
        with open(os.path.join(self.CURRENT_PATH, "in", "best_params.json"), "r", encoding="utf-8") as f:
            best_params = json.load(f)

        for domain in ['all']:
            logging.info(f"Training {domain} domain")

            data = self.data.load_domain(
                domain, balance=True, pos_prob=None, ner_prob=None)

            validation_dataset_dict = self.data.load_validation_set()

            trainer = Trainer(
                train_dataset=data,
                params=best_params[domain]["tfidf"]
            )

            best_pipeline = trainer.train()

            tester = Tester(
                test_dataset_dict=validation_dataset_dict,
                pipeline=best_pipeline,
                train_domain=domain
            )

            results = tester.test()

            logging.info(f"Results for {domain} domain: {results}")

            logging.info(f"Save Model for {domain} domain")

            dump(best_pipeline, os.path.join(
                self.CURRENT_PATH, "out", self.CURRENT_TIME, "models", f"{domain}_model.joblib"))

    def test(self):
        """Evaluate the saved combined-domain model on the external test sets."""
        test_data = self.data.load_test_set(filter_label_2=True)

        pipeline = load(os.path.join(
            self.CURRENT_PATH, "out", self.CURRENT_TIME, "models", "all_model.joblib"))

        tester = Tester(test_data, pipeline, None)

        results = tester.test()

        logging.info(f"Results for test set: {results}")

    def test_ensemble(self):
        """Evaluate an ensemble over every saved model on the test sets."""
        test_data = self.data.load_test_set(filter_label_2=True)

        ensemble = EnsembleIdentfier(os.path.join(
            self.CURRENT_PATH, "out", str(self.CURRENT_TIME), "models"))

        tester = Tester(test_data, ensemble, None)

        results = tester.test()

        logging.info(f"Results for ensemble: {results}")
pt_variety_identifier/src/n_grams/model.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.tokenize import word_tokenize
3
+ from sklearn.pipeline import Pipeline
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.naive_bayes import BernoulliNB
6
+ from sklearn.base import BaseEstimator
7
+ from joblib import load
8
+ import os
9
+ import math
10
+ from tqdm import tqdm
11
+ import logging
12
+
13
+
14
class EnsembleIdentfier(BaseEstimator):
    """Ensemble over per-domain pipelines saved as ``.joblib`` files.

    Every ``.joblib`` file in *models_path* is loaded as one voter; for each
    sample the ensemble returns the label for which any single model is most
    confident.
    """

    def __init__(self, models_path) -> None:
        super().__init__()
        self.models = []

        for filename in os.listdir(models_path):
            if filename.endswith(".joblib"):
                # Fixed: previously logged a placeholder instead of the name.
                logging.info(f"Loading model {filename}")
                model = load(os.path.join(models_path, filename))
                self.models.append(model)

    def _bagging(self, predictions_proba):
        """Return the label (0 or 1) with the highest probability overall.

        *predictions_proba* is one ``predict_proba`` result per model, each
        shaped ``[[p_label0, p_label1]]`` for a single sample. Ties keep the
        earlier candidate (strict ``>``), matching the original behaviour.
        """
        best_prediction = None
        best_proba = -math.inf

        for prediction_proba in predictions_proba:
            for label in (0, 1):
                proba = prediction_proba[0][label]
                if proba > best_proba:
                    best_prediction = label
                    best_proba = proba

        return best_prediction

    def predict(self, X):
        # NOTE(review): delegates to predict_proba, which (despite its name)
        # already returns hard labels — see below.
        return self.predict_proba(X)

    def predict_proba(self, X):
        """Return hard label predictions (not probabilities) for each sample.

        Kept under this name for backward compatibility with existing callers.
        """
        final_predictions = []

        for i in tqdm(range(len(X))):
            per_model = [model.predict_proba([X[i]]) for model in self.models]
            final_predictions.append(self._bagging(per_model))

        return final_predictions
59
+
60
+
61
def _tokenize_portuguese(text):
    """Tokenize *text* with NLTK's Portuguese word tokenizer.

    Defined at module level (instead of a lambda inside the pipeline) so a
    fitted pipeline stays picklable and can be persisted with ``joblib.dump``
    — lambdas cannot be pickled, which previously broke model saving.
    """
    return word_tokenize(text, language='portuguese')


class LanguageIdentifier(BaseEstimator):
    """TF-IDF + Bernoulli naive Bayes pipeline for variety identification.

    *params* supplies the vectorizer settings under the keys
    ``tfidf__ngram_range``, ``tfidf__max_features``, ``tfidf__analyzer`` and
    ``tfidf__lowercase``.
    """

    def __init__(self, params: dict) -> None:
        nltk.download("stopwords")
        nltk.download("punkt")

        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                # Picklable named function; the tokenizer is only consulted
                # when analyzer='word'.
                tokenizer=_tokenize_portuguese,
                stop_words=nltk.corpus.stopwords.words('portuguese'),
                ngram_range=(params['tfidf__ngram_range'][0],
                             params['tfidf__ngram_range'][1]),
                max_features=params['tfidf__max_features'],
                analyzer=params['tfidf__analyzer'],
                lowercase=params['tfidf__lowercase']
            )),
            ('clf', BernoulliNB())
        ])

    def fit(self, X, y):
        """Fit the underlying pipeline and return the fitted pipeline."""
        return self.pipeline.fit(X, y)

    def predict(self, X):
        """Predict hard labels for *X*."""
        return self.pipeline.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities for *X*."""
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        """Return the pipeline's mean accuracy on *X* against labels *y*."""
        return self.pipeline.score(X, y)

    def get_params(self, deep=True):
        return self.pipeline.get_params(deep)

    def set_params(self, **params):
        return self.pipeline.set_params(**params)

    def __str__(self) -> str:
        return self.pipeline.__str__()
pt_variety_identifier/src/n_grams/out/.gitkeep ADDED
File without changes
pt_variety_identifier/src/n_grams/results.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pt_variety_identifier.src.results import Results as BaseResults
2
+ import logging
3
+ import os
4
+ from joblib import dump
5
+ import pandas as pd
6
+
7
+
8
class Results(BaseResults):
    """Result tracker for the n-gram grid searches.

    Whenever a new best cross-domain F1 is observed for a training domain,
    the best estimator and the associated metrics are persisted; every run is
    additionally appended to the catch-all results file.
    """

    def __init__(self, filepath, DOMAINS) -> None:
        super().__init__(filepath, DOMAINS)

    def process(self, cross_domain_f1, train_domain, test_results, grid_results, balance, pos_prob, ner_prob):
        """Record *grid_results* and persist them if they beat the best so far."""
        if cross_domain_f1 > self.best_f1_scores[train_domain]["cross_domain_f1"]:
            logging.info(f"New best f1 score for {train_domain}")

            self.best_f1_scores[train_domain].update({
                "cross_domain_f1": cross_domain_f1,
                "test_results": test_results,
                "params": grid_results.best_params_,
                "balance": balance,
                "pos_prob": pos_prob,
                "ner_prob": ner_prob,
            })

            logging.info(
                f"Saving best cross_domain_f1 scores to file")

            self.best_final_results()

            # Persist the winning estimator for later ensembling/testing.
            with open(os.path.join(self.filepath, "models", f"{train_domain}.joblib"), "wb") as f:
                dump(grid_results.best_estimator_, f)

            self.best_intermediate_results({
                "domain": train_domain,
                "balance": balance,
                "pos_prob": pos_prob,
                "ner_prob": ner_prob,
                "train": {
                    "best_score": grid_results.best_score_,
                },
                "test": {
                    'all': test_results,
                    'cross_domain_f1': cross_domain_f1
                },
                "best_params": grid_results.best_params_
            })

        # Every run (best or not) is appended to the catch-all file.
        self.other_results({
            "domain": train_domain,
            "balance": balance,
            "pos_prob": pos_prob,
            "ner_prob": ner_prob,
            "train": {
                "cv_results": pd.DataFrame(grid_results.cv_results_).to_json()
            },
            "test": test_results,
        })
pt_variety_identifier/src/n_grams/tester.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import evaluate
2
+ import logging
3
+
4
+
5
class Tester:
    """Evaluate a fitted pipeline on a dict of test datasets."""

    def __init__(self, test_dataset_dict, pipeline, train_domain) -> None:
        self.test_dataset_dict = test_dataset_dict
        self.accuracy = evaluate.load("accuracy")
        self.f1 = evaluate.load("f1")
        self.precision = evaluate.load("precision")
        self.recall = evaluate.load("recall")
        self.pipeline = pipeline
        self.train_domain = train_domain

    def _test(self, test_dataset):
        """Return (accuracy, f1, precision, recall) for one dataset."""
        predictions = self.pipeline.predict(test_dataset['text'])
        references = test_dataset['label']

        scores = []
        for metric, key in ((self.accuracy, 'accuracy'), (self.f1, 'f1'),
                            (self.precision, 'precision'), (self.recall, 'recall')):
            scores.append(metric.compute(
                references=references, predictions=predictions)[key])

        return tuple(scores)

    def test(self):
        """Evaluate every dataset in the dict.

        Returns the per-domain metrics dict alone when there is a single
        domain; otherwise returns ``(results, average_f1)`` where the average
        F1 excludes the training domain.
        """
        results = {}

        for domain, dataset in self.test_dataset_dict.items():
            logging.info(f"Testing {domain} domain")

            accuracy, f1, precision, recall = self._test(dataset)

            results[domain] = {
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall
            }

        if len(results) == 1:
            logging.info("Only one domain to test")
            return results

        # Average F1 over every domain except the one trained on.
        cross_domain_scores = [scores['f1'] for name, scores in results.items()
                               if name != self.train_domain]
        average_f1 = sum(cross_domain_scores) / (len(results) - 1)

        return results, average_f1
pt_variety_identifier/src/n_grams/trainer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.naive_bayes import MultinomialNB, BernoulliNB
2
+ from sklearn.model_selection import RandomizedSearchCV
3
+ from sklearn.model_selection import StratifiedKFold
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ import nltk
7
+ from nltk.tokenize import word_tokenize
8
+ import numpy as np
9
+ import logging
10
+ from pt_variety_identifier.src.n_grams.model import LanguageIdentifier
11
+
12
+
13
class Trainer:
    """Fit a LanguageIdentifier on a single training dataset."""

    def __init__(self, train_dataset, params) -> None:
        self.train_dataset = train_dataset
        self.model = LanguageIdentifier(params)

    def train(self):
        """Fit the model on the dataset's 'text'/'label' columns.

        Returns the fitted pipeline object produced by ``model.fit``.
        """
        logging.info("Training model...")

        texts = np.array(self.train_dataset['text'])
        labels = np.array(self.train_dataset['label'])
        fitted_model = self.model.fit(texts, labels)

        logging.info("Training finished!")

        return fitted_model
27
+
28
+
29
+ """
30
+
31
+ class Trainer:
32
+ def __init__(self, train_dataset, params, n_iter=500) -> None:
33
+
34
+ nltk.download("stopwords")
35
+ nltk.download("punkt")
36
+
37
+ self.pipeline = Pipeline([
38
+ ('tfidf', TfidfVectorizer(
39
+ tokenizer=lambda text: word_tokenize(
40
+ text, language='portuguese'),
41
+ stop_words=nltk.corpus.stopwords.words('portuguese')
42
+ )),
43
+ ('clf', BernoulliNB())
44
+ ])
45
+
46
+ self.params = params
47
+ self.n_iter = n_iter
48
+
49
+ self.cv = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
50
+
51
+ self.search = RandomizedSearchCV(
52
+ self.pipeline,
53
+ self.params,
54
+ scoring='f1_macro',
55
+ n_jobs=-1,
56
+ n_iter=self.n_iter,
57
+ cv=self.cv,
58
+ error_score='raise'
59
+ )
60
+
61
+ self.train_dataset = train_dataset
62
+
63
+ def train(self):
64
+ logging.info("Training model...")
65
+
66
+ results = self.search.fit(
67
+ np.array(self.train_dataset['text']), np.array(self.train_dataset['label']))
68
+
69
+ logging.info("Training finished!")
70
+
71
+ return results, results.best_estimator_
72
+
73
+ """
pt_variety_identifier/src/results.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+
5
+
6
class Results:
    """Accumulates tuning results and persists them as JSON under *filepath*.

    The output filenames previously read ``*_self.json`` — an artefact of a
    mechanical rename that mangled "results" into "self"; they are corrected
    to ``*_results.json`` here.
    """

    def __init__(self, filepath, DOMAINS) -> None:
        self.filepath = filepath
        self.best_intermediate_results_list = []
        self.other_results_list = []
        self.DOMAINS = DOMAINS

        # Best-score record per domain; -inf so the first real score wins.
        self.best_f1_scores = {
            domain: {
                "cross_domain_f1": -math.inf,
                "params": {},
                "balance": None,
                "pos_prob": None,
                "ner_prob": None
            }
            for domain in self.DOMAINS
        }

    def best_intermediate_results(self, result):
        """Append *result* and rewrite the intermediate-results JSON file."""
        self.best_intermediate_results_list.append(result)

        with open(os.path.join(self.filepath, 'best_intermediate_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.best_intermediate_results_list, f, ensure_ascii=False,
                      indent=4)

    def best_final_results(self):
        """Rewrite the JSON file holding the best scores per domain."""
        with open(os.path.join(self.filepath, 'best_final_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.best_f1_scores, f, ensure_ascii=False, indent=4)

    def other_results(self, result):
        """Append *result* and rewrite the catch-all results JSON file."""
        self.other_results_list.append(result)

        with open(os.path.join(self.filepath, 'other_results.json'), "w", encoding="utf-8") as f:
            json.dump(self.other_results_list, f, ensure_ascii=False,
                      indent=4)
pt_variety_identifier/src/tunning.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import logging
3
+ import os
4
+
5
+
6
class Tunning:
    """Sweep POS/NER delexicalization probabilities across training domains.

    For every (pos_prob, ner_prob, domain) combination the sweep loads a
    balanced, delexicalized sample, trains via the injected Trainer class,
    evaluates with the injected Tester class, and hands the outcome to the
    injected Results tracker.
    """

    def __init__(self, data, domains, Results, Trainer, Tester, sample_size, CURRENT_PATH, CURRENT_TIME, params=None) -> None:
        # Collaborators are injected as classes (not instances) so different
        # model families can reuse the same sweep loop.
        self.data = data
        self.Trainer = Trainer
        self.Tester = Tester
        self._DOMAINS = domains
        self.sample_size = sample_size
        self.CURRENT_PATH = CURRENT_PATH
        self.CURRENT_TIME = CURRENT_TIME

        # Result artefacts are written under out/<timestamp>/.
        self.results = Results(os.path.join(
            self.CURRENT_PATH, "out", str(CURRENT_TIME)), self._DOMAINS)

        self.params = params

    def run(self, start_pos_prob=0.0, stop_pos_prob=1.0, start_ner_prob=0.0, stop_ner_prob=1.0):
        """Run the sweep over both probability ranges in 0.1 steps (inclusive)."""

        logging.info(f"Start pos_prob={start_pos_prob}, stop_pos_prob={stop_pos_prob}")

        test_dataset = self.data.load_test_set()

        # ``stop + 0.1`` makes np.arange include the stop value itself.
        for pos_prob in np.arange(start_pos_prob, stop_pos_prob + 0.1, 0.1):
            for ner_prob in np.arange(start_ner_prob, stop_ner_prob + 0.1, 0.1):
                for domain in self._DOMAINS:
                    logging.info(
                        f"Running {domain} pos_prob={pos_prob}, ner_prob={ner_prob}")

                    dataset = self.data.load_domain(
                        domain, balance=True, pos_prob=pos_prob, ner_prob=ner_prob, sample_size=self.sample_size)

                    trainer = self.Trainer(dataset, self.params)

                    # Trainer is expected to return (search_results, best_model).
                    results, best_model = trainer.train()

                    # NOTE(review): the visible n-grams Tester exposes test(),
                    # not validate() — confirm which Tester class this sweep
                    # is meant to receive.
                    validation_results = self.Tester(
                        test_dataset, best_model, train_domain=domain).validate()

                    logging.info(
                        f"Cross domain f1 score: {validation_results['f1']} | test_results: {validation_results}")

                    self.results.process(validation_results['f1'], domain, validation_results,
                                         results, balance=True, pos_prob=pos_prob, ner_prob=ner_prob)
pt_variety_identifier/src/utils.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+
4
+
5
+ def setup_logger(CURRENT_PATH, CURRENT_TIME):
6
+ print(
7
+ f"Logging to {os.path.join(CURRENT_PATH, 'out', str(CURRENT_TIME), 'logs', 'log.txt')}")
8
+ logging.basicConfig(filename=os.path.join(CURRENT_PATH, "out", str(CURRENT_TIME), "logs", "log.txt"), filemode='w',
9
+ format='%(asctime)s - %(levelname)s - %(message)s',
10
+ level=logging.INFO)
11
+
12
+
13
+ def create_output_dir(CURRENT_PATH, CURRENT_TIME):
14
+ os.mkdir(os.path.join(CURRENT_PATH,
15
+ "out", str(CURRENT_TIME)))
16
+ os.mkdir(os.path.join(CURRENT_PATH, "out",
17
+ str(CURRENT_TIME), "logs"))
18
+ os.mkdir(os.path.join(CURRENT_PATH, "out",
19
+ str(CURRENT_TIME), "models"))
setup.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
# Runtime dependencies for the pt_variety_identifier package.
INSTALL_REQUIRES = [
    'pandas',
    'datasets',
    'zstandard',
    'clean-text[gpl]',
    'fasttext-langdetect',
    'numpy',
    'tqdm',
    'imbalanced-learn',
    'spacy[cuda11x]',
    'evaluate',
    'nltk',
    'transformers',
    'torch',
]

setup(
    name='pt_variety_identifier',
    version='0.0.1',
    description='Identify the variety of Portuguese used in a text',
    install_requires=INSTALL_REQUIRES,
    packages=find_packages(),
    author='John Doe',
)