| from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset | |
| import logging | |
| from imblearn.under_sampling import RandomUnderSampler | |
| import pandas as pd | |
| from pt_variety_identifier.src.delexicalizer import Delexicalizer | |
| import pandas as pd | |
class Data:
    """Load the training, validation and test datasets for one experiment.

    Training data is read per domain from ``dataset_name`` and can optionally
    be class-balanced, sub-sampled and delexicalized. Validation data is read
    per domain from a fixed dataset identifier, and test data is read from the
    identifiers given in ``test_set_list``.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        """Store the dataset identifiers used by the loader methods.

        Args:
            dataset_name: Identifier passed to ``load_dataset`` when loading
                the training split of a domain.
            test_set_list: Iterable of dataset identifiers; each one is
                loaded by ``load_test_set``.
        """
        # Configuration names iterated by the 'all' training loader and by
        # the validation loader.
        self._DOMAINS = ['journalistic', 'literature',
                         'legal', 'politics', 'web', 'social_media']
        self.dataset_name = dataset_name
        self.test_set_list = test_set_list

    @staticmethod
    def _clamp_sample_size(requested, available):
        """Return ``requested`` limited to at most ``available`` rows."""
        if requested > available:
            logging.warning(
                "Requested sample_size %s exceeds dataset size %s; "
                "using the whole dataset", requested, available)
            return available
        return requested

    def balance_dataset(self, dataset):
        """Under-sample ``dataset`` by label with ``RandomUnderSampler``.

        Only the ``text`` and ``label`` columns are read; the returned dataset
        contains just those two columns. The label distribution is logged
        before and after resampling.

        Args:
            dataset: Object indexable by ``'text'`` and ``'label'``.

        Returns:
            A ``Dataset`` built from the resampled text/label pairs.
        """
        df_dataset = pd.DataFrame(
            {'text': dataset['text'], 'label': dataset['label']})

        logging.info(
            f"Classe Balance Before Undersampling: {df_dataset['label'].value_counts()}")

        # The sampler expects a 2-D feature matrix, so the text column is
        # reshaped to a single-column array and flattened again afterwards.
        rus = RandomUnderSampler(random_state=42)
        X_res, y_res = rus.fit_resample(
            df_dataset['text'].to_numpy().reshape(-1, 1),
            df_dataset['label'].to_numpy())

        df_dataset = pd.DataFrame({'text': X_res.reshape(-1), 'label': y_res})

        logging.info(
            f"Classe Balance After Undersampling: {df_dataset['label'].value_counts()}")

        return Dataset.from_pandas(df_dataset)

    def _load_domain_all(self, balance):
        """Load the train split of every domain and concatenate them.

        Args:
            balance: When truthy, each domain is balanced on its own with
                ``balance_dataset`` before concatenation.

        Returns:
            The concatenated dataset, or ``None`` when there are no domains.
        """
        parts = []
        for domain in self._DOMAINS:
            dataset = load_dataset(self.dataset_name, domain, split='train')

            if balance:
                logging.info(f"Balancing Training Dataset {domain}")
                dataset = self.balance_dataset(dataset)

            parts.append(dataset)

        if not parts:
            return None

        # Concatenate once instead of re-concatenating the growing result on
        # every iteration.
        return concatenate_datasets(parts)

    def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
        """Load the training data for one domain (or for all domains).

        The data is shuffled with a fixed seed, then optionally balanced,
        sub-sampled and delexicalized, in that order.

        Args:
            domain: A domain/configuration name, or ``'all'`` to load and
                concatenate every domain in ``self._DOMAINS``.
            balance: When truthy, under-sample with ``balance_dataset``.
            pos_prob: First argument forwarded to ``Delexicalizer``.
            ner_prob: Second argument forwarded to ``Delexicalizer``.
            sample_size: Optional number of rows to keep after a second
                seeded shuffle. Values larger than the dataset are clamped
                to the dataset size.

        Returns:
            A ``Dataset`` rebuilt from the processed pandas frame.
        """
        logging.info(f"Loading {domain} dataset")

        if domain == 'all':
            dataset = self._load_domain_all(balance)
        else:
            dataset = load_dataset(self.dataset_name, domain, split='train')

        dataset = dataset.shuffle(seed=42)

        if balance:
            logging.info("Balancing Training Dataset")
            dataset = self.balance_dataset(dataset)

        if sample_size is not None:
            logging.info("Sampling Training Dataset")
            keep = self._clamp_sample_size(sample_size, len(dataset))
            dataset = dataset.shuffle(seed=42).select(range(keep))

        df_train = dataset.to_pandas()

        # NOTE(review): delexicalization is skipped when EITHER probability is
        # falsy (e.g. 0), not only when both are -- confirm this is intended.
        if pos_prob and ner_prob:
            delexicalizer = Delexicalizer(pos_prob, ner_prob)

            logging.info("Delexicalizing Training Dataset")

            texts = df_train['text']
            # ``progress_apply`` only exists once tqdm's pandas integration
            # has been registered; fall back to plain ``apply`` otherwise.
            apply_fn = getattr(texts, 'progress_apply', texts.apply)
            df_train['text'] = apply_fn(delexicalizer.delexicalize)

        return Dataset.from_pandas(df_train)

    def load_validation_set(self):
        """Load the shuffled ``test`` split of every domain.

        Returns:
            A dict mapping each domain name to its seeded-shuffled dataset.
        """
        # NOTE(review): the dataset identifier is hard-coded here rather than
        # taken from ``self.dataset_name`` -- confirm this is intended.
        return {
            domain: load_dataset(
                "LCA-PORVID/portuguese_vid", domain, split='test').shuffle(seed=42)
            for domain in self._DOMAINS
        }

    def load_test_set(self, filter_label_2=False):
        """Load the ``test`` split of every dataset in ``self.test_set_list``.

        Args:
            filter_label_2: When truthy, rows whose ``label`` equals 2 are
                removed from each test set.

        Returns:
            A dict mapping each test-set identifier to its dataset.
        """
        dataset_return = {}

        for test_set in self.test_set_list:
            dataset = load_dataset(test_set, split='test')

            if filter_label_2:
                logging.info("Filtering label 2 from test set")
                dataset = dataset.filter(
                    lambda example: example['label'] != 2)

            dataset_return[test_set] = dataset

        return dataset_return