import logging

import pandas as pd
from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset
from imblearn.under_sampling import RandomUnderSampler

from pt_variety_identifier.src.delexicalizer import Delexicalizer


class Data:
    """Load, balance and delexicalize the train/validation/test datasets.

    Training data is read from ``dataset_name`` (one configuration per
    domain listed in ``_DOMAINS``); test data is read from every dataset
    named in ``test_set_list``.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        """Store the training dataset name and the list of test set names.

        Args:
            dataset_name: Identifier passed to ``load_dataset`` for the
                training split.
            test_set_list: Iterable of identifiers passed to
                ``load_dataset`` for the test splits.
        """
        # Configuration names used when a per-domain dataset is loaded.
        self._DOMAINS = ['journalistic', 'literature',
                         'legal', 'politics', 'web', 'social_media']
        self.dataset_name = dataset_name
        self.test_set_list = test_set_list

    def balance_dataset(self, dataset):
        """Randomly undersample ``dataset`` so every label is equally frequent.

        Only the ``text`` and ``label`` columns are carried over; any other
        column of ``dataset`` is not present in the result.

        Args:
            dataset: Object indexable by ``'text'`` and ``'label'``.

        Returns:
            A ``Dataset`` built from the undersampled ``text``/``label``
            frame.
        """
        df_dataset = pd.DataFrame(
            {'text': dataset['text'], 'label': dataset['label']})

        logging.info(
            f"Classe Balance Before Undersampling: {df_dataset['label'].value_counts()}")

        # The sampler expects a 2-D feature matrix, hence the (-1, 1)
        # reshape of the single text column. The fixed seed keeps the
        # selection reproducible.
        rus = RandomUnderSampler(random_state=42)
        X_res, y_res = rus.fit_resample(
            df_dataset['text'].to_numpy().reshape(-1, 1),
            df_dataset['label'].to_numpy())

        # Flatten the feature matrix back to a 1-D text column.
        df_dataset = pd.DataFrame({'text': X_res.reshape(-1), 'label': y_res})

        logging.info(
            f"Classe Balance After Undersampling: {df_dataset['label'].value_counts()}")

        return Dataset.from_pandas(df_dataset)

    def _load_domain_all(self, balance):
        """Load the train split of every domain and concatenate them.

        Args:
            balance: When truthy, each domain is undersampled with
                ``balance_dataset`` before concatenation.

        Returns:
            The concatenation of all per-domain train splits, in
            ``_DOMAINS`` order.
        """
        per_domain = []

        for domain in self._DOMAINS:
            dataset = load_dataset(self.dataset_name, domain, split='train')

            if balance:
                logging.info(f"Balancing Training Dataset {domain}")
                dataset = self.balance_dataset(dataset)

            per_domain.append(dataset)

        # A single concatenation yields the same rows in the same order as
        # chaining pairwise concatenations.
        return concatenate_datasets(per_domain)

    def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
        """Load the training data for one domain (or for all of them).

        Args:
            domain: A domain configuration name, or ``'all'`` to load and
                concatenate every domain in ``_DOMAINS``.
            balance: When truthy, undersample so labels are balanced.
            pos_prob: First argument forwarded to ``Delexicalizer``.
            ner_prob: Second argument forwarded to ``Delexicalizer``.
            sample_size: When not ``None``, number of rows kept after a
                seeded shuffle.

        Returns:
            A ``Dataset`` rebuilt from the (optionally delexicalized)
            pandas frame.
        """
        logging.info(f"Loading {domain} dataset")

        if domain == 'all':
            dataset = self._load_domain_all(balance)
        else:
            dataset = load_dataset(self.dataset_name, domain, split='train')

        dataset = dataset.shuffle(seed=42)

        # NOTE(review): with domain == 'all' and balance truthy, each domain
        # was already balanced inside _load_domain_all, and the
        # concatenation is balanced again here - confirm this is intended.
        if balance:
            logging.info("Balancing Training Dataset")
            dataset = self.balance_dataset(dataset)

        if sample_size is not None:
            logging.info("Sampling Training Dataset")
            dataset = dataset.shuffle(
                seed=42).select(range(sample_size))

        df_train = dataset.to_pandas()

        # NOTE(review): delexicalization is skipped unless BOTH values are
        # truthy, so e.g. pos_prob=0 with ner_prob>0 leaves the text
        # untouched - confirm this gate is intended.
        if pos_prob and ner_prob:
            delexicalizer = Delexicalizer(pos_prob, ner_prob)

            logging.info("Delexicalizing Training Dataset")

            texts = df_train['text']
            # progress_apply is only attached to pandas objects once
            # tqdm.pandas() has been called; fall back to plain apply so
            # this method does not depend on that global registration.
            apply_fn = getattr(texts, 'progress_apply', texts.apply)
            df_train['text'] = apply_fn(delexicalizer.delexicalize)

        return Dataset.from_pandas(df_train)

    def load_validation_set(self):
        """Load the shuffled test split of every domain for validation.

        Returns:
            A dict mapping each domain name in ``_DOMAINS`` to its shuffled
            ``test`` split.
        """
        dataset_return = {}

        for domain in self._DOMAINS:
            # NOTE(review): the repository id is hard-coded here rather than
            # taken from self.dataset_name - confirm this is deliberate.
            dataset_return[domain] = load_dataset(
                "LCA-PORVID/portuguese_vid", domain, split='test').shuffle(seed=42)

        return dataset_return

    def load_test_set(self, filter_label_2=False):
        """Load the test split of every dataset in ``test_set_list``.

        Args:
            filter_label_2: When truthy, rows whose ``label`` equals 2 are
                removed from every test set.

        Returns:
            A dict mapping each test set name to its ``test`` split.
        """
        dataset_return = {}

        for test_set in self.test_set_list:
            dataset_return[test_set] = load_dataset(test_set, split='test')

            if filter_label_2:
                logging.info("Filtering label 2 from test set")
                dataset_return[test_set] = dataset_return[test_set].filter(
                    lambda example: example['label'] != 2)

        return dataset_return