from datasets import load_dataset, concatenate_datasets, DatasetDict, Dataset
import logging
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from pt_variety_identifier.src.delexicalizer import Delexicalizer
import pandas as pd
class Data:
    """Load, balance, subsample, and delexicalize variety-ID datasets.

    Wraps the HuggingFace ``datasets`` hub: per-domain training splits of
    ``dataset_name``, per-domain validation splits, and a list of external
    test sets.
    """

    def __init__(self, dataset_name, test_set_list) -> None:
        """
        Args:
            dataset_name: HuggingFace hub name of the training dataset.
            test_set_list: hub names of the external test datasets.
        """
        # Domain configurations expected to exist in the training dataset.
        self._DOMAINS = ['journalistic', 'literature',
                         'legal', 'politics', 'web', 'social_media']
        self.dataset_name = dataset_name
        self.test_set_list = test_set_list

    def balance_dataset(self, dataset):
        """Undersample the majority class(es) to equal label frequencies.

        Args:
            dataset: a ``datasets.Dataset`` with 'text' and 'label' columns.

        Returns:
            A new balanced ``datasets.Dataset``.
        """
        df_dataset = pd.DataFrame(
            {'text': dataset['text'], 'label': dataset['label']})
        logging.info(
            f"Class Balance Before Undersampling: {df_dataset['label'].value_counts()}")
        # Fixed seed keeps the undersampling reproducible across runs.
        rus = RandomUnderSampler(random_state=42)
        # RandomUnderSampler expects a 2-D feature matrix, hence the reshape.
        X_res, y_res = rus.fit_resample(
            df_dataset['text'].to_numpy().reshape(-1, 1), df_dataset['label'].to_numpy())
        df_dataset = pd.DataFrame({'text': X_res.reshape(-1), 'label': y_res})
        logging.info(
            f"Class Balance After Undersampling: {df_dataset['label'].value_counts()}")
        return Dataset.from_pandas(df_dataset)

    def _load_domain_all(self, balance):
        """Load and concatenate the train split of every domain.

        Args:
            balance: when truthy, undersample each domain before merging.

        Returns:
            A single concatenated ``datasets.Dataset`` (None if no domains).
        """
        parts = []
        for domain in self._DOMAINS:
            dataset = load_dataset(self.dataset_name, domain, split='train')
            if balance:
                logging.info(f"Balancing Training Dataset {domain}")
                dataset = self.balance_dataset(dataset)
            parts.append(dataset)
        # Concatenate once instead of pairwise inside the loop, which
        # re-copied the growing dataset on every iteration.
        return concatenate_datasets(parts) if parts else None

    def load_domain(self, domain, balance, pos_prob, ner_prob, sample_size=None):
        """Load a training dataset for one domain (or all of them).

        Args:
            domain: a domain name, or 'all' to concatenate every domain.
            balance: when truthy, undersample to equal label frequencies.
            pos_prob: POS delexicalization probability; delexicalization
                runs only when both pos_prob and ner_prob are truthy.
            ner_prob: NER delexicalization probability.
            sample_size: when given, keep only this many shuffled examples.

        Returns:
            A ``datasets.Dataset`` ready for training.
        """
        logging.info(f"Loading {domain} dataset")
        if domain == 'all':
            # NOTE(review): with balance=True this balances each domain inside
            # _load_domain_all AND again below on the concatenation — confirm
            # the double pass is intentional.
            dataset = self._load_domain_all(balance)
        else:
            dataset = load_dataset(self.dataset_name, domain, split='train')
        dataset = dataset.shuffle(seed=42)
        if balance:
            logging.info("Balancing Training Dataset")
            dataset = self.balance_dataset(dataset)
        if sample_size is not None:
            logging.info("Sampling Training Dataset")
            dataset = dataset.shuffle(
                seed=42).select(range(sample_size))
        df_train = dataset.to_pandas()
        if pos_prob and ner_prob:
            delexicalizer = Delexicalizer(pos_prob, ner_prob)
            logging.info("Delexicalizing Training Dataset")
            # NOTE(review): progress_apply exists only after tqdm.pandas()
            # has been registered somewhere else — verify at the call site.
            df_train['text'] = df_train['text'].progress_apply(
                delexicalizer.delexicalize)
        return Dataset.from_pandas(df_train)

    def load_validation_set(self, dataset_name="LCA-PORVID/portuguese_vid"):
        """Load the test split of every domain as a validation dict.

        Args:
            dataset_name: hub name of the validation corpus. Defaults to the
                value previously hard-coded; note it is independent of
                self.dataset_name.

        Returns:
            dict mapping domain name -> shuffled ``datasets.Dataset``.
        """
        return {
            domain: load_dataset(
                dataset_name, domain, split='test').shuffle(seed=42)
            for domain in self._DOMAINS
        }

    def load_test_set(self, filter_label_2=False):
        """Load the test split of each configured external test set.

        Args:
            filter_label_2: when True, drop examples whose label equals 2.

        Returns:
            dict mapping test-set name -> ``datasets.Dataset``.
        """
        dataset_return = {}
        for test_set in self.test_set_list:
            dataset_return[test_set] = load_dataset(test_set, split='test')
            if filter_label_2:
                logging.info("Filtering label 2 from test set")
                dataset_return[test_set] = dataset_return[test_set].filter(
                    lambda example: example['label'] != 2)
        return dataset_return
|