import os
import sys
import warnings

import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

warnings.filterwarnings("ignore")
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))
sys.path.append(project_root)
from src.utils import get_information, split_into_char
from src.config.configs import Params
params = Params()
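# NOTE: this module assumes Params (src/config/configs.py) exposes BATCH_SIZE,
# LINE_IDS_DEPTH, LENGTH_LINES_DEPTH, TOTAL_LINES_DEPTH and the TRAIN_DIR /
# VAL_DIR / TEST_DIR paths; all of them are read below.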
class Dataset:
    def __init__(self, train_txt, val_txt, test_txt, num_inputs=5):
        self.train_txt = train_txt
        self.val_txt = val_txt
        self.test_txt = test_txt
        self.classes = None
        self.batch_size = params.BATCH_SIZE
# Data attributes
self.train_samples, self.val_samples, self.test_samples = self._get_samples()
# Data dataframe
self.train_df, self.val_df, self.test_df = self._get_dataframe()
# Word-level input
self.train_sentences, self.val_sentences, self.test_sentences = self._get_word_sentences()
# Char-level input
self.train_char, self.val_char, self.test_char = self._get_char_sentences()
        # Positional inputs (only needed for the tetra/penta datasets)
        if num_inputs >= 4:
            ## Line_ids
            self.line_ids_one_hot, self.line_ids_val_one_hot, self.line_ids_test_one_hot = self._get_lines_id()
            ## Total_lines
            self.total_lines_one_hot, self.total_lines_val_one_hot, self.total_lines_test_one_hot = self._get_total_lines()
        if num_inputs == 5:
            ## Length_lines
            self.length_lines_one_hot, self.length_lines_val_one_hot, self.length_lines_test_one_hot = self._get_length_lines()
# Label one-hot
self.y_train_one_hot, self.y_val_one_hot, self.y_test_one_hot = self._one_hot_encoder()
# Label indexes
self.y_train, self.y_val, self.y_test = self._label_encoder()
def _get_samples(self):
"""
        Parse the train/val/test txt files into lists of samples
"""
self.train_samples = get_information(self.train_txt)
self.val_samples = get_information(self.val_txt)
self.test_samples = get_information(self.test_txt)
return self.train_samples, self.val_samples, self.test_samples
def _get_dataframe(self):
"""
        Build pandas DataFrames from the parsed samples
"""
self.train_df = pd.DataFrame(self.train_samples)
self.val_df = pd.DataFrame(self.val_samples)
self.test_df = pd.DataFrame(self.test_samples)
return self.train_df, self.val_df, self.test_df
def _get_word_sentences(self):
"""
        Get the sentences from the train/val/test sets (word-level inputs)
"""
self.train_sentences = self.train_df['text']
self.val_sentences = self.val_df['text']
self.test_sentences = self.test_df['text']
return self.train_sentences, self.val_sentences, self.test_sentences
def _get_char_sentences(self):
"""
        Split each sentence from the train/val/test sets into characters (char-level inputs)
"""
self.train_char = [split_into_char(line) for line in self.train_df['text']]
self.val_char = [split_into_char(line) for line in self.val_df['text']]
self.test_char = [split_into_char(line) for line in self.test_df['text']]
return self.train_char, self.val_char, self.test_char
    def _one_hot_encoder(self):
        """
        One-hot encode the labels (fit on train, then transform val/test)
        """
        # NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
        # in 1.4; adjust the argument name to match the pinned scikit-learn version.
        one_hot_encoder = OneHotEncoder(sparse=False)
        self.y_train_one_hot = one_hot_encoder.fit_transform(self.train_df["target"].to_numpy().reshape(-1, 1))
        self.y_val_one_hot = one_hot_encoder.transform(self.val_df["target"].to_numpy().reshape(-1, 1))
        self.y_test_one_hot = one_hot_encoder.transform(self.test_df["target"].to_numpy().reshape(-1, 1))
        return self.y_train_one_hot, self.y_val_one_hot, self.y_test_one_hot
    def _label_encoder(self):
        """
        Encode the labels as integer class indices (fit on train, then transform val/test)
        """
        label_encoder = LabelEncoder()
        self.y_train = label_encoder.fit_transform(self.train_df["target"].to_numpy())
        self.y_val = label_encoder.transform(self.val_df["target"].to_numpy())
        self.y_test = label_encoder.transform(self.test_df["target"].to_numpy())
        # Store the class names (index -> label mapping)
        self.classes = label_encoder.classes_
        return self.y_train, self.y_val, self.y_test
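    # Illustration (hypothetical labels): for targets ["BACKGROUND", "METHODS", "METHODS"],
    # LabelEncoder gives [0, 1, 1], the matching one-hot rows are [[1., 0.], [0., 1.], [0., 1.]],
    # and self.classes holds ["BACKGROUND", "METHODS"].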
#-----------------------POSITIONAL FEATURES------------------------------------------
    def _get_lines_id(self):
        """
        One-hot encode the line_id feature
        """
        try:
            self.line_ids_one_hot = tf.one_hot(self.train_df['line_id'].to_numpy(), depth=params.LINE_IDS_DEPTH)
            self.line_ids_val_one_hot = tf.one_hot(self.val_df['line_id'].to_numpy(), depth=params.LINE_IDS_DEPTH)
            self.line_ids_test_one_hot = tf.one_hot(self.test_df['line_id'].to_numpy(), depth=params.LINE_IDS_DEPTH)
        except Exception as e:
            print("Error encoding line_ids:", e)
            raise
        return self.line_ids_one_hot, self.line_ids_val_one_hot, self.line_ids_test_one_hot
    def _get_length_lines(self):
        """
        One-hot encode the length_lines feature
        """
        try:
            self.length_lines_one_hot = tf.one_hot(self.train_df['length_lines'].to_numpy(), depth=params.LENGTH_LINES_DEPTH)
            self.length_lines_val_one_hot = tf.one_hot(self.val_df['length_lines'].to_numpy(), depth=params.LENGTH_LINES_DEPTH)
            self.length_lines_test_one_hot = tf.one_hot(self.test_df['length_lines'].to_numpy(), depth=params.LENGTH_LINES_DEPTH)
        except Exception as e:
            print("Error encoding length_lines:", e)
            raise
        return self.length_lines_one_hot, self.length_lines_val_one_hot, self.length_lines_test_one_hot
    def _get_total_lines(self):
        """
        One-hot encode the total_lines feature
        """
        try:
            self.total_lines_one_hot = tf.one_hot(self.train_df['total_lines'].to_numpy(), depth=params.TOTAL_LINES_DEPTH)
            self.total_lines_val_one_hot = tf.one_hot(self.val_df['total_lines'].to_numpy(), depth=params.TOTAL_LINES_DEPTH)
            self.total_lines_test_one_hot = tf.one_hot(self.test_df['total_lines'].to_numpy(), depth=params.TOTAL_LINES_DEPTH)
        except Exception as e:
            print("Error encoding total_lines:", e)
            raise
        return self.total_lines_one_hot, self.total_lines_val_one_hot, self.total_lines_test_one_hot
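    # Illustration of the positional encoding: tf.one_hot([0, 2], depth=4) gives
    # [[1., 0., 0., 0.], [0., 0., 1., 0.]]; any index >= depth maps to an all-zero row,
    # so the *_DEPTH values in Params should cover the largest expected index.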
# ----------------------------------------------------------------------------------------
# -------------------CREATE DATASET------------------------------------------------------
def _get_word_dataset(self):
"""
        Build tf.data datasets with a single input: word-level tokens
"""
# Train set
word_input_data = tf.data.Dataset.from_tensor_slices(self.train_sentences)
word_input_label = tf.data.Dataset.from_tensor_slices(self.y_train_one_hot)
word_train_dataset = tf.data.Dataset.zip((word_input_data, word_input_label))
word_train_dataset = word_train_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Val set
word_val_data = tf.data.Dataset.from_tensor_slices(self.val_sentences)
word_val_label = tf.data.Dataset.from_tensor_slices(self.y_val_one_hot)
word_val_dataset = tf.data.Dataset.zip((word_val_data, word_val_label))
word_val_dataset = word_val_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Test_set
word_test_data = tf.data.Dataset.from_tensor_slices(self.test_sentences)
word_test_label = tf.data.Dataset.from_tensor_slices(self.y_test_one_hot)
word_test_dataset = tf.data.Dataset.zip((word_test_data, word_test_label))
word_test_dataset = word_test_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
return word_train_dataset, word_val_dataset, word_test_dataset
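    # Each dataset above yields (inputs, one_hot_labels) batches of size params.BATCH_SIZE;
    # prefetch(tf.data.AUTOTUNE) lets the input pipeline prepare the next batch while the
    # model consumes the current one.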
def _get_word_char_dataset(self):
"""
        Build tf.data datasets with hybrid inputs: word-level and char-level tokens
"""
# Train set
word_char_data = tf.data.Dataset.from_tensor_slices((self.train_sentences, self.train_char))
word_char_label = tf.data.Dataset.from_tensor_slices(self.y_train_one_hot)
word_char_dataset = tf.data.Dataset.zip((word_char_data, word_char_label))
word_char_dataset = word_char_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Val set
word_char_val_data = tf.data.Dataset.from_tensor_slices((self.val_sentences, self.val_char))
word_char_val_label = tf.data.Dataset.from_tensor_slices(self.y_val_one_hot)
word_char_val_dataset = tf.data.Dataset.zip((word_char_val_data, word_char_val_label))
word_char_val_dataset = word_char_val_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
#Test set
word_char_test_data = tf.data.Dataset.from_tensor_slices((self.test_sentences, self.test_char))
word_char_test_label = tf.data.Dataset.from_tensor_slices(self.y_test_one_hot)
word_char_test_dataset = tf.data.Dataset.zip((word_char_test_data, word_char_test_label))
word_char_test_dataset = word_char_test_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
return word_char_dataset, word_char_val_dataset, word_char_test_dataset
def _get_tetra_dataset(self):
"""
        Build tf.data datasets with four inputs: word-level tokens, char-level tokens, line_ids one-hots and total_lines one-hots
"""
# Train set
tetra_data = tf.data.Dataset.from_tensor_slices((self.train_sentences, self.train_char,
self.line_ids_one_hot, self.total_lines_one_hot))
tetra_label = tf.data.Dataset.from_tensor_slices(self.y_train_one_hot)
tetra_train_dataset = tf.data.Dataset.zip((tetra_data, tetra_label))
tetra_train_dataset = tetra_train_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Val set
tetra_val_data = tf.data.Dataset.from_tensor_slices((self.val_sentences, self.val_char,
self.line_ids_val_one_hot, self.total_lines_val_one_hot))
tetra_val_label = tf.data.Dataset.from_tensor_slices(self.y_val_one_hot)
tetra_val_dataset = tf.data.Dataset.zip((tetra_val_data, tetra_val_label))
tetra_val_dataset = tetra_val_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Test set
tetra_test_data = tf.data.Dataset.from_tensor_slices((self.test_sentences, self.test_char,
self.line_ids_test_one_hot, self.total_lines_test_one_hot))
        tetra_test_label = tf.data.Dataset.from_tensor_slices(self.y_test_one_hot)
tetra_test_dataset = tf.data.Dataset.zip((tetra_test_data, tetra_test_label))
tetra_test_dataset = tetra_test_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
return tetra_train_dataset, tetra_val_dataset, tetra_test_dataset
def _get_penta_dataset(self):
"""
        Build tf.data datasets with five inputs: word-level tokens, char-level tokens, line_ids, length_lines and total_lines one-hots
"""
# Train set
penta_data = tf.data.Dataset.from_tensor_slices((self.train_sentences, self.train_char,
self.line_ids_one_hot, self.length_lines_one_hot, self.total_lines_one_hot))
penta_label = tf.data.Dataset.from_tensor_slices(self.y_train_one_hot)
penta_train_dataset = tf.data.Dataset.zip((penta_data, penta_label))
penta_train_dataset = penta_train_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Val set
penta_val_data = tf.data.Dataset.from_tensor_slices((self.val_sentences, self.val_char,
self.line_ids_val_one_hot, self.length_lines_val_one_hot, self.total_lines_val_one_hot))
penta_val_label = tf.data.Dataset.from_tensor_slices(self.y_val_one_hot)
penta_val_dataset = tf.data.Dataset.zip((penta_val_data, penta_val_label))
penta_val_dataset = penta_val_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
# Test set
penta_test_data = tf.data.Dataset.from_tensor_slices((self.test_sentences, self.test_char,
self.line_ids_test_one_hot, self.length_lines_test_one_hot, self.total_lines_test_one_hot))
penta_test_label = tf.data.Dataset.from_tensor_slices(self.y_test_one_hot)
penta_test_dataset = tf.data.Dataset.zip((penta_test_data, penta_test_label))
penta_test_dataset = penta_test_dataset.batch(self.batch_size).prefetch(tf.data.AUTOTUNE)
return penta_train_dataset, penta_val_dataset, penta_test_dataset
if __name__ == "__main__":
params = Params()
dataset = Dataset(train_txt=params.TRAIN_DIR, val_txt=params.VAL_DIR, test_txt=params.TEST_DIR)
    word_char_train, word_char_val, _ = dataset._get_word_char_dataset()
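    # Optional sanity-check sketch (assumes the txt files referenced by Params exist and
    # num_inputs=5, so the positional one-hots were built):
    penta_train, penta_val, penta_test = dataset._get_penta_dataset()
    print(penta_train.element_spec)  # ((word, char, line_ids, length_lines, total_lines), label)
    for features, labels in penta_train.take(1):
        print(len(features), labels.shape)  # 5 input tensors, (batch_size, num_classes) labels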
print("Done")