|
|
import os
from typing import Optional

from datasets import Dataset, DatasetDict, load_dataset
from dotenv import load_dotenv
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertTokenizerFast

from src.utils import (
    detect_language,
    add_emoji_tokens,
    add_new_line_token,
    user_id,
)
from src.utils.text_functions import clean_text
from src.utils.s3 import read_csv, save_csv
|
|
|
|
|
# Load environment variables (incl. GLOBAL_PATH_TO_REPO) from a local .env file.
load_dotenv()
|
|
|
|
|
|
|
|
class MLMDataset:
    """Build and serve the ad-copy dataset used for MLM pretraining.

    Pipeline implemented by this class:

    1. ``concat_and_remove_duplicates`` — merge primary and competitor ad
       copies into one deduplicated ``ad_copies.csv``.
    2. ``get_language`` / ``filter_english`` — tag each copy with a detected
       language and keep only the English rows.
    3. ``clean_english`` — normalize the English copies' text and drop empties.
    4. ``train_tokenizer`` — extend a BERT tokenizer with emoji/newline
       tokens and push it to the Hugging Face Hub.
    5. ``split_into_train_and_test`` — produce train/val/test CSVs and push
       the splits to the Hub as a ``DatasetDict``.

    All CSVs are read/written either from a local folder or from S3,
    selected by the ``s3`` flag given at construction.
    """

    def __init__(
        self,
        s3: bool = False,
        bucket: str = "lebesgue-data-science",
        folder: Optional[str] = None,
        s3_folder: str = "transformers/data/pretrain",
    ):
        """Configure storage locations and Hub ids.

        Args:
            s3: When True, read/write everything from ``bucket`` under
                ``s3_folder``; otherwise use the local ``folder``.
            bucket: S3 bucket name (only used when ``s3`` is True).
            folder: Local data directory. Defaults to
                ``$GLOBAL_PATH_TO_REPO/data/pretrain`` — resolved lazily
                here (not in the signature) so a missing env var no longer
                raises at import time.
            s3_folder: Key prefix inside ``bucket`` for all CSVs.

        Raises:
            ValueError: if ``s3`` is False, ``folder`` is not given and the
                ``GLOBAL_PATH_TO_REPO`` environment variable is unset.
        """
        self.s3 = s3
        self.bucket = bucket

        if self.s3:
            self.folder = s3_folder
        else:
            if folder is None:
                repo = os.getenv("GLOBAL_PATH_TO_REPO")
                if repo is None:
                    raise ValueError(
                        "GLOBAL_PATH_TO_REPO is not set; pass `folder` "
                        "explicitly or define the environment variable."
                    )
                folder = f"{repo}/data/pretrain"
            self.folder = folder

        # CSV locations for every stage of the pipeline.
        self.primaries_path = f"{self.folder}/primaries.csv"
        self.competitors_path = f"{self.folder}/competitor_ads.csv"
        self.ad_copies_path = f"{self.folder}/ad_copies.csv"
        self.english_copies_path = f"{self.folder}/english_copies.csv"
        self.train_path = f"{self.folder}/train.csv"
        self.val_path = f"{self.folder}/val.csv"
        self.test_path = f"{self.folder}/test.csv"

        # Hugging Face Hub ids, namespaced under the project user.
        self.tokenizer_id = f"{user_id}/lebesgue_ad_tokenizer"
        self.hub_datasetdict_id = f"{user_id}/lebesgue_ad_datasets"

    # --- storage helpers --------------------------------------------------

    def _read(self, path: str) -> pd.DataFrame:
        """Read a CSV from local disk or S3, per ``self.s3``."""
        return read_csv(path, s3=self.s3, s3_args={"bucket": self.bucket})

    def _save(self, df: pd.DataFrame, path: str) -> None:
        """Write ``df`` as CSV to local disk or S3, per ``self.s3``."""
        save_csv(df=df, path=path, s3=self.s3, s3_args={"bucket": self.bucket})

    # --- raw and derived tables -------------------------------------------

    @property
    def primaries(self) -> pd.DataFrame:
        """Primary (own) ad copies table."""
        return self._read(self.primaries_path)

    @property
    def competitors(self) -> pd.DataFrame:
        """Competitor ad copies table."""
        return self._read(self.competitors_path)

    @property
    def ad_copies(self) -> pd.DataFrame:
        """Merged, deduplicated copies (see ``concat_and_remove_duplicates``)."""
        return self._read(self.ad_copies_path)

    @property
    def english_copies(self) -> pd.DataFrame:
        """English-only ad copies (see ``filter_english``)."""
        args = {"lineterminator": "\n"}
        # NOTE(review): ``args`` is merged into ``s3_args`` as well as
        # ``pd_args``; ``lineterminator`` does not look like an S3 option —
        # confirm against ``read_csv``'s signature before simplifying.
        return read_csv(
            self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket} | args,
            pd_args=args,
        )

    @property
    def train(self) -> pd.DataFrame:
        """Training split."""
        return self._read(self.train_path)

    @property
    def val(self) -> pd.DataFrame:
        """Validation split."""
        return self._read(self.val_path)

    @property
    def test(self) -> pd.DataFrame:
        """Test split."""
        return self._read(self.test_path)

    @property
    def datasets(self) -> DatasetDict:
        """Train/val/test splits as a ``DatasetDict`` pulled from the Hub."""
        return load_dataset(self.hub_datasetdict_id)

    def tokenizer(self, checkpoint: str = "bert-base-uncased") -> AutoTokenizer:
        """Load the project tokenizer derived from ``checkpoint`` off the Hub."""
        return AutoTokenizer.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    # --- pipeline steps ---------------------------------------------------

    def concat_and_remove_duplicates(self) -> pd.DataFrame:
        """Merge primary and competitor copies, drop duplicates, persist.

        Returns:
            The deduplicated one-column (``text``) DataFrame that was
            written to ``ad_copies.csv``.
        """
        # ``primaries.value`` is expected to hold a list of copies per row;
        # anything else (NaN, stray scalars) is skipped.
        nested = [v for v in self.primaries.value.to_list() if isinstance(v, list)]
        flat_primaries = [copy for row in nested for copy in row]

        competitors = self.competitors.ad_text.to_list()

        merged = pd.Series(flat_primaries + competitors).drop_duplicates()
        ad_copies = pd.DataFrame(merged, columns=["text"])
        self._save(ad_copies, self.ad_copies_path)
        # BUGFIX: annotated to return a DataFrame but previously returned None.
        return ad_copies

    def get_language(self) -> pd.DataFrame:
        """Tag every ad copy with its detected language and persist."""
        ad_copies = self.ad_copies
        ad_copies["language"] = ad_copies.text.apply(detect_language)
        self._save(ad_copies, self.ad_copies_path)
        return ad_copies

    def filter_english(self) -> pd.DataFrame:
        """Keep only rows detected as English and persist them."""
        ad_copies = self.ad_copies
        english = ad_copies[ad_copies.language == "en"]
        self._save(english, self.english_copies_path)
        return english

    def clean_english(self) -> pd.DataFrame:
        """Clean the English copies' text, drop empties, persist."""
        english = self.english_copies
        english["text_clean"] = english.text.apply(clean_text)
        # Drop rows whose text became empty after cleaning.
        english = english[english.text_clean.apply(len) != 0]
        self._save(english, self.english_copies_path)
        return english

    def train_tokenizer(self, checkpoint: str = "bert-base-uncased"):
        """Extend ``checkpoint``'s tokenizer and push it to the Hub.

        Adds emoji and newline tokens on top of the base tokenizer and
        publishes it under ``{tokenizer_id}_{checkpoint}``.
        """
        # BUGFIX: previously loaded the hard-coded "bert-base-uncased"
        # regardless of ``checkpoint``, while still pushing under the
        # checkpoint-suffixed hub id.
        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
        tokenizer = add_emoji_tokens(tokenizer=tokenizer)
        tokenizer = add_new_line_token(tokenizer=tokenizer)

        tokenizer.push_to_hub(f"{self.tokenizer_id}_{checkpoint}")

    def get_tokenizer(self):
        """Load the tokenizer published under the bare ``tokenizer_id``.

        NOTE(review): unlike ``tokenizer()``/``train_tokenizer()`` this id
        carries no ``_{checkpoint}`` suffix — possibly a legacy path; kept
        as-is so existing callers are unaffected.
        """
        return BertTokenizerFast.from_pretrained(self.tokenizer_id)

    def split_into_train_and_test(
        self,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split the English copies into train/val/test and publish them.

        Uses fixed seeds, yielding a 76.5% / 13.5% / 10% split. Each split
        is saved as CSV and all three are pushed to the Hub as one
        ``DatasetDict``.

        Returns:
            ``(train, val, test)`` DataFrames.
        """
        df = self.english_copies
        train, test = train_test_split(df, train_size=0.9, random_state=42)
        train, val = train_test_split(train, train_size=0.85, random_state=42)

        dataset_dict = DatasetDict()

        # BUGFIX: the test split was previously written to ``train_path``
        # (silently overwriting the train CSV); it now goes to ``test_path``.
        for split_df, path, key in zip(
            [train, val, test],
            [self.train_path, self.val_path, self.test_path],
            ["train", "val", "test"],
        ):
            self._save(split_df, path)
            dataset_dict[key] = Dataset.from_pandas(split_df, preserve_index=False)

        dataset_dict.push_to_hub(self.hub_datasetdict_id)

        return train, val, test
|
|
|
|
|
|
|
|
# Module-level convenience instances: a local-disk handle and an S3-backed one.
# NOTE(review): constructed at import time; __init__ only builds path strings
# and Hub ids, so no storage access happens here.
mlm_dataset = MLMDataset()

mlm_dataset_s3 = MLMDataset(s3=True)
|
|
|