import os

import pandas as pd
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertTokenizerFast
from datasets import Dataset, DatasetDict, load_dataset

from src.utils import (
    detect_language,
    add_emoji_tokens,
    add_new_line_token,
    user_id,
)
from src.utils.text_functions import clean_text
from src.utils.s3 import read_csv, save_csv

load_dotenv()


class MLMDataset:
    """Builds the ad-copy MLM pretraining dataset: merges raw ad sources,
    filters to English, cleans the text, and publishes the tokenizer and
    train/val/test splits to the Hugging Face Hub."""

    def __init__(
        self,
        s3: bool = False,
        bucket: str = "lebesgue-data-science",
        # Requires GLOBAL_PATH_TO_REPO in the environment (loaded via .env).
        folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data/pretrain",
        s3_folder: str = "transformers/data/pretrain",
    ):
        self.s3 = s3
        self.bucket = bucket
        self.folder = s3_folder if self.s3 else folder

        self.primaries_path = f"{self.folder}/primaries.csv"
        self.competitors_path = f"{self.folder}/competitor_ads.csv"
        self.ad_copies_path = f"{self.folder}/ad_copies.csv"
        self.english_copies_path = f"{self.folder}/english_copies.csv"
        self.train_path = f"{self.folder}/train.csv"
        self.val_path = f"{self.folder}/val.csv"
        self.test_path = f"{self.folder}/test.csv"

        self.tokenizer_id = f"{user_id}/lebesgue_ad_tokenizer"
        self.hub_datasetdict_id = f"{user_id}/lebesgue_ad_datasets"

    @property
    def primaries(self) -> pd.DataFrame:
        return read_csv(self.primaries_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def competitors(self) -> pd.DataFrame:
        return read_csv(self.competitors_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def ad_copies(self) -> pd.DataFrame:
        return read_csv(self.ad_copies_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def english_copies(self) -> pd.DataFrame:
        # Cleaned copies can contain embedded carriage returns, so pin the
        # line terminator when parsing.
        args = {"lineterminator": "\n"}
        return read_csv(
            self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket} | args,
            pd_args=args,
        )

    @property
    def train(self) -> pd.DataFrame:
        return read_csv(self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def val(self) -> pd.DataFrame:
        return read_csv(self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def test(self) -> pd.DataFrame:
        return read_csv(self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def datasets(self) -> DatasetDict:
        return load_dataset(self.hub_datasetdict_id)

    def tokenizer(self, checkpoint: str = "bert-base-uncased") -> AutoTokenizer:
        return AutoTokenizer.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    def concat_and_remove_duplicates(self) -> pd.DataFrame:
        # Primary ads are stored as lists of copies per row; flatten them,
        # append the competitor ad texts, and deduplicate.
        primaries = [p for p in self.primaries.value.to_list() if isinstance(p, list)]
        list_of_primaries = []
        for primary in primaries:
            list_of_primaries.extend(primary)
        competitors = self.competitors.ad_text.to_list()

        ad_copies = pd.Series(list_of_primaries + competitors).drop_duplicates()
        ad_copies = ad_copies.rename("text").reset_index(drop=True).to_frame()
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return ad_copies

    def get_language(self) -> pd.DataFrame:
        ad_copies = self.ad_copies
        ad_copies["language"] = ad_copies.text.apply(detect_language)
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return ad_copies

    def filter_english(self) -> pd.DataFrame:
        ad_copies = self.ad_copies
        english = ad_copies[ad_copies.language == "en"]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def clean_english(self) -> pd.DataFrame:
        english = self.english_copies
        english["text_clean"] = english.text.apply(clean_text)
        # Drop copies that end up empty after cleaning.
        english = english[english.text_clean.apply(len) != 0]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def train_tokenizer(self, checkpoint: str = "bert-base-uncased"):
        # Extends the pretrained tokenizer (rather than training one from
        # scratch) with emoji and newline tokens, then pushes it to the Hub.
        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
        tokenizer = add_emoji_tokens(tokenizer=tokenizer)
        tokenizer = add_new_line_token(tokenizer=tokenizer)
        tokenizer.push_to_hub(f"{self.tokenizer_id}_{checkpoint}")

    def get_tokenizer(self, checkpoint: str = "bert-base-uncased") -> BertTokenizerFast:
        return BertTokenizerFast.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    def split_into_train_and_test(
        self,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        # 90/10 train/test split, then 85/15 train/val on the training
        # portion (~76.5% / 13.5% / 10% overall).
        df = self.english_copies
        train, test = train_test_split(df, train_size=0.9, random_state=42)
        train, val = train_test_split(train, train_size=0.85, random_state=42)

        dataset_dict = DatasetDict()
        for split, local_path, key in zip(
            [train, val, test],
            [self.train_path, self.val_path, self.test_path],
            ["train", "val", "test"],
        ):
            save_csv(df=split, path=local_path, s3=self.s3, s3_args={"bucket": self.bucket})
            dataset_dict[key] = Dataset.from_pandas(split, preserve_index=False)
        dataset_dict.push_to_hub(self.hub_datasetdict_id)
        return train, val, test


mlm_dataset = MLMDataset()
mlm_dataset_s3 = MLMDataset(s3=True)
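

# A minimal end-to-end usage sketch (illustrative, not part of the original
# module). It assumes primaries.csv and competitor_ads.csv already exist in
# the data folder and that Hugging Face Hub credentials are configured for
# the push_to_hub calls.
if __name__ == "__main__":
    ds = mlm_dataset  # or mlm_dataset_s3 to read/write via S3
    ds.concat_and_remove_duplicates()  # merge primary + competitor ads, dedupe
    ds.get_language()                  # tag each copy with a detected language
    ds.filter_english()                # keep only English copies
    ds.clean_english()                 # normalize text, drop empty rows
    ds.train_tokenizer()               # add emoji/newline tokens, push to Hub
    train, val, test = ds.split_into_train_and_test()  # save splits, push DatasetDict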