diff --git a/src/MLM/__init__.py b/src/MLM/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ca0003249df372948d4ffac1e7f3fa9c3c75a7b
--- /dev/null
+++ b/src/MLM/__init__.py
@@ -0,0 +1,2 @@
+from .training_scripts.train_with_trainer import train_with_trainer
+from .datasets.preprocess_dataset import preprocess_dataset
diff --git a/src/MLM/__pycache__/__init__.cpython-310.pyc b/src/MLM/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c11dc21d73b65e69e2008441c1298c97681c65d3
Binary files /dev/null and b/src/MLM/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/MLM/datasets/MLMDataset.py b/src/MLM/datasets/MLMDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a427824c1123411a184af76a44036d994e71e4b2
--- /dev/null
+++ b/src/MLM/datasets/MLMDataset.py
@@ -0,0 +1,193 @@
+import os
+from dotenv import load_dotenv
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from transformers import BertTokenizerFast, AutoTokenizer
+from datasets import Dataset, DatasetDict, load_dataset
+
+from src.utils import (
+    detect_language,
+    add_emoji_tokens,
+    add_new_line_token,
+    user_id,
+)
+from src.utils.text_functions import clean_text
+from src.utils.s3 import read_csv, save_csv
+
+load_dotenv()
+
+
+class MLMDataset:
+    def __init__(
+        self,
+        s3: bool = False,
+        bucket: str = "lebesgue-data-science",
+        folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data/pretrain",
+        s3_folder: str = "transformers/data/pretrain",
+    ):
+        self.s3 = s3
+        self.bucket = bucket
+
+        if self.s3:
+            self.folder = s3_folder
+        else:
+            self.folder = folder
+
+        self.primaries_path = f"{self.folder}/primaries.csv"
+        self.competitors_path = f"{self.folder}/competitor_ads.csv"
+        self.ad_copies_path = f"{self.folder}/ad_copies.csv"
+        self.english_copies_path = f"{self.folder}/english_copies.csv"
+        self.train_path = f"{self.folder}/train.csv"
+        self.val_path = f"{self.folder}/val.csv"
+        self.test_path = f"{self.folder}/test.csv"
+
+        self.tokenizer_id = f"{user_id}/lebesgue_ad_tokenizer"
+
+        self.hub_datasetdict_id = f"{user_id}/lebesgue_ad_datasets"
+
+    @property
+    def primaries(self) -> pd.DataFrame:
+        df = read_csv(self.primaries_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def competitors(self) -> pd.DataFrame:
+        df = read_csv(self.competitors_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def ad_copies(self) -> pd.DataFrame:
+        df = read_csv(self.ad_copies_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def english_copies(self) -> pd.DataFrame:
+        args = {"lineterminator": "\n"}
+        df = read_csv(
+            self.english_copies_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket} | args,
+            pd_args=args,
+        )
+        return df
+
+    @property
+    def train(self) -> pd.DataFrame:
+        df = read_csv(self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def val(self) -> pd.DataFrame:
+        df = read_csv(self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def test(self) -> pd.DataFrame:
+        df = read_csv(self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def datasets(self) -> DatasetDict:
+        return load_dataset(self.hub_datasetdict_id)
+
+    def tokenizer(self, checkpoint: str = "bert-base-uncased") -> AutoTokenizer:
+        return AutoTokenizer.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")
+
+    def concat_and_remove_duplicates(self) -> pd.DataFrame:
+        comp = self.competitors
+        prim = self.primaries
+
+        primaries = prim.value.to_list()
+        primaries = [primary for primary in primaries if isinstance(primary, list)]
+
+        list_of_primaries = []
+        for primary in primaries:
+            list_of_primaries.extend(primary)
+
+        competitors = comp.ad_text.to_list()
+
+        ad_copies = list_of_primaries + competitors
+        ad_copies = pd.Series(ad_copies).drop_duplicates()
+        ad_copies = pd.DataFrame(ad_copies, columns=["text"])
+        save_csv(
+            df=ad_copies,
+            path=self.ad_copies_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket},
+        )
+        return ad_copies
+
+    def get_language(self) -> pd.DataFrame:
+        ad_copies = self.ad_copies
+        ad_copies["language"] = ad_copies.text.apply(lambda text: detect_language(text))
+        save_csv(
+            df=ad_copies,
+            path=self.ad_copies_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket},
+        )
+        return ad_copies
+
+    def filter_english(self) -> pd.DataFrame:
+        ad_copies = self.ad_copies
+        english = ad_copies[ad_copies.language == "en"]
+        save_csv(
+            df=english,
+            path=self.english_copies_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket},
+        )
+        return english
+
+    def clean_english(self) -> pd.DataFrame:
+        english = self.english_copies
+        english["text_clean"] = english.text.apply(clean_text)
+
+        # remove empty ones
+        english = english[english.text_clean.apply(len) != 0]
+        save_csv(
+            df=english,
+            path=self.english_copies_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket},
+        )
+        return english
+
+    def train_tokenizer(self, checkpoint: str = "bert-base-uncased"):
+        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
+        tokenizer = add_emoji_tokens(tokenizer=tokenizer)
+        tokenizer = add_new_line_token(tokenizer=tokenizer)
+
+        tokenizer.push_to_hub(f"{self.tokenizer_id}_{checkpoint}")
+
+    def get_tokenizer(self, checkpoint: str = "bert-base-uncased"):
+        return BertTokenizerFast.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")
+
+    def split_into_train_and_test(
+        self,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        df = self.english_copies
+        train, test = train_test_split(df, train_size=0.9, random_state=42)
+        train, val = train_test_split(train, train_size=0.85, random_state=42)
+
+        dataset_dict = DatasetDict()
+
+        for df, local_path, dataset_dict_key in zip(
+            [train, val, test],
+            [self.train_path, self.val_path, self.test_path],
+            ["train", "val", "test"],
+        ):
+            save_csv(df=df, path=local_path, s3=self.s3, s3_args={"bucket": self.bucket})
+            df_hf = Dataset.from_pandas(df, preserve_index=False)
+            dataset_dict[dataset_dict_key] = df_hf
+
+        dataset_dict.push_to_hub(self.hub_datasetdict_id)
+
+        return train, val, test
+
+
+mlm_dataset = MLMDataset()
+
+mlm_dataset_s3 = MLMDataset(s3=True)
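+
+# Example pipeline (hedged sketch; assumes the raw primaries/competitor CSVs
+# already exist under data/pretrain and Hugging Face Hub credentials are
+# configured for the push steps):
+#
+#     mlm_dataset.concat_and_remove_duplicates()
+#     mlm_dataset.get_language()
+#     mlm_dataset.filter_english()
+#     mlm_dataset.clean_english()
+#     mlm_dataset.train_tokenizer(checkpoint="bert-base-uncased")
+#     train, val, test = mlm_dataset.split_into_train_and_test()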
diff --git a/src/MLM/datasets/__init__.py b/src/MLM/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8182932094c04b6c6981ac0fed1019a65574f2ca
--- /dev/null
+++ b/src/MLM/datasets/__init__.py
@@ -0,0 +1,2 @@
+from .preprocess_dataset import preprocess_dataset
+from .MLMDataset import MLMDataset, mlm_dataset, mlm_dataset_s3
diff --git a/src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc b/src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dbfba37defb5e1309d1b4a0f1f02a7f940aed03
Binary files /dev/null and b/src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc differ
diff --git a/src/MLM/datasets/__pycache__/__init__.cpython-310.pyc b/src/MLM/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aae4cb80b4620da538f5280f78a60daf2c60bebd
Binary files /dev/null and b/src/MLM/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc b/src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8087fb263b31d5c6c0ce0c490cf6caadd0e2a4f
Binary files /dev/null and b/src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc differ
diff --git a/src/MLM/datasets/preprocess_dataset.py b/src/MLM/datasets/preprocess_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1eb9698b8fa1a26bed7302200d57a1e115e157
--- /dev/null
+++ b/src/MLM/datasets/preprocess_dataset.py
@@ -0,0 +1,39 @@
+from datasets import Dataset, DatasetDict
+from transformers import AutoTokenizer
+
+
+def preprocess_dataset(dataset: Dataset | DatasetDict, tokenizer: AutoTokenizer) -> Dataset | DatasetDict:
+    tokenized_dataset = dataset.map(
+        lambda examples: tokenize_function(examples, tokenizer),
+        batched=True,
+        remove_columns=["text", "text_clean", "language"],
+    )
+
+    return tokenized_dataset.map(group_texts, batched=True)
+
+
+def tokenize_function(examples, tokenizer: AutoTokenizer):
+    result = tokenizer(examples["text"])
+
+    # fast tokenizers expose word ids, which whole-word masking collators need
+    if tokenizer.is_fast:
+        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
+
+    return result
+
+
+def group_texts(examples, chunk_size: int = 128):
+    # concatenate every column, then re-slice into fixed-size chunks
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+
+    # drop the ragged tail so every chunk has exactly chunk_size tokens
+    total_length = len(concatenated_examples["input_ids"])
+    total_length = (total_length // chunk_size) * chunk_size
+
+    result = {
+        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+
+    return result
diff --git a/src/MLM/mask_and_unmask.py b/src/MLM/mask_and_unmask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0fc67c93f6b8cc323ac8f506bfefb6c2e7886c1
--- /dev/null
+++ b/src/MLM/mask_and_unmask.py
@@ -0,0 +1,32 @@
+from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizerFast, DataCollatorForLanguageModeling
+import torch
+
+
+def mask_and_unmask(
+    text: str,
+    tokenizer: AutoTokenizer | BertTokenizerFast,
+    model: AutoModelForMaskedLM,
+    data_collator: DataCollatorForLanguageModeling,
+) -> dict:
+    # let the collator randomly mask the input, then decode it back to text
+    collator_input = tokenizer(text)
+    collator_input["labels"] = collator_input["input_ids"].copy()
+    collator_output = data_collator([collator_input])
+    masked_text = tokenizer.decode(collator_output["input_ids"][0])
+
+    pred_dict = {"masked_text": masked_text}
+
+    inputs = tokenizer(masked_text, return_tensors="pt", padding="max_length", truncation=True)
+    token_logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits
+    all_masked_token_index = torch.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
+    if all_masked_token_index.size(0) != 0:
+        for i, masked_index_token in enumerate(all_masked_token_index[:, 1]):
+            masked_token_logits = token_logits[0, masked_index_token, :]
+            top_5_tokens = torch.argsort(masked_token_logits, descending=True)[:5].tolist()
+            # re-tokenizing the decoded text prepends a second [CLS], so indices
+            # into the collator output are shifted back by one
+            value = tokenizer.decode(collator_output["labels"][0, masked_index_token - 1])
+            pred_dict[value] = [tokenizer.decode(token) for token in top_5_tokens]
+
+    return pred_dict
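+
+# Example usage (hedged sketch; "my-user/lebesgue_mlm_bert_03" is a placeholder
+# fine-tuned checkpoint, not one shipped with this commit):
+#
+#     tokenizer = AutoTokenizer.from_pretrained("my-user/lebesgue_mlm_bert_03")
+#     model = AutoModelForMaskedLM.from_pretrained("my-user/lebesgue_mlm_bert_03")
+#     collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+#     preds = mask_and_unmask("Shop the summer sale today!", tokenizer, model, collator)
+#     preds["masked_text"]  # the input with random [MASK] tokens
+#     # the remaining keys map each masked word to its top-5 predicted fills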
diff --git a/src/MLM/training_scripts/__init__.py b/src/MLM/training_scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e603ad4e5f1d0a21a3724f17f368f5f5745ca142
--- /dev/null
+++ b/src/MLM/training_scripts/__init__.py
@@ -0,0 +1 @@
+from .train_with_trainer import train_with_trainer
diff --git a/src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc b/src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..939b05ce4fc0629fb99a432d1ff52f92e8c57ea7
Binary files /dev/null and b/src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc b/src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a65c3453250e26ffdbdf77d2bf72be09a2f7750
Binary files /dev/null and b/src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc differ
diff --git a/src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc b/src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..984e509f684472f36b7fd95b23d1b2cca29a0c0b
Binary files /dev/null and b/src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc differ
diff --git a/src/MLM/training_scripts/train_with_trainer.py b/src/MLM/training_scripts/train_with_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..93716da143c0242e9bcdb886aebe6d53b6ec0b81
--- /dev/null
+++ b/src/MLM/training_scripts/train_with_trainer.py
@@ -0,0 +1,50 @@
+from transformers import AutoModelForMaskedLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
+from datasets import DatasetDict
+
+from src.MLM.datasets.preprocess_dataset import preprocess_dataset
+from src.MLM.training_scripts.utils import get_new_model_name
+
+
+def train_with_trainer(
+    model_checkpoint: str,
+    tokenizer: AutoTokenizer,
+    dataset: DatasetDict,
+    model_name: str | None = None,
+    data_collator=None,
+    num_epochs: int = 3,
+):
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+
+    model_name = get_new_model_name(model_checkpoint=model_checkpoint, model_name=model_name)
+
+    dataset = preprocess_dataset(dataset=dataset, tokenizer=tokenizer)
+
+    if data_collator is None:
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+
+    training_args = TrainingArguments(
+        model_name,
+        evaluation_strategy="epoch",
+        learning_rate=2e-5,
+        weight_decay=0.01,
+        push_to_hub=True,
+        report_to="wandb",
+        run_name=model_name,
+        num_train_epochs=num_epochs,
+        save_total_limit=1,
+        save_strategy="epoch",
+    )
+
+    print(f"device: {training_args.device}")
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["val"],
+        data_collator=data_collator,
+    )
+
+    trainer.train()
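+
+# Example usage (hedged sketch; "my-user/lebesgue_mlm_bert_03" is a placeholder
+# checkpoint whose trailing digits let get_new_model_name bump the version):
+#
+#     from src.MLM.datasets.MLMDataset import mlm_dataset
+#
+#     tokenizer = mlm_dataset.tokenizer(checkpoint="bert-base-uncased")
+#     train_with_trainer(
+#         model_checkpoint="my-user/lebesgue_mlm_bert_03",
+#         tokenizer=tokenizer,
+#         dataset=mlm_dataset.datasets,
+#         num_epochs=3,
+#     )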
f"{model_checkpoint[:-2]}{new_version_number}" + elif not model_name[-2:].isnumeric(): + model_name = model_name + "_00" + + return model_name diff --git a/src/regression/.gitignore b/src/regression/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..643cb1816408b3073fbe1bed5996a5d1b7c15836 --- /dev/null +++ b/src/regression/.gitignore @@ -0,0 +1 @@ +runs/ \ No newline at end of file diff --git a/src/regression/HF/__init__.py b/src/regression/HF/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d54258b004b23bc729b4cddb37660cc362f98f4 --- /dev/null +++ b/src/regression/HF/__init__.py @@ -0,0 +1,2 @@ +from .configs import * +from .models import * diff --git a/src/regression/HF/__pycache__/__init__.cpython-310.pyc b/src/regression/HF/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80664d1ab6b171a4dd13420c395cdcfb5e3f2a12 Binary files /dev/null and b/src/regression/HF/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/regression/HF/configs/FullModelConfigHF.py b/src/regression/HF/configs/FullModelConfigHF.py new file mode 100644 index 0000000000000000000000000000000000000000..554cf2efdce037c12ff8a666fdb3bd1e7c059269 --- /dev/null +++ b/src/regression/HF/configs/FullModelConfigHF.py @@ -0,0 +1,25 @@ +from transformers import PretrainedConfig +from src.regression.PL import EncoderPL, DecoderPL +from typing import List + + +class FullModelConfigHF(PretrainedConfig): + + model_type = "full_model" + + def __init__( + self, + tokenizer_ckpt: str = "", + bert_ckpt: str = "", + decoder_ckpt: str = "", + layer_norm: bool = True, + nontext_features: List[str] = ["aov"], + **kwargs, + ): + + self.tokenizer_ckpt = tokenizer_ckpt + self.bert_ckpt = bert_ckpt + self.decoder_ckpt = decoder_ckpt + self.nontext_features = nontext_features + self.layer_norm = layer_norm + super().__init__(**kwargs) diff --git a/src/regression/HF/configs/__init__.py b/src/regression/HF/configs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7b8276dbb673def8517a6da81736841f94b74e75 --- /dev/null +++ b/src/regression/HF/configs/__init__.py @@ -0,0 +1 @@ +from .FullModelConfigHF import FullModelConfigHF diff --git a/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc b/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a610a1ad42faca1a55197bec0b842e9ea3ed7e0 Binary files /dev/null and b/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc differ diff --git a/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc b/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3740f0302dec81fe4550910340042a1e07baa8a9 Binary files /dev/null and b/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/regression/HF/models/FullModelHF.py b/src/regression/HF/models/FullModelHF.py new file mode 100644 index 0000000000000000000000000000000000000000..0020a96ff0f58ec8fce29efe4fd45b960cd3f9bc --- /dev/null +++ b/src/regression/HF/models/FullModelHF.py @@ -0,0 +1,43 @@ +from transformers import PreTrainedModel +from transformers import AutoModelForMaskedLM, AutoTokenizer +from pytorch_lightning.loggers import WandbLogger + +from src.regression.PL import FullModelPL, EncoderPL, DecoderPL +from src.regression.HF.configs import FullModelConfigHF + +from 
diff --git a/src/regression/HF/configs/__init__.py b/src/regression/HF/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b8276dbb673def8517a6da81736841f94b74e75
--- /dev/null
+++ b/src/regression/HF/configs/__init__.py
@@ -0,0 +1 @@
+from .FullModelConfigHF import FullModelConfigHF
diff --git a/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc b/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a610a1ad42faca1a55197bec0b842e9ea3ed7e0
Binary files /dev/null and b/src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc differ
diff --git a/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc b/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3740f0302dec81fe4550910340042a1e07baa8a9
Binary files /dev/null and b/src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/HF/models/FullModelHF.py b/src/regression/HF/models/FullModelHF.py
new file mode 100644
index 0000000000000000000000000000000000000000..0020a96ff0f58ec8fce29efe4fd45b960cd3f9bc
--- /dev/null
+++ b/src/regression/HF/models/FullModelHF.py
@@ -0,0 +1,43 @@
+from transformers import PreTrainedModel
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+from pytorch_lightning.loggers import WandbLogger
+
+from src.regression.PL import FullModelPL, EncoderPL, DecoderPL
+from src.regression.HF.configs import FullModelConfigHF
+
+from config import DEVICE
+
+
+class FullModelHF(PreTrainedModel):
+    config_class = FullModelConfigHF
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_ckpt)
+        mlm_bert = AutoModelForMaskedLM.from_pretrained(config.bert_ckpt)
+        self.bert = mlm_bert.distilbert
+
+        encoder = EncoderPL(tokenizer=self.tokenizer, bert=self.bert).to(DEVICE)
+
+        # the decoder checkpoint is pulled from a wandb artifact
+        wandb_logger = WandbLogger(
+            project="transformers",
+            entity="sanjin_juric_fot",
+        )
+
+        artifact = wandb_logger.use_artifact(config.decoder_ckpt)
+        artifact_dir = artifact.download()
+        decoder = DecoderPL.load_from_checkpoint(artifact_dir + "/" + "model.ckpt").to(DEVICE)
+
+        self.model = FullModelPL(
+            encoder=encoder,
+            decoder=decoder,
+            layer_norm=config.layer_norm,
+            nontext_features=config.nontext_features,
+        ).to(DEVICE)
+
+    def forward(self, batch):
+        return self.model._get_loss(batch)
diff --git a/src/regression/HF/models/__init__.py b/src/regression/HF/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..72c3edfac241c28e959ea41d30e12da1a8778ebe
--- /dev/null
+++ b/src/regression/HF/models/__init__.py
@@ -0,0 +1 @@
+from .FullModelHF import FullModelHF
diff --git a/src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc b/src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54acaf00447a288bf84d67d60ee4598ed4c79a09
Binary files /dev/null and b/src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc differ
diff --git a/src/regression/HF/models/__pycache__/__init__.cpython-310.pyc b/src/regression/HF/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c27417c924b8ad2de1ba969b3bfa539ff320e6d1
Binary files /dev/null and b/src/regression/HF/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/PL/DecoderPL.py b/src/regression/PL/DecoderPL.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2489145f476682dfe22b2e36d618f64da6ba2e
--- /dev/null
+++ b/src/regression/PL/DecoderPL.py
@@ -0,0 +1,180 @@
+import pytorch_lightning as pl
+import torch
+from loguru import logger
+from torch import nn
+from torch.optim.lr_scheduler import CosineAnnealingLR
+from torchmetrics import R2Score
+
+from src.utils.neural_networks import set_layer
+from config import DEVICE
+
+torch.set_default_dtype(torch.float32)
+
+
+class DecoderPL(pl.LightningModule):
+    def __init__(
+        self,
+        input_dim: int = 774,
+        layer_norm: bool = True,
+        layer_dict: dict = {},
+        device=DEVICE,
+        T_max: int = 10,
+        start_lr: float = 5e-4,
+    ):
+        super().__init__()
+
+        # layers
+        self.linear1 = set_layer(
+            layer_dict=layer_dict,
+            name="linear1",
+            alternative=nn.Linear(in_features=input_dim, out_features=512),
+        )
+
+        self.linear2 = set_layer(
+            layer_dict=layer_dict,
+            name="linear2",
+            alternative=nn.Linear(in_features=512, out_features=264),
+        )
+
+        self.linear3 = set_layer(
+            layer_dict=layer_dict,
+            name="linear3",
+            alternative=nn.Linear(in_features=264, out_features=64),
+        )
+
+        self.linear4 = set_layer(
+            layer_dict=layer_dict,
+            name="linear4",
+            alternative=nn.Linear(in_features=64, out_features=1),
+        )
+
+        self.activation = nn.LeakyReLU(negative_slope=0.01)
+
+        # the sublayers are registered through the attribute assignments above;
+        # this plain list only fixes the order used in forward()
+        if not layer_norm:
+            self.layers = [
+                self.linear1,
+                self.activation,
+                self.linear2,
+                self.activation,
+                self.linear3,
+                self.activation,
+                self.linear4,
+            ]
+        else:
+            self.layernorm1 = nn.LayerNorm(normalized_shape=(1, self.linear1.out_features))
+            self.layernorm2 = nn.LayerNorm(normalized_shape=(1, self.linear2.out_features))
+            self.layernorm3 = nn.LayerNorm(normalized_shape=(1, self.linear3.out_features))
+            self.layers = [
+                self.linear1,
+                self.layernorm1,
+                self.activation,
+                self.linear2,
+                self.layernorm2,
+                self.activation,
+                self.linear3,
+                self.layernorm3,
+                self.activation,
+                self.linear4,
+            ]
+
+        # initialize weights
+        for layer in self.layers:
+            self.initialize_weights(layer)
+
+        # optimizer and scheduler
+        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=start_lr)
+        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)
+
+        # else
+        self.save_hyperparameters(ignore=["model"])
+        self.MSE = nn.MSELoss()
+        self.R2 = R2Score()
+
+    def initialize_weights(self, module):
+        if isinstance(module, nn.Linear):
+            logger.debug("linear weights initialized")
+            torch.nn.init.xavier_uniform_(module.weight)
+            module.bias.data.fill_(0.01)
+
+    def forward(self, x: torch.Tensor):
+        if x.dim() == 2:
+            x = x.unsqueeze(dim=1)
+
+        for layer in self.layers:
+            x = layer(x)
+
+        x = x.squeeze()
+
+        if x.dim() == 0:
+            x = x.unsqueeze(dim=0)
+
+        return x.to(torch.float32)
+
+    def training_step(self, batch):
+        loss_and_metrics = self._get_loss(batch, get_metrics=True)
+        pred = loss_and_metrics["pred"]
+        act = loss_and_metrics["act"]
+        loss = loss_and_metrics["loss"]
+
+        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)
+
+        return {"loss": loss, "pred": pred, "act": act}
+
+    def configure_optimizers(self):
+        optimizer = self.optimizer
+        scheduler = self.scheduler
+        return dict(optimizer=optimizer, lr_scheduler=scheduler)
+
+    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
+        logger.debug(scheduler)
+        if metric is None:
+            scheduler.step()
+        else:
+            scheduler.step(metric)
+
+    def validation_step(self, batch, batch_idx):
+        """used for logging metrics"""
+        loss_and_metrics = self._get_loss(batch, get_metrics=True)
+        loss = loss_and_metrics["loss"]
+
+        # Log loss and metric
+        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
+
+    def training_epoch_end(self, training_step_outputs):
+        training_step_outputs = list(training_step_outputs)
+
+        # drop the last batch, which can be smaller and would break the stack below
+        training_step_outputs.pop()
+
+        output_dict = {k: [dic[k] for dic in training_step_outputs] for k in training_step_outputs[0]}
+
+        pred = torch.stack(output_dict["pred"])
+        act = torch.stack(output_dict["act"])
+
+        loss = torch.sub(pred, act)
+        loss_sq = torch.square(loss)
+
+        TSS = float(torch.var(act, unbiased=False))
+        RSS = float(torch.mean(loss_sq))
+        R2 = 1 - RSS / TSS
+
+        self.log("train_R2", R2, prog_bar=True, logger=True)
+
+    def _get_loss(self, batch, get_metrics: bool = False):
+        """convenience function since train/valid/test steps are similar"""
+        pred = self.forward(x=batch["embedding"]).to(torch.float32)
+
+        act, loss = None, None
+
+        if "ctr" in batch.keys():
+            act = batch["ctr"].to(torch.float32)
+            loss = self.MSE(pred, act).to(torch.float32)
+
+        return {"loss": loss, "pred": pred, "act": act}
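+
+# Example forward pass (hedged sketch; 774 = 768 BERT dims + 1 aov feature + 5
+# sentiment features, matching the default input_dim):
+#
+#     decoder = DecoderPL(input_dim=774)
+#     embedding = torch.randn(8, 1, 774)  # (batch, 1, features)
+#     ctr_pred = decoder(embedding)       # tensor of shape (8,)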
diff --git a/src/regression/PL/EncoderPL.py b/src/regression/PL/EncoderPL.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad6224300126ba2de6b5717ef5a2fcefd1d4e4e5
--- /dev/null
+++ b/src/regression/PL/EncoderPL.py
@@ -0,0 +1,116 @@
+import pytorch_lightning as pl
+import torch
+from transformers import BertModel, BertTokenizer, DistilBertModel, AutoModel, AutoTokenizer
+
+from src.utils import add_emoji_tokens, add_new_line_token, vectorise_dict
+from config import DEVICE
+
+torch.set_default_dtype(torch.float32)
+
+
+class EncoderPL(pl.LightningModule):
+    def __init__(
+        self,
+        model_name: str = "bert-base-uncased",
+        tokenizer: AutoTokenizer | None = None,
+        bert: AutoModel | None = None,
+        cls: bool = False,
+        device=DEVICE,
+    ):
+        super().__init__()
+
+        self._device = device
+        self.cls = cls
+        self.model_name = model_name
+
+        # layers
+        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained(model_name)
+        self.bert = bert if bert is not None else BertModel.from_pretrained(model_name)
+
+        if tokenizer is None:
+            self.tokenizer = add_emoji_tokens(self.tokenizer)
+            self.tokenizer = add_new_line_token(self.tokenizer)
+            self.bert.resize_token_embeddings(len(self.tokenizer))
+
+        # optimizer and scheduler
+        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-3)
+
+        # config tweaking
+        self.bert.config.torch_dtype = "float32"
+
+    def forward(self, text: str):
+        # run text through bert and squash the output to get embeddings
+        encoded = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(self._device)
+
+        if isinstance(self.bert, DistilBertModel):
+            encoded.pop("token_type_ids")
+
+        bert_output = self.bert(**encoded)
+
+        if self.cls:
+            if hasattr(bert_output, "pooler_output") and bert_output.pooler_output is not None:
+                embedding = bert_output.pooler_output.unsqueeze(dim=1)
+            else:
+                embedding = bert_output.last_hidden_state[0, 0, :].unsqueeze(dim=0).unsqueeze(dim=0)
+        else:
+            last_hidden_state = bert_output.last_hidden_state
+
+            if last_hidden_state.dim() == 2:
+                last_hidden_state = last_hidden_state.unsqueeze(dim=0)
+
+            # mask-weighted sum of token embeddings over the 512 padded
+            # positions (the "mean" embedding is not length-normalised)
+            embedding = torch.matmul(
+                encoded["attention_mask"].type(torch.float32).view(-1, 1, 512),
+                last_hidden_state,
+            )
+
+        return embedding
+
+    def configure_optimizers(self):
+        return self.optimizer
+
+
+def get_bert_embedding(
+    text: str, as_list: bool = True, cls: bool = False, device=DEVICE, layer_dict: dict = {}
+) -> list:
+    # layer_dict can carry a pretrained {"tokenizer": ..., "bert": ...} pair
+    encoder = EncoderPL(
+        cls=cls,
+        tokenizer=layer_dict.get("tokenizer"),
+        bert=layer_dict.get("bert"),
+        device=device,
+    ).to(device)
+    embedding = encoder.forward(text)
+
+    if as_list:
+        embedding = embedding.tolist()[0][0]
+
+    return embedding
+
+
+def get_concat_embedding(
+    text: str | None = None,
+    bert_embedding: list = [],
+    other_features: dict = {},
+    cls: bool = False,
+    device=DEVICE,
+    layer_dict: dict = {},
+) -> list:
+    if not len(bert_embedding):
+        if text is None:
+            raise ValueError("both text and embedding are empty!")
+        bert_embedding = get_bert_embedding(text=text, cls=cls, device=device, layer_dict=layer_dict)
+
+    other_features = vectorise_dict(other_features, as_list=True)
+
+    concat_vec = bert_embedding + other_features
+
+    return concat_vec
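+
+# Example usage (hedged sketch; downloads bert-base-uncased on first call):
+#
+#     vec = get_bert_embedding("Free shipping on all orders!", cls=False)
+#     len(vec)  # 768
+#     full = get_concat_embedding(
+#         bert_embedding=vec,
+#         other_features={"aov": 42.0, "compound": 0.6},
+#     )
+#     len(full)  # 770: extra features are appended in key-sorted order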
diff --git a/src/regression/PL/FullModelPL.py b/src/regression/PL/FullModelPL.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc8af6498b8498cb87750b9016d921be3f031cd
--- /dev/null
+++ b/src/regression/PL/FullModelPL.py
@@ -0,0 +1,166 @@
+import pytorch_lightning as pl
+import torch
+from loguru import logger
+from torch import nn
+from torch.optim.lr_scheduler import CosineAnnealingLR
+from torchmetrics import R2Score
+
+from src.utils import get_sentiment, vectorise_dict
+from config import DEVICE
+
+from .DecoderPL import DecoderPL
+from .EncoderPL import EncoderPL
+
+
+torch.set_default_dtype(torch.float32)
+
+
+class FullModelPL(pl.LightningModule):
+    def __init__(
+        self,
+        model_name: str = "bert-base-uncased",
+        nontext_features: list[str] = ["aov"],
+        encoder: EncoderPL | None = None,
+        decoder: DecoderPL | None = None,
+        layer_norm: bool = True,
+        device=DEVICE,
+        T_max: int = 10,
+    ):
+        super().__init__()
+
+        # layers
+        self.encoder = (
+            encoder.to(self.device)
+            if encoder is not None
+            else EncoderPL(model_name=model_name, device=device).to(self.device)
+        )
+        # 768 BERT dims + the nontext features + 5 sentiment features
+        self.decoder = (
+            decoder.to(self.device)
+            if decoder is not None
+            else DecoderPL(
+                input_dim=768 + len(nontext_features) + 5,
+                layer_norm=layer_norm,
+                device=device,
+            ).to(self.device)
+        )
+
+        # else
+        self.MSE = nn.MSELoss()
+        self.R2 = R2Score()
+
+        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=3e-4)
+        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)
+
+    def forward(self, input_dict: dict):
+        input_dict = input_dict.copy()
+        text = input_dict.pop("text")
+
+        logger.debug(f"text: {text}")
+
+        if "ctr" in input_dict.keys():
+            input_dict.pop("ctr")
+
+        # encode
+        sentence_embedding = self.encoder.forward(text=text)
+
+        # sentiment
+        sentiment = get_sentiment_for_list_of_texts(text)
+        input_dict = input_dict | sentiment
+
+        input_dict = {k: v.to(self.device) for k, v in input_dict.items()}
+
+        # concat nontext features to embedding
+        nontext_vec = vectorise_dict(input_dict)
+        nontext_tensor = torch.stack(nontext_vec).T.unsqueeze(1).to(torch.float32)
+        x = torch.cat((sentence_embedding, nontext_tensor), 2)
+
+        # decode
+        result = self.decoder.forward(x)
+        return result
+
+    def training_step(self, batch):
+        loss_and_metrics = self._get_loss(batch, get_metrics=True)
+        pred = loss_and_metrics["pred"]
+        act = loss_and_metrics["act"]
+        loss = loss_and_metrics["loss"]
+
+        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)
+
+        return {"loss": loss, "pred": pred, "act": act}
+
+    def configure_optimizers(self):
+        # freeze BERT so only the decoder (and other non-bert params) train
+        for name, param in self.named_parameters():
+            if "bert" in name:
+                param.requires_grad = False
+
+        optimizer = self.optimizer
+        scheduler = self.scheduler
+        return dict(optimizer=optimizer, lr_scheduler=scheduler)
+
+    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
+        logger.debug(scheduler)
+        if metric is None:
+            scheduler.step()
+        else:
+            scheduler.step(metric)
+
+    def validation_step(self, batch, batch_idx):
+        """used for logging metrics"""
+        loss_and_metrics = self._get_loss(batch, get_metrics=True)
+        loss = loss_and_metrics["loss"]
+
+        # Log loss and metric
+        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
+
+    def training_epoch_end(self, training_step_outputs):
+        training_step_outputs = list(training_step_outputs)
+
+        # drop the last batch, which can be smaller and would break the stack below
+        training_step_outputs.pop()
+
+        output_dict = {k: [dic[k] for dic in training_step_outputs] for k in training_step_outputs[0]}
+
+        pred = torch.stack(output_dict["pred"])
+        act = torch.stack(output_dict["act"])
+
+        loss = torch.sub(pred, act)
+        loss_sq = torch.square(loss)
+
+        TSS = float(torch.var(act, unbiased=False))
+        RSS = float(torch.mean(loss_sq))
+        R2 = 1 - RSS / TSS
+
+        self.log("train_R2", R2, prog_bar=True, logger=True)
+
+    def _get_loss(self, batch, get_metrics: bool = False):
+        """convenience function since train/valid/test steps are similar"""
+        pred = self.forward(input_dict=batch).to(torch.float32)
+
+        act, loss = None, None
+
+        if "ctr" in batch.keys():
+            act = batch["ctr"].to(torch.float32).to(self.device)
+            loss = self.MSE(pred, act).to(torch.float32)
+
+        return {"loss": loss, "pred": pred, "act": act}
+
+
+def get_sentiment_for_list_of_texts(texts: list[str]) -> dict:
+    ld = [get_sentiment(text) for text in texts]
+    v = {k: torch.Tensor([dic[k] for dic in ld]) for k in ld[0]}
+    return v
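+
+# Example batch (hedged sketch; mirrors what FullModelDatasetTorch yields
+# after default collation):
+#
+#     model = FullModelPL(nontext_features=["aov"])
+#     batch = {
+#         "text": ["Shop the summer sale!", "Free shipping today."],
+#         "aov": torch.tensor([40.0, 55.0]),
+#         "ctr": torch.tensor([0.012, 0.034]),
+#     }
+#     out = model._get_loss(batch)  # {"loss": ..., "pred": ..., "act": ...}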
diff --git a/src/regression/PL/__init__.py b/src/regression/PL/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..90d5707048821acd3f38f6831b930eeb2874ad12
--- /dev/null
+++ b/src/regression/PL/__init__.py
@@ -0,0 +1,3 @@
+from .FullModelPL import FullModelPL
+from .DecoderPL import DecoderPL
+from .EncoderPL import EncoderPL, get_concat_embedding, get_bert_embedding
diff --git a/src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc b/src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a183c7c4c85c6968d096c1ee26050947d0cd4569
Binary files /dev/null and b/src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc differ
diff --git a/src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc b/src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d0cbb2a63a55ace1bbaa8b7fd6667fb7535fd23
Binary files /dev/null and b/src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc differ
diff --git a/src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc b/src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c21097dab7c7e60105c3f4188d2eb062ca126bc
Binary files /dev/null and b/src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc differ
diff --git a/src/regression/PL/__pycache__/__init__.cpython-310.pyc b/src/regression/PL/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f260d7fa0e0a687b22283d6e430018890ff6d4e3
Binary files /dev/null and b/src/regression/PL/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/__init__.py b/src/regression/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..54dabba65b1d71b006c081b5efa40ca0df7389bc
--- /dev/null
+++ b/src/regression/__init__.py
@@ -0,0 +1,3 @@
+from .datasets import *
+from .training_scripts import *
+from .PL import *
diff --git a/src/regression/__pycache__/__init__.cpython-310.pyc b/src/regression/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1ab886b2e9ba0def7886fc77e0355dcd903b792
Binary files /dev/null and b/src/regression/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/datasets/DecoderDatasetTorch.py b/src/regression/datasets/DecoderDatasetTorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd8190b9a7db8fda5199359008f262d1c8f0d27
--- /dev/null
+++ b/src/regression/datasets/DecoderDatasetTorch.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import Dataset
+
+
+class DecoderDatasetTorch(Dataset):
+    """Train dataset."""
+
+    def __init__(self, df: pd.DataFrame, embedding_column: str = "my_full_mean_embedding"):
+        """
+
+        Args:
+            df (pd.DataFrame): dataframe with ads
+            embedding_column (str, optional): Column whose values to output in __getitem__. Defaults to "my_full_mean_embedding".
+        """
+        self.df = df
+        self.embedding_column = embedding_column
+
+        df[[embedding_column, "ctr"]] = df[[embedding_column, "ctr"]].applymap(lambda x: np.float32(x))
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        if torch.is_tensor(idx):
+            idx = idx.tolist()
+
+        embedding = self.df.loc[idx, self.embedding_column]
+        ctr = self.df.loc[idx, "ctr"]
+
+        return {"embedding": embedding, "ctr": ctr}
diff --git a/src/regression/datasets/FullModelDatasetTorch.py b/src/regression/datasets/FullModelDatasetTorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc57f749cdc3c09719ba2d96baba04c148765673
--- /dev/null
+++ b/src/regression/datasets/FullModelDatasetTorch.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pandas as pd
+import torch
+from torch.utils.data import DataLoader, Dataset
+
+
+class FullModelDatasetTorch(Dataset):
+    """Train dataset."""
+
+    def __init__(self, df: pd.DataFrame, nontext_features: list[str] = ["aov"]):
+        """
+
+        Args:
+            df (pd.DataFrame): train dataframe
+            nontext_features (list[str]): features to use in training except for text embeddings
+        """
+        self.df = df
+        self.nontext_features = nontext_features
+
+        df[nontext_features + ["ctr"]] = df[nontext_features + ["ctr"]].astype(np.float32)
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, idx):
+        if torch.is_tensor(idx):
+            idx = idx.tolist()
+
+        text = self.df.loc[idx, "text_clean"]
+        ctr = self.df.loc[idx, "ctr"]
+
+        nontext_features = {feature: self.df.loc[idx, feature] for feature in self.nontext_features}
+
+        return {"text": text, "ctr": ctr} | nontext_features
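+
+# Example DataLoader (hedged sketch; assumes a trimmed split with "text_clean",
+# "aov" and "ctr" columns, e.g. from RegressionDataset):
+#
+#     dataset = FullModelDatasetTorch(df=train_df, nontext_features=["aov"])
+#     loader = DataLoader(dataset, batch_size=8, shuffle=True)
+#     batch = next(iter(loader))  # {"text": [...], "ctr": tensor, "aov": tensor}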
diff --git a/src/regression/datasets/RegressionDataset.py b/src/regression/datasets/RegressionDataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..34da46c83ade1546c1e700e36f82f96c81b3a560
--- /dev/null
+++ b/src/regression/datasets/RegressionDataset.py
@@ -0,0 +1,313 @@
+import os
+
+import numpy as np
+import pandas as pd
+from dotenv import load_dotenv
+from loguru import logger
+from sklearn.model_selection import train_test_split
+from time import sleep
+from transformers import BertModel, AutoTokenizer
+from tqdm import tqdm
+import torch
+from config import DEVICE
+
+from src.utils.text_functions import clean_text
+from src.utils import (
+    get_sentiment,
+    detect_language,
+)
+
+from src.regression.PL import (
+    get_bert_embedding,
+    get_concat_embedding,
+)
+
+from src.utils.s3 import read_csv, save_csv
+
+
+load_dotenv()
+
+
+class RegressionDataset:
+    def __init__(
+        self,
+        s3: bool = False,
+        bucket: str = "lebesgue-data-science",
+        folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data",
+        s3_folder: str = "transformers/data",
+    ):
+        self.s3 = s3
+        self.bucket = bucket
+
+        if self.s3:
+            self.folder = s3_folder
+        else:
+            self.folder = folder
+
+        self.original_path = f"{self.folder}/original.csv"
+        self.untrimmed_path = f"{self.folder}/untrimmed.csv"
+        self.normalized_path = f"{self.folder}/normalized.csv"
+        self.trimmed_path = f"{self.folder}/trimmed.csv"
+
+        self.train_path = f"{self.folder}/train.csv"
+        self.val_path = f"{self.folder}/val.csv"
+        self.test_path = f"{self.folder}/test.csv"
+
+        self.text_types = ["primary", "title", "description"]
+
+        self.col_func_dict = {
+            "number": len,
+            "len": lambda texts: np.mean([len(text) for text in texts]),
+        }
+
+    @property
+    def original(self) -> pd.DataFrame:
+        df = read_csv(path=self.original_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def untrimmed(self) -> pd.DataFrame:
+        df = read_csv(path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def normalized(self) -> pd.DataFrame:
+        df = read_csv(path=self.normalized_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def trimmed(self) -> pd.DataFrame:
+        df = read_csv(path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def train(self) -> pd.DataFrame:
+        df = read_csv(path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def val(self) -> pd.DataFrame:
+        df = read_csv(path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    @property
+    def test(self) -> pd.DataFrame:
+        df = read_csv(path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    def normalize_untrimmed(self, group_cols: list[str] = ["text", "target", "shop_id"]) -> pd.DataFrame:
+        df = self.untrimmed
+        grouped = df.groupby(group_cols)
+
+        filters_df = grouped.agg({"impr": "sum", "spend": "sum"}).reset_index()
+        ctr = grouped.apply(lambda df: df.link_clicks.sum() / df.impr.sum())
+        ctr_df = pd.DataFrame(ctr, columns=["ctr"]).reset_index()
+        normalised = filters_df.merge(ctr_df, on=group_cols)
+
+        merged = df.merge(normalised, on=group_cols, validate="m:1", suffixes=["___", None])
+        merged.drop([col for col in merged.columns if "___" in col], inplace=True, axis=1)
+        final = merged.drop_duplicates(group_cols)
+        save_csv(
+            df=final,
+            path=self.normalized_path,
+            s3=self.s3,
+            s3_args={"bucket": self.bucket},
+        )
+        return final
+
+    def expand_untrimmed(self, update_existing_columns: bool = False) -> pd.DataFrame:
+        df = self.untrimmed
+
+        new_col_func_dict = self.col_func_dict
+
+        if not update_existing_columns:
+            new_col_func_dict = {
+                col: fun for col, fun in new_col_func_dict.items() if "primary_" + col not in df.columns
+            }
+
+        # get extra columns
+        for col, func in new_col_func_dict.items():
+            logger.debug(col)
+            for text_type in self.text_types:
+                df[f"{text_type}_{col}"] = df[text_type].apply(func)
+
+        df["has_text"] = df.apply(
+            lambda df: bool(df.primary_number + df.title_number + df.description_number),
+            axis=1,
+        )
+
+        # text columns
+        df = df.apply(_get_text, axis=1)
+        df = df.apply(_get_concatenated_text, axis=1)
+
+        df["language"] = df.text.apply(detect_language)
+        df = df[df.language == "en"]
+        df = df[df.ctr.notna()]
+
+        save_csv(df=df, path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
+
+        return df
+
+    def trim(self, min_impr: int = 900, min_spend: float = 90) -> pd.DataFrame:
+        df = self.normalized
+        df = df[(df.impr >= min_impr) & (df.spend >= min_spend)]
+        df = df[df.target == "acquisition"]
+        df = df[df.aov.notna()]
+
+        df = df[df.has_text]
+
+        save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
+
+        return df
+
+    def expand_trimmed(
+        self, bert: BertModel = None, tokenizer: AutoTokenizer = None, add_bert_embeddings_bool: bool = False
+    ) -> pd.DataFrame:
+        df = self.trimmed
+
+        # clean text
+        for col in ["text", "concat_text"]:
+            df[f"{col}_clean"] = df[col].apply(clean_text)
+
+        df["text_clean_sentiment"] = df.text_clean.apply(get_sentiment)
+
+        if add_bert_embeddings_bool:
+            if tokenizer is None or bert is None:
+                raise ValueError("tokenizer or bert is None")
+            layer_dict = {"bert": bert, "tokenizer": tokenizer}
+            df = add_bert_embeddings(df=df, save_path=self.trimmed_path, layer_dict=layer_dict)
+
+            df = df.apply(add_concat_embeddings, axis=1)
+
+        save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return df
+
+    def split_into_train_and_test(
+        self,
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        df = self.trimmed
+        train, test = train_test_split(df, train_size=0.9, random_state=42)
+        train, val = train_test_split(train, train_size=0.85, random_state=42)
+        save_csv(df=train, path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        save_csv(df=val, path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        save_csv(df=test, path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
+        return train, val, test
+
+    def expand_normalise_trim_split(
+        self,
+        update_existing_columns: bool = False,
+        group_cols=["text", "target", "shop_id"],
+    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        self.expand_untrimmed(update_existing_columns=update_existing_columns)
+        self.normalize_untrimmed(group_cols=group_cols)
+        self.trim()
+        self.expand_trimmed()
+        train, val, test = self.split_into_train_and_test()
+        return train, val, test
+
+
+def _get_text(ad: pd.Series) -> pd.Series:
+    if ad.primary_number > 0:
+        ad["text"] = ad.primary[0]
+
+    elif ad.description_number > 0:
+        ad["text"] = ad.description[0]
+
+    elif ad.title_number > 0:
+        ad["text"] = ad.title[0]
+
+    else:
+        ad["text"] = None
+
+    return ad
+
+
+def _get_concatenated_text(ad: pd.Series) -> pd.Series:
+    concat_text = ""
+
+    if ad.primary_number > 0:
+        concat_text = concat_text + ad.primary[0]
+
+    if ad.description_number > 0:
+        concat_text = concat_text + ad.description[0]
+
+    if ad.title_number > 0:
+        concat_text = concat_text + ad.title[0]
+
+    ad["concat_text"] = concat_text
+
+    return ad
+
+
+regression_dataset = RegressionDataset()
+
+regression_dataset_s3 = RegressionDataset(s3=True)
+
+
+def add_bert_embeddings(df: pd.DataFrame, save_path: str, layer_dict: dict = {}, device=DEVICE) -> pd.DataFrame:
+    if device == torch.device("cuda"):
+        df["my_bert_cls_embedding"] = df.text_clean.apply(
+            lambda text: get_bert_embedding(text=text, cls=True, layer_dict=layer_dict)
+        )
+        df["my_bert_mean_embedding"] = df.text_clean.apply(
+            lambda text: get_bert_embedding(text=text, cls=False, layer_dict=layer_dict)
+        )
+        return df
+
+    if "my_bert_cls_embedding" not in df.columns:
+        df["my_bert_cls_embedding"] = None
+
+    if "my_bert_mean_embedding" not in df.columns:
+        df["my_bert_mean_embedding"] = None
+
+    counter = 0
+
+    df["my_bert_cls_embedding"] = df["my_bert_cls_embedding"].astype(object)
+    df["my_bert_mean_embedding"] = df["my_bert_mean_embedding"].astype(object)
+
+    # on CPU, fill only the rows that are still missing an embedding
+    for i in tqdm(range(len(df))):
+        if df.at[i, "my_bert_cls_embedding"] is None:
+            df.at[i, "my_bert_cls_embedding"] = get_bert_embedding(
+                text=df.at[i, "text_clean"], cls=True, layer_dict=layer_dict
+            )
+            counter = counter + 1
+            sleep(0.5)
+
+        if df.at[i, "my_bert_mean_embedding"] is None:
+            df.at[i, "my_bert_mean_embedding"] = get_bert_embedding(
+                text=df.at[i, "text_clean"], cls=False, layer_dict=layer_dict
+            )
+            counter = counter + 1
+            sleep(0.5)
+
+        # checkpoint to disk roughly every 50 updates
+        if counter % 50 in [0, 1]:
+            df.to_csv(save_path, index=False)
+
+    df.to_csv(save_path, index=False)
+
+    return df
+
+
+def add_concat_embeddings(series: pd.Series) -> pd.Series:
+    other_features = {"aov": series["aov"]} | series["text_clean_sentiment"]
+
+    for emb_type in ["cls", "mean"]:
+        bert_embedding = series[f"my_bert_{emb_type}_embedding"]
+        series[f"my_full_{emb_type}_embedding"] = get_concat_embedding(
+            bert_embedding=bert_embedding, other_features=other_features
+        )
+
+    return series
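+
+# Example pipeline (hedged sketch; assumes untrimmed.csv exists under the data
+# folder with populated impr/spend/aov columns):
+#
+#     train, val, test = regression_dataset.expand_normalise_trim_split(
+#         update_existing_columns=False,
+#         group_cols=["text", "target", "shop_id"],
+#     )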
diff --git a/src/regression/datasets/__init__.py b/src/regression/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0159851efbb827d29c6f6f9dc49f90157436813d
--- /dev/null
+++ b/src/regression/datasets/__init__.py
@@ -0,0 +1,3 @@
+from .FullModelDatasetTorch import FullModelDatasetTorch
+from .DecoderDatasetTorch import DecoderDatasetTorch
+from .RegressionDataset import RegressionDataset, regression_dataset, regression_dataset_s3
diff --git a/src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc b/src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5da18344d472e79b3a3c30b7b037ec7fb046e3ad
Binary files /dev/null and b/src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc differ
diff --git a/src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc b/src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30d2303273747f3441f262c8567e4afb544805e5
Binary files /dev/null and b/src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc differ
diff --git a/src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc b/src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..618c516e3c2d0c2b37c025012bda9986777a8f9a
Binary files /dev/null and b/src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc differ
diff --git a/src/regression/datasets/__pycache__/__init__.cpython-310.pyc b/src/regression/datasets/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a20b9528b2d8f66847ca74b80488150cb8adb773
Binary files /dev/null and b/src/regression/datasets/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/datasets/__pycache__/dataset.cpython-310.pyc b/src/regression/datasets/__pycache__/dataset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af717ed9b9a3ebde5dbd91ff9f7ddb5a9f36f5ed
Binary files /dev/null and b/src/regression/datasets/__pycache__/dataset.cpython-310.pyc differ
diff --git a/src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc b/src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aeecd39894d03be0142bc3b18fb4ecd7ffb4b3b1
Binary files /dev/null and b/src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/__init__.py b/src/regression/training_scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caa140e6e58156db45cdb83f24ff03744dd3800
--- /dev/null
+++ b/src/regression/training_scripts/__init__.py
@@ -0,0 +1,2 @@
+from .train_full_model_PL import train_full_model_PL
+from .train_decoder_PL import train_decoder_PL
diff --git a/src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc b/src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb93e8879961423d454ee84a895d5bbd08aa5cc5
Binary files /dev/null and b/src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc b/src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1b3711855b0a78288339c397695d81d53b729a4
Binary files /dev/null and b/src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc b/src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..794dafeeb7cc98011562ba7bcd4526bd649519a3
Binary files /dev/null and b/src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/__pycache__/train_decoder_PL.cpython-310.pyc b/src/regression/training_scripts/__pycache__/train_decoder_PL.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcbd6bd910c67040d18375e8e4e6a4ee2553dadf
Binary files /dev/null and b/src/regression/training_scripts/__pycache__/train_decoder_PL.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/__pycache__/train_full_model_PL.cpython-310.pyc b/src/regression/training_scripts/__pycache__/train_full_model_PL.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8d62a0ad625e44b3a00d4429c5df76a52f46fe8
Binary files /dev/null and b/src/regression/training_scripts/__pycache__/train_full_model_PL.cpython-310.pyc differ
diff --git a/src/regression/training_scripts/train_decoder_PL.py b/src/regression/training_scripts/train_decoder_PL.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dc8cbdcc84e56379fc3cdc12a2e20670ae7aa24
--- /dev/null
+++ b/src/regression/training_scripts/train_decoder_PL.py
@@ -0,0 +1,97 @@
+from torch.utils.data import DataLoader
+from pytorch_lightning.loggers import WandbLogger
+from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
+from pytorch_lightning import Trainer
+
+import pandas as pd
+from loguru import logger
+from dotenv import load_dotenv
+import torch
+
+from src.regression.datasets import DecoderDatasetTorch
+from src.regression.PL import *
+
+load_dotenv()
+
+
+def train_decoder_PL(
+    train: pd.DataFrame,
+    test: pd.DataFrame,
+    artifact_path: str | None = None,
+    resume: bool | str = "must",
+    run_id: str | None = None,
+    run_name: str = "sanity",
+    model_class=DecoderPL,
+    max_epochs: int = 2,
+    layer_norm: bool = True,
+    embedding_column: str = "my_full_mean_embedding",
+    device: str = "mps",
+    *args,
+    **kwargs,
+):
+    torch.set_default_dtype(torch.float32)
+
+    train = train[train.aov.notna()].reset_index(drop=True)
+    test = test[test.aov.notna()].reset_index(drop=True)
+
+    # "sanity" runs train on a tiny slice as a quick smoke test
+    if run_name == "sanity":
+        resume = False
+        run_id = None
+        max_epochs = 2
+        train = train.loc[0:16, :]
+        test = test.loc[0:16]
+
+    # initializing dataset, dataloader and nn.module model
+    train_dataset = DecoderDatasetTorch(df=train, embedding_column=embedding_column)
+    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=8)
+
+    test_dataset = DecoderDatasetTorch(df=test, embedding_column=embedding_column)
+    test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=8)
+
+    wandb_logger = WandbLogger(
+        project="transformers",
+        entity="sanjin_juric_fot",
+        log_model=True,
+        reinit=True,
+        resume=resume,
+        id=run_id,
+        name=run_name,
+    )
+
+    # here lightning comes into play
+    if artifact_path is not None:
+        artifact = wandb_logger.use_artifact(artifact_path)
+        artifact_dir = artifact.download()
+        litmodel = model_class.load_from_checkpoint(artifact_dir + "/" + "model.ckpt").to(device)
+
+        logger.debug("loaded from checkpoint")
+
+        torch.multiprocessing.set_sharing_strategy("file_system")
+
+    else:
+        litmodel = model_class(
+            *args, input_dim=len(train.at[0, embedding_column]), layer_norm=layer_norm, **kwargs
+        ).to(device)
+
+    checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min")
+    lr_monitor = LearningRateMonitor(logging_interval="epoch")
+    trainer = Trainer(
+        accelerator=str(device),
+        devices=1,
+        logger=wandb_logger,
+        log_every_n_steps=2,
+        max_epochs=max_epochs,
+        callbacks=[checkpoint_callback, lr_monitor],
+    )
+
+    logger.debug("training...")
+    trainer.fit(
+        model=litmodel,
+        train_dataloaders=train_dataloader,
+        val_dataloaders=test_dataloader,
+    )
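+
+# Example call (hedged sketch; expects frames carrying the
+# "my_full_mean_embedding" column produced by RegressionDataset):
+#
+#     from src.regression.datasets import regression_dataset
+#
+#     train_decoder_PL(
+#         train=regression_dataset.train,
+#         test=regression_dataset.val,
+#         run_name="sanity",  # sanity mode trims to 17 rows and 2 epochs
+#         embedding_column="my_full_mean_embedding",
+#         device="mps",
+#     )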
dataset, dataloader and nn.module model + train_dataset = FullModelDatasetTorch(df=train, nontext_features=nontext_features) + train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=8) + + test_dataset = FullModelDatasetTorch(df=test, nontext_features=nontext_features) + test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=8) + + wandb_logger = WandbLogger( + project="transformers", + entity="sanjin_juric_fot", + log_model=True, + reinit=True, + resume=resume, + id=run_id, + name=run_name, + ) + + # here lightning comes into play + if artifact_path is not None: + artifact = wandb_logger.use_artifact(artifact_path) + artifact_dir = artifact.download() + litmodel = model_class.load_from_checkpoint(artifact_dir + "/" + "model.ckpt").to("mps") + + logger.debug("logged from checkpoint") + + # for name, layer in litmodel.named_modules(): + # if isinstance(layer, nn.Linear) and name == "linear2": + # break + + # layer_dict = {"linear2": layer} + + # litmodel = LitAdModelLHS( + # nontext_features=nontext_features, layer_dict=layer_dict + # ) + + else: + litmodel = model_class( + model_name="bert-base-uncased", + nontext_features=nontext_features, + layer_norm=layer_norm, + ).to("mps") + + checkpoint_callback = ModelCheckpoint(monitor="val_loss", mode="min") + lr_monitor = LearningRateMonitor(logging_interval="epoch") + trainer = Trainer( + accelerator="mps", + devices=1, + logger=wandb_logger, + log_every_n_steps=2, + max_epochs=max_epochs, + callbacks=[checkpoint_callback, lr_monitor], + ) + # trainer = Trainer(logger=wandb_logger, log_every_n_steps=2, max_epochs=2, callbacks=[checkpoint_callback]) + logger.debug("training...") + trainer.fit( + model=litmodel, + train_dataloaders=train_dataloader, + val_dataloaders=test_dataloader, + ) diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c69c1156ce0180db45c7ce323addb61ad6322855 --- /dev/null +++ b/src/utils/__init__.py @@ -0,0 +1,7 @@ +from .common import element_to_list, vectorise_dict +from .decorators import print_execution_time +from .csv import read_csv_and_eval +from .nlp import detect_language, get_sentiment +from .bert import add_emoji_tokens, add_new_line_token, batch_iterator +from .hugging_face import user_id +from .s3 import * diff --git a/src/utils/__pycache__/__init__.cpython-310.pyc b/src/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c4005a350b90088005cd98057b6b18f7fcbaaad Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/utils/__pycache__/bert.cpython-310.pyc b/src/utils/__pycache__/bert.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..994e987ab9a41478127b774543765eb02897585c Binary files /dev/null and b/src/utils/__pycache__/bert.cpython-310.pyc differ diff --git a/src/utils/__pycache__/common.cpython-310.pyc b/src/utils/__pycache__/common.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99c7af8d7b3e571e241e6426d1f4e2104a43f4b1 Binary files /dev/null and b/src/utils/__pycache__/common.cpython-310.pyc differ diff --git a/src/utils/__pycache__/csv.cpython-310.pyc b/src/utils/__pycache__/csv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae041cd9170abb3cd0c12c293e58f54e6eda29e8 Binary files /dev/null and b/src/utils/__pycache__/csv.cpython-310.pyc differ diff --git 
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c69c1156ce0180db45c7ce323addb61ad6322855
--- /dev/null
+++ b/src/utils/__init__.py
@@ -0,0 +1,7 @@
+from .common import element_to_list, vectorise_dict
+from .decorators import print_execution_time
+from .csv import read_csv_and_eval
+from .nlp import detect_language, get_sentiment
+from .bert import add_emoji_tokens, add_new_line_token, batch_iterator
+from .hugging_face import user_id
+from .s3 import *
diff --git a/src/utils/bert.py b/src/utils/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..3425ee303e0e1f5c86b59511199123736e48f3f0
--- /dev/null
+++ b/src/utils/bert.py
@@ -0,0 +1,20 @@
+from transformers import BertTokenizerFast
+import emoji
+import pandas as pd
+from tqdm import tqdm
+
+
+def add_emoji_tokens(tokenizer: BertTokenizerFast) -> BertTokenizerFast:
+    # register every emoji as its own token so ad copy keeps them intact
+    all_emojis = list(emoji.EMOJI_DATA.keys())
+    tokenizer.add_tokens(all_emojis)
+    return tokenizer
+
+
+def add_new_line_token(tokenizer: BertTokenizerFast) -> BertTokenizerFast:
+    tokenizer.add_special_tokens({"additional_special_tokens": ["\n"]})
+    return tokenizer
+
+
+def batch_iterator(df: pd.DataFrame, batch_size: int = 10000, col: str = "text"):
+    # iloc (end-exclusive) rather than loc (end-inclusive), so consecutive
+    # batches do not overlap by one row
+    for i in tqdm(range(0, len(df), batch_size)):
+        yield df[col].iloc[i : i + batch_size].tolist()
diff --git a/src/utils/common.py b/src/utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..9509edbe16647f833d6d23039fe8a21a98515103
--- /dev/null
+++ b/src/utils/common.py
@@ -0,0 +1,13 @@
+from typing import Any
+
+
+def element_to_list(a: Any) -> list:
+    if not isinstance(a, list):
+        return [a]
+    return a
+
+
+def vectorise_dict(dt: dict, as_list: bool = False) -> tuple | list:
+    # values ordered by sorted key, so the output order is deterministic
+    _, values = zip(*sorted(dt.items()))
+    return list(values) if as_list else values
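A sketch of what `batch_iterator` is for: streaming the ad corpus into `train_new_from_iterator` (the standard fast-tokenizer retraining API) before the emoji and newline tokens are added. The corpus contents here are made up.

```python
# Illustrative tokenizer refit using batch_iterator; data is synthetic.
import pandas as pd
from transformers import BertTokenizerFast

from src.utils import add_emoji_tokens, add_new_line_token, batch_iterator

df = pd.DataFrame({"text": ["Shop the summer sale ☀️", "Free shipping today"] * 5000})

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(df), vocab_size=30_522)
new_tokenizer = add_emoji_tokens(new_tokenizer)
new_tokenizer = add_new_line_token(new_tokenizer)
print(new_tokenizer.tokenize("Free shipping ☀️\ntoday"))
```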
diff --git a/src/utils/csv.py b/src/utils/csv.py
new file mode 100644
index 0000000000000000000000000000000000000000..8569420c51d5642da7da246f3b597d892c0213ac
--- /dev/null
+++ b/src/utils/csv.py
@@ -0,0 +1,21 @@
+import ast
+
+import pandas as pd
+
+
+def read_csv_and_eval(path: str, *args, **kwargs) -> pd.DataFrame:
+    df = pd.read_csv(path, *args, **kwargs)
+    df = df.applymap(eval_but_leave_string_if_you_cant)
+    return df
+
+
+def eval_but_leave_string_if_you_cant(text):
+    # literal_eval restores lists/dicts/numbers serialised into CSV cells;
+    # anything it cannot parse is returned unchanged
+    try:
+        out = ast.literal_eval(text)
+    except (ValueError, SyntaxError, TypeError):
+        out = text
+    return out
diff --git a/src/utils/decorators.py b/src/utils/decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..b62eb58af51503d5740c11a96f2473da4c482988
--- /dev/null
+++ b/src/utils/decorators.py
@@ -0,0 +1,15 @@
+import time
+from datetime import timedelta
+
+from loguru import logger
+
+
+def print_execution_time(func):
+    def wrapper_time(*args, **kwargs):
+        start = time.perf_counter()
+        out = func(*args, **kwargs)
+        end = time.perf_counter()
+        logger.debug(f"{func.__name__}: {timedelta(seconds=end - start)}")
+        return out
+
+    return wrapper_time
diff --git a/src/utils/enum.py b/src/utils/enum.py
new file mode 100644
index 0000000000000000000000000000000000000000..a87d477fbdaf1be26d45f94f3ba28519a1d614b6
--- /dev/null
+++ b/src/utils/enum.py
@@ -0,0 +1,6 @@
+from enum import EnumMeta
+from typing import Any
+
+
+def get_enum_values(_enum: EnumMeta) -> list[Any]:
+    return [item.value for item in _enum]
diff --git a/src/utils/hugging_face.py b/src/utils/hugging_face.py
new file mode 100644
index 0000000000000000000000000000000000000000..45dc9d37d666b6d6a02bb5b5b01e6db0695feabb
--- /dev/null
+++ b/src/utils/hugging_face.py
@@ -0,0 +1,3 @@
+from huggingface_hub import HfApi
+
+# resolved at import time: requires a valid HF token (huggingface-cli login)
+user_id = HfApi().whoami()["name"]
diff --git a/src/utils/neural_networks.py b/src/utils/neural_networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f61a6b1067256960d5ca1d9d98269bf4d1ce719e
--- /dev/null
+++ b/src/utils/neural_networks.py
@@ -0,0 +1,6 @@
+def set_layer(layer_dict: dict, name: str, alternative):
+    # use the pre-built layer if one was supplied, otherwise the fallback
+    if name in layer_dict:
+        return layer_dict[name]
+    return alternative
diff --git a/src/utils/nlp.py b/src/utils/nlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c459559381f5e387d6fb556b55a7b8247f000c8c
--- /dev/null
+++ b/src/utils/nlp.py
@@ -0,0 +1,22 @@
+from langdetect import detect
+from loguru import logger
+from nltk.sentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
+
+
+def detect_language(text: str) -> str | None:
+    try:
+        return detect(text)
+    except Exception:
+        logger.debug(f"No language features: {text}")
+        return None
+
+
+# built once at import time; requires the nltk vader_lexicon to be downloaded
+sia = SentimentIntensityAnalyzer()
+
+
+def get_sentiment(text: str) -> dict:
+    sentiment = sia.polarity_scores(text)
+    sentiment["subjectivity"] = TextBlob(text).subjectivity
+    return sentiment
diff --git a/src/utils/s3/__init__.py b/src/utils/s3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea1ff7afef64c69a9ba7e5dddd09caaac3b0fadf
--- /dev/null
+++ b/src/utils/s3/__init__.py
@@ -0,0 +1,4 @@
+from .read_file_from_s3 import *
+from .s3_connect import *
+from .save_file_to_s3 import *
+from .decorators import save_output_csv_to_s3
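Why `read_csv_and_eval` exists: list-valued cells (such as embeddings) survive a CSV round trip only as strings, and `ast.literal_eval` turns them back into Python objects. A self-contained demonstration, with made-up data and a temp path:

```python
# List cells come back from CSV as strings; read_csv_and_eval restores them.
import pandas as pd

from src.utils import read_csv_and_eval

df = pd.DataFrame({"text": ["a", "b"], "embedding": [[0.1, 0.2], [0.3, 0.4]]})
df.to_csv("/tmp/roundtrip.csv", index=False)

plain = pd.read_csv("/tmp/roundtrip.csv")
restored = read_csv_and_eval("/tmp/roundtrip.csv")

print(type(plain.at[0, "embedding"]))     # <class 'str'>
print(type(restored.at[0, "embedding"]))  # <class 'list'>
```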
diff --git a/src/utils/s3/decorators.py b/src/utils/s3/decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..397e692c9c43d543a419b5be40512b2d145d30c8
--- /dev/null
+++ b/src/utils/s3/decorators.py
@@ -0,0 +1,24 @@
+import pandas as pd
+
+from .save_file_to_s3 import save_csv_to_s3
+
+
+def save_output_csv_to_s3(save_path: str, bucket: str = "lebesgue-common-bucket"):
+    def outer_wrapper(func):
+        def wrapper(*args, **kwargs):
+            output = func(*args, **kwargs)
+
+            if isinstance(output, pd.DataFrame):
+                save_csv_to_s3(df=output, path=save_path, bucket=bucket)
+            else:
+                # several outputs: save_path must be a matching list of keys
+                for df, path in zip(output, save_path):
+                    save_csv_to_s3(df=df, path=path, bucket=bucket)
+
+            return output
+
+        return wrapper
+
+    return outer_wrapper
diff --git a/src/utils/s3/read_file_from_s3.py b/src/utils/s3/read_file_from_s3.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ad48691ef80f8f24708b0dff659e8a5521e9064
--- /dev/null
+++ b/src/utils/s3/read_file_from_s3.py
@@ -0,0 +1,35 @@
+import pandas as pd
+from dotenv import load_dotenv
+
+from src.utils.csv import read_csv_and_eval
+from src.utils.s3.s3_connect import s3_connect
+
+load_dotenv()
+
+
+def read_csv_from_s3(path: str, client=None, bucket: str = "lebesgue-common-bucket", evaluate: bool = True, *args, **kwargs):
+    # client=None rather than client=s3_connect(): a call in the default would
+    # run once at import time and open a connection even when never used
+    client = client or s3_connect()
+
+    response = client.get_object(Bucket=bucket, Key=path)
+
+    if evaluate:
+        df = read_csv_and_eval(response["Body"], *args, **kwargs)
+    else:
+        df = pd.read_csv(response["Body"], *args, **kwargs)
+
+    return df
+
+
+def read_csv(path: str, s3: bool = False, pd_args: dict = {}, s3_args: dict = {}):
+    # dispatch to S3 or the local filesystem behind one call signature
+    if s3:
+        return read_csv_from_s3(path=path, **s3_args)
+    return read_csv_and_eval(path=path, **pd_args)
diff --git a/src/utils/s3/s3_connect.py b/src/utils/s3/s3_connect.py
new file mode 100644
index 0000000000000000000000000000000000000000..da8eefa77e739f0e91a4c823f5884c286f3190d7
--- /dev/null
+++ b/src/utils/s3/s3_connect.py
@@ -0,0 +1,52 @@
+import os
+
+import boto3
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def s3_connect():
+    # client built with explicit credentials from the environment (.env)
+    return boto3.client(
+        "s3",
+        region_name=os.environ["AWS_DEFAULT_REGION"],
+        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
+        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
+    )
+
+
+def s3_resource():
+    return boto3.resource(
+        service_name="s3",
+        region_name=os.environ["AWS_DEFAULT_REGION"],
+        aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
+        aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
+    )
+
+
+def list_objects_from_prefix(prefix: str, client=None, add_global_folder: bool = True):
+    if add_global_folder:
+        prefix = add_global_s3_folder(prefix)
+
+    client = client or s3_connect()
+    response = client.list_objects_v2(Bucket="creative-features", Prefix=prefix)
+
+    if "Contents" in response:
+        return [content["Key"] for content in response["Contents"]]
+    return []
+
+
+def add_global_s3_folder(path: str) -> str:
+    return f"{os.getenv('S3_PATH')}/{path}"
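A usage sketch for `save_output_csv_to_s3`: the wrapped function still returns its DataFrame, and the decorator uploads it as a side effect. The key, function, and column names below are hypothetical.

```python
# Hypothetical use of the save_output_csv_to_s3 decorator.
import pandas as pd

from src.utils.s3 import save_output_csv_to_s3

@save_output_csv_to_s3(save_path="reports/daily_summary.csv")  # made-up key
def build_summary() -> pd.DataFrame:
    return pd.DataFrame({"metric": ["aov"], "value": [42.0]})

df = build_summary()  # returned to the caller as usual, and saved to S3
```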
diff --git a/src/utils/s3/save_file_to_s3.py b/src/utils/s3/save_file_to_s3.py
new file mode 100644
index 0000000000000000000000000000000000000000..e74a952937b9e640901d999993bcd00592fc963b
--- /dev/null
+++ b/src/utils/s3/save_file_to_s3.py
@@ -0,0 +1,37 @@
+from io import StringIO
+
+import pandas as pd
+
+from src.utils.s3.s3_connect import s3_connect
+
+
+def save_csv_to_s3(df: pd.DataFrame, path: str, client=None, bucket: str = "lebesgue-common-bucket", *args, **kwargs):
+    client = client or s3_connect()
+
+    # default to index=False without clobbering a caller-supplied value,
+    # which would raise "got multiple values for keyword argument 'index'"
+    kwargs.setdefault("index", False)
+
+    csv_buffer = StringIO()
+    df.to_csv(csv_buffer, *args, **kwargs)
+
+    client.put_object(Bucket=bucket, Key=path, Body=csv_buffer.getvalue())
+
+
+def save_csv(
+    df: pd.DataFrame,
+    path: str,
+    s3: bool = False,
+    pd_args: dict = {},
+    s3_args: dict = {},
+):
+    # mirror of read_csv: one call signature for S3 and local saves
+    if s3:
+        save_csv_to_s3(df=df, path=path, **s3_args)
+    else:
+        df.to_csv(path, index=False, **pd_args)
diff --git a/src/utils/text_functions.py b/src/utils/text_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbab2512ea197b2e9be49fcca0fe2d46b633aeae
--- /dev/null
+++ b/src/utils/text_functions.py
@@ -0,0 +1,86 @@
+import re
+import string
+from statistics import mode
+
+import emoji
+from langdetect import detect
+from spellchecker import SpellChecker
+
+
+def clean_text(text: str) -> str:
+    # emojis, newlines and punctuation are deliberately kept: the tokenizer
+    # has dedicated tokens for them (see src/utils/bert.py)
+    for fun in [
+        remove_URL,
+        remove_html,
+        remove_hashtags,
+    ]:
+        text = fun(text)
+    return text
+
+
+def remove_URL(text: str) -> str:
+    url = re.compile(r"https?://\S+|www\.\S+")
+    return url.sub(r"", text)
+
+
+def remove_hashtags(text: str) -> str:
+    hashtag = re.compile(r"#\S+")
+    return hashtag.sub(r"", text)
+
+
+def remove_html(text: str) -> str:
+    html = re.compile(r"<.*?>")
+    return html.sub(r"", text)
+
+
+def remove_emojis(text: str) -> str:
+    # demojize is applied repeatedly to catch nested/stacked emoji sequences;
+    # non-greedy \S+? keeps the sub from spanning two adjacent tokens
+    delimiter = "#4="
+    for _ in range(5):
+        text = emoji.demojize(string=text, delimiters=(delimiter, delimiter))
+        text = re.sub(rf"{delimiter}\S+?{delimiter}", "", text)
+    return text
+
+
+def remove_punct(text: str) -> str:
+    table = str.maketrans("", "", string.punctuation)
+    return text.translate(table)
+
+
+def correct_spellings(text: str) -> str:
+    spell = SpellChecker()
+    corrected_text = []
+    misspelled_words = spell.unknown(text.split())
+    for word in text.split():
+        corrected_word = spell.correction(word)
+        if word in misspelled_words and corrected_word is not None:
+            corrected_text.append(corrected_word)
+        else:
+            corrected_text.append(word)
+    return " ".join(corrected_text)
+
+
+def remove_backslashes(text: str) -> str:
+    backslash = re.compile(r"\\\S+")
+    return backslash.sub(r"", text)
+
+
+def detect_language(list_of_texts: list[str]) -> str | None:
+    # list variant: returns the most common language across the texts;
+    # note it shadows src.utils.nlp.detect_language under star imports
+    if len(list_of_texts) == 0:
+        return None
+
+    languages = []
+    for text in list_of_texts:
+        try:
+            languages.append(detect(text))
+        except Exception:
+            continue
+
+    return mode(languages) if len(languages) else None
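To close, a quick demonstration of the `clean_text` pipeline: URLs, HTML tags, and hashtags are stripped, while emojis and newlines survive for the tokenizer's dedicated tokens. The input string is made up.

```python
# clean_text keeps emojis/newlines, drops URLs, HTML and hashtags.
from src.utils.text_functions import clean_text

raw = "50% OFF 🔥 shop now https://example.com/sale <br> #summer #sale"
print(clean_text(raw))
# -> "50% OFF 🔥 shop now   "  (whitespace left by the removed pieces remains)
```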