Upload src/ with huggingface_hub
Browse files — this view is limited to 50 files because the commit contains too many changes.
See raw diff
- src/MLM/__init__.py +2 -0
- src/MLM/__pycache__/__init__.cpython-310.pyc +0 -0
- src/MLM/datasets/MLMDataset.py +193 -0
- src/MLM/datasets/__init__.py +2 -0
- src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc +0 -0
- src/MLM/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc +0 -0
- src/MLM/datasets/preprocess_dataset.py +39 -0
- src/MLM/mask_and_unmask.py +32 -0
- src/MLM/training_scripts/__init__.py +1 -0
- src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc +0 -0
- src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc +0 -0
- src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc +0 -0
- src/MLM/training_scripts/train_with_trainer.py +50 -0
- src/MLM/training_scripts/utils.py +9 -0
- src/regression/.gitignore +1 -0
- src/regression/HF/__init__.py +2 -0
- src/regression/HF/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/HF/configs/FullModelConfigHF.py +25 -0
- src/regression/HF/configs/__init__.py +1 -0
- src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc +0 -0
- src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/HF/models/FullModelHF.py +43 -0
- src/regression/HF/models/__init__.py +1 -0
- src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc +0 -0
- src/regression/HF/models/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/PL/DecoderPL.py +180 -0
- src/regression/PL/EncoderPL.py +116 -0
- src/regression/PL/FullModelPL.py +166 -0
- src/regression/PL/__init__.py +3 -0
- src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc +0 -0
- src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc +0 -0
- src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc +0 -0
- src/regression/PL/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/__init__.py +3 -0
- src/regression/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/datasets/DecoderDatasetTorch.py +38 -0
- src/regression/datasets/FullModelDatasetTorch.py +39 -0
- src/regression/datasets/RegressionDataset.py +313 -0
- src/regression/datasets/__init__.py +3 -0
- src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc +0 -0
- src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc +0 -0
- src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc +0 -0
- src/regression/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/datasets/__pycache__/dataset.cpython-310.pyc +0 -0
- src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc +0 -0
- src/regression/training_scripts/__init__.py +2 -0
- src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc +0 -0
- src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc +0 -0
- src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc +0 -0
src/MLM/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .training_scripts.train_with_trainer import train_with_trainer
|
| 2 |
+
from .datasets.preprocess_dataset import preprocess_dataset
|
src/MLM/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (332 Bytes). View file
|
|
|
src/MLM/datasets/MLMDataset.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
from transformers import BertTokenizerFast, AutoTokenizer
|
| 6 |
+
from datasets import Dataset, DatasetDict, load_dataset
|
| 7 |
+
|
| 8 |
+
from src.utils import (
|
| 9 |
+
detect_language,
|
| 10 |
+
add_emoji_tokens,
|
| 11 |
+
add_new_line_token,
|
| 12 |
+
user_id,
|
| 13 |
+
)
|
| 14 |
+
from src.utils.text_functions import clean_text
|
| 15 |
+
from src.utils.s3 import read_csv, save_csv
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class MLMDataset:
    """Access layer for the MLM pre-training corpus.

    Wraps the CSV artifacts produced by the preprocessing pipeline
    (raw primaries/competitor ads -> deduplicated ad copies -> language
    detection -> English filter -> cleaning -> train/val/test split)
    plus the tokenizer and DatasetDict pushed to the Hugging Face Hub.
    Data can live either on local disk or on S3; `read_csv`/`save_csv`
    abstract over the two backends.
    """

    def __init__(
        self,
        s3: bool = False,
        bucket: str = "lebesgue-data-science",
        folder: str | None = None,
        s3_folder: str = "transformers/data/pretrain",
    ):
        """Configure paths for local-disk or S3 access.

        Args:
            s3: if True, read/write everything under `s3_folder` in `bucket`.
            bucket: S3 bucket name (only used when `s3` is True).
            folder: local data directory. Defaults to
                $GLOBAL_PATH_TO_REPO/data/pretrain. NOTE: the previous
                default evaluated `os.getenv(...) + "/data/pretrain"` in
                the signature, which raised TypeError at import time
                whenever the env var was unset; it is now resolved here,
                and only when a local folder is actually needed.
            s3_folder: key prefix inside the bucket.

        Raises:
            EnvironmentError: when `s3` is False, no `folder` is given and
                GLOBAL_PATH_TO_REPO is not set.
        """
        self.s3 = s3
        self.bucket = bucket

        if self.s3:
            self.folder = s3_folder
        else:
            if folder is None:
                repo = os.getenv("GLOBAL_PATH_TO_REPO")
                if repo is None:
                    raise EnvironmentError(
                        "GLOBAL_PATH_TO_REPO is not set; pass `folder` explicitly"
                    )
                folder = f"{repo}/data/pretrain"
            self.folder = folder

        # Pipeline artifacts, in production order.
        self.primaries_path = f"{self.folder}/primaries.csv"
        self.competitors_path = f"{self.folder}/competitor_ads.csv"
        self.ad_copies_path = f"{self.folder}/ad_copies.csv"
        self.english_copies_path = f"{self.folder}/english_copies.csv"
        self.train_path = f"{self.folder}/train.csv"
        self.val_path = f"{self.folder}/val.csv"
        self.test_path = f"{self.folder}/test.csv"

        # Hub repo ids; the tokenizer repo is suffixed with the base
        # checkpoint name (see train_tokenizer / tokenizer).
        self.tokenizer_id = f"{user_id}/lebesgue_ad_tokenizer"
        self.hub_datasetdict_id = f"{user_id}/lebesgue_ad_datasets"

    @property
    def primaries(self) -> pd.DataFrame:
        """Raw primary ad texts."""
        return read_csv(self.primaries_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def competitors(self) -> pd.DataFrame:
        """Raw competitor ad texts."""
        return read_csv(self.competitors_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def ad_copies(self) -> pd.DataFrame:
        """Deduplicated union of primaries and competitor ads."""
        return read_csv(self.ad_copies_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def english_copies(self) -> pd.DataFrame:
        """English-only ad copies; lineterminator is pinned because the
        texts contain embedded carriage returns."""
        args = {"lineterminator": "\n"}
        return read_csv(
            self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket} | args,
            pd_args=args,
        )

    @property
    def train(self) -> pd.DataFrame:
        """Training split."""
        return read_csv(self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def val(self) -> pd.DataFrame:
        """Validation split."""
        return read_csv(self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def test(self) -> pd.DataFrame:
        """Test split."""
        return read_csv(self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def datasets(self) -> DatasetDict:
        """The train/val/test DatasetDict previously pushed to the Hub."""
        return load_dataset(self.hub_datasetdict_id)

    def tokenizer(self, checkpoint: str = "bert-base-uncased") -> AutoTokenizer:
        """Load the project tokenizer derived from `checkpoint` from the Hub."""
        return AutoTokenizer.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    def concat_and_remove_duplicates(self) -> pd.DataFrame:
        """Merge primaries and competitor ads into one deduplicated
        single-column frame, persist it, and return it.

        (Previously annotated -> pd.DataFrame but returned None.)
        """
        comp = self.competitors
        prim = self.primaries

        # Each primaries row may hold several texts; keep only list-valued
        # cells and flatten them.
        # NOTE(review): values read back from CSV are usually strings —
        # confirm `value` really contains lists at this point.
        primaries = prim.value.to_list()
        primaries = [primary for primary in primaries if isinstance(primary, list)]

        list_of_primaries = []
        for primary in primaries:
            list_of_primaries.extend(primary)

        competitors = comp.ad_text.to_list()

        ad_copies = list_of_primaries + competitors
        ad_copies = pd.Series(ad_copies).drop_duplicates()
        ad_copies = pd.DataFrame(ad_copies, columns=["text"])
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return ad_copies

    def get_language(self) -> pd.DataFrame:
        """Annotate every ad copy with a detected language code and persist."""
        ad_copies = self.ad_copies
        ad_copies["language"] = ad_copies.text.apply(detect_language)
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return ad_copies

    def filter_english(self) -> pd.DataFrame:
        """Keep only rows detected as English and persist them separately."""
        ad_copies = self.ad_copies
        english = ad_copies[ad_copies.language == "en"]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def clean_english(self) -> pd.DataFrame:
        """Add a cleaned-text column, drop rows that clean to empty, persist."""
        english = self.english_copies
        english["text_clean"] = english.text.apply(clean_text)

        # remove empty ones
        english = english[english.text_clean.apply(len) != 0]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def train_tokenizer(self, checkpoint: str = "bert-base-uncased"):
        """Extend the base `checkpoint` tokenizer with emoji / newline
        tokens and push it to the Hub under the suffixed repo id.

        Fix: the base tokenizer is now loaded from `checkpoint` instead of
        a hard-coded "bert-base-uncased", so the suffix in the pushed repo
        id matches the actual base model.
        """
        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
        tokenizer = add_emoji_tokens(tokenizer=tokenizer)
        tokenizer = add_new_line_token(tokenizer=tokenizer)

        tokenizer.push_to_hub(f"{self.tokenizer_id}_{checkpoint}")

    def get_tokenizer(self, checkpoint: str = "bert-base-uncased"):
        """Load the pushed tokenizer as a BertTokenizerFast.

        Fix: `train_tokenizer` pushes to f"{self.tokenizer_id}_{checkpoint}",
        but this method previously loaded the unsuffixed `self.tokenizer_id`,
        which is never written to; use the suffixed id for consistency.
        """
        return BertTokenizerFast.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    def split_into_train_and_test(
        self,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split the English copies 76.5/13.5/10 into train/val/test,
        persist each split as CSV, and push the DatasetDict to the Hub.

        Returns:
            (train, val, test) dataframes.
        """
        df = self.english_copies
        train, test = train_test_split(df, train_size=0.9, random_state=42)
        train, val = train_test_split(train, train_size=0.85, random_state=42)

        dataset_dict = DatasetDict()

        for df, local_path, dataset_dict_key in zip(
            [train, val, test],
            # Fix: the test split was previously saved to self.train_path,
            # silently overwriting the training CSV.
            [self.train_path, self.val_path, self.test_path],
            ["train", "val", "test"],
        ):
            save_csv(df=df, path=local_path, s3=self.s3, s3_args={"bucket": self.bucket})
            df_hf = Dataset.from_pandas(df, preserve_index=False)
            dataset_dict[dataset_dict_key] = df_hf

        dataset_dict.push_to_hub(self.hub_datasetdict_id)

        return train, val, test


# Ready-made accessors for the two storage backends.
mlm_dataset = MLMDataset()

mlm_dataset_s3 = MLMDataset(s3=True)
|
src/MLM/datasets/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .preprocess_dataset import preprocess_dataset
|
| 2 |
+
from .MLMDataset import MLMDataset, mlm_dataset, mlm_dataset_s3
|
src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc
ADDED
|
Binary file (6.11 kB). View file
|
|
|
src/MLM/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (314 Bytes). View file
|
|
|
src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc
ADDED
|
Binary file (1.91 kB). View file
|
|
|
src/MLM/datasets/preprocess_dataset.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import Dataset, DatasetDict
|
| 2 |
+
from transformers import AutoTokenizer
|
| 3 |
+
|
| 4 |
+
def preprocess_dataset(dataset: Dataset | DatasetDict, tokenizer: AutoTokenizer) -> Dataset | DatasetDict:
    """Tokenize `dataset` and regroup the token stream into fixed-size chunks.

    The raw text columns are dropped after tokenization; chunking into
    MLM-sized windows is delegated to `group_texts`.
    """

    def _tokenize(examples):
        return tokenize_function(examples, tokenizer)

    tokenized = dataset.map(
        _tokenize,
        batched=True,
        remove_columns=["text", "text_clean", "language"],
    )

    chunked = tokenized.map(group_texts, batched=True)
    return chunked
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def tokenize_function(examples, tokenizer: AutoTokenizer):
|
| 19 |
+
|
| 20 |
+
result = tokenizer(examples["text"])
|
| 21 |
+
|
| 22 |
+
if tokenizer.is_fast:
|
| 23 |
+
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
|
| 24 |
+
|
| 25 |
+
return result
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def group_texts(examples, chunk_size: int = 128):
    """Concatenate each field across the batch and re-slice it into
    `chunk_size` windows.

    The tail shorter than one full chunk is discarded. `labels` is a copy
    of `input_ids`; the actual masking happens later in the data collator.
    """
    merged = {}
    for key in examples.keys():
        flat = []
        for seq in examples[key]:
            flat.extend(seq)
        merged[key] = flat

    # Round down so every chunk is exactly chunk_size long.
    usable = (len(merged["input_ids"]) // chunk_size) * chunk_size

    chunked = {}
    for key, tokens in merged.items():
        chunked[key] = [tokens[start : start + chunk_size] for start in range(0, usable, chunk_size)]

    chunked["labels"] = list(chunked["input_ids"])

    return chunked
|
| 39 |
+
|
src/MLM/mask_and_unmask.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizerFast, DataCollatorForLanguageModeling
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def mask_and_unmask(
    text: str,
    tokenizer: AutoTokenizer | BertTokenizerFast,
    model: AutoModelForMaskedLM,
    data_collator: DataCollatorForLanguageModeling,
) -> dict:
    """Randomly mask `text` via the MLM collator, then ask `model` to fill
    the masks back in.

    Returns a dict with key "masked_text" (the decoded masked string) plus
    one entry per masked position mapping the original token string to the
    model's five highest-scoring replacement tokens.
    (The return annotation previously said `str`; a dict is returned.)
    """

    # Tokenize and let the collator apply its random [MASK] corruption;
    # `labels` must be present so the collator records the original ids.
    collator_input = tokenizer(text)
    collator_input["labels"] = collator_input["input_ids"].copy()
    collator_output = data_collator([collator_input])
    masked_text = tokenizer.decode(collator_output["input_ids"][0])

    pred_dict = {"masked_text": masked_text}

    # Re-tokenize the decoded masked string for the model forward pass.
    # NOTE(review): decoding and re-tokenizing shifts positions relative to
    # the collator output (decode includes special tokens) — presumably the
    # `masked_index_token - 1` below compensates for that; confirm.
    inputs = tokenizer(masked_text, return_tensors="pt", padding="max_length", truncation=True)
    token_logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits
    # Positions (batch, seq) of every [MASK] token in the re-tokenized input.
    all_masked_token_index = torch.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
    if all_masked_token_index.size()[0] != 0:

        for i, masked_index_token in enumerate(all_masked_token_index[:, 1]):
            # Logits for this masked position only.
            masked_token_logits = token_logits[0, masked_index_token, :]
            # Top-5 vocabulary ids by logit.
            top_5_tokens = torch.argsort(masked_token_logits, descending=True)[:5].tolist()
            # Original token (from the collator's labels) at the shifted index.
            value = tokenizer.decode(collator_output["labels"][0, masked_index_token - 1])
            pred_dict[value] = [tokenizer.decode(token) for token in top_5_tokens]

    return pred_dict
|
src/MLM/training_scripts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .train_with_trainer import train_with_trainer
|
src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (240 Bytes). View file
|
|
|
src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc
ADDED
|
Binary file (564 Bytes). View file
|
|
|
src/MLM/training_scripts/train_with_trainer.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoModelForMaskedLM, AutoTokenizer, TrainingArguments, Trainer
|
| 2 |
+
from datasets import Dataset, DatasetDict
|
| 3 |
+
from transformers import DataCollatorForLanguageModeling
|
| 4 |
+
|
| 5 |
+
from src.MLM.datasets.preprocess_dataset import preprocess_dataset
|
| 6 |
+
from src.MLM.training_scripts.utils import get_new_model_name
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def train_with_trainer(
    model_checkpoint: str,
    tokenizer: AutoTokenizer,
    dataset: DatasetDict,
    model_name: str | None = None,
    data_collator=None,
    num_epochs: int = 3,
):
    """Fine-tune a masked-LM checkpoint on `dataset` with the HF Trainer.

    The run name / output dir is derived from `model_checkpoint` (version
    bump) unless `model_name` is given. Results are pushed to the Hub and
    logged to wandb. A 15%-probability MLM collator is used when none is
    supplied.
    """
    mlm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

    model_name = get_new_model_name(model_checkpoint=model_checkpoint, model_name=model_name)

    tokenized = preprocess_dataset(dataset=dataset, tokenizer=tokenizer)

    if data_collator is None:
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        model_name,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        push_to_hub=True,
        report_to="wandb",
        run_name=model_name,
        num_train_epochs=num_epochs,
        save_total_limit=1,
        save_strategy="epoch",
    )

    print(f"device: {training_args.device}")

    Trainer(
        model=mlm_model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["val"],
        data_collator=data_collator,
    ).train()
|
src/MLM/training_scripts/utils.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def get_new_model_name(model_checkpoint: str, model_name: str = None) -> str:
|
| 2 |
+
if model_name is None:
|
| 3 |
+
old_version_number = int(model_checkpoint[-2:])
|
| 4 |
+
new_version_number = str(old_version_number + 1).zfill(2)
|
| 5 |
+
model_name = f"{model_checkpoint[:-2]}{new_version_number}"
|
| 6 |
+
elif not model_name[-2:].isnumeric():
|
| 7 |
+
model_name = model_name + "_00"
|
| 8 |
+
|
| 9 |
+
return model_name
|
src/regression/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
runs/
|
src/regression/HF/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .configs import *
|
| 2 |
+
from .models import *
|
src/regression/HF/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (236 Bytes). View file
|
|
|
src/regression/HF/configs/FullModelConfigHF.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PretrainedConfig
|
| 2 |
+
from src.regression.PL import EncoderPL, DecoderPL
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class FullModelConfigHF(PretrainedConfig):
    """HF config for the combined encoder+decoder regression model.

    Stores the checkpoint ids needed to reassemble FullModelHF: the
    tokenizer and BERT Hub repos plus the wandb artifact reference for
    the decoder, together with the decoder's layer-norm switch and the
    list of non-text feature names fed alongside the embedding.
    """

    model_type = "full_model"

    def __init__(
        self,
        tokenizer_ckpt: str = "",
        bert_ckpt: str = "",
        decoder_ckpt: str = "",
        layer_norm: bool = True,
        nontext_features: list[str] | None = None,
        **kwargs,
    ):
        """Build the config.

        Args:
            tokenizer_ckpt: Hub id of the tokenizer.
            bert_ckpt: Hub id of the masked-LM BERT backbone.
            decoder_ckpt: wandb artifact reference for the decoder checkpoint.
            layer_norm: whether the decoder uses LayerNorm between layers.
            nontext_features: names of the non-text inputs; defaults to
                ["aov"]. (Was a mutable default argument `= ["aov"]`,
                shared across all instances; use a None sentinel and copy
                the caller's list instead.)
            **kwargs: forwarded to PretrainedConfig.
        """
        self.tokenizer_ckpt = tokenizer_ckpt
        self.bert_ckpt = bert_ckpt
        self.decoder_ckpt = decoder_ckpt
        self.nontext_features = ["aov"] if nontext_features is None else list(nontext_features)
        self.layer_norm = layer_norm
        super().__init__(**kwargs)
|
src/regression/HF/configs/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .FullModelConfigHF import FullModelConfigHF
|
src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc
ADDED
|
Binary file (1.02 kB). View file
|
|
|
src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (240 Bytes). View file
|
|
|
src/regression/HF/models/FullModelHF.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PreTrainedModel
|
| 2 |
+
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
| 3 |
+
from pytorch_lightning.loggers import WandbLogger
|
| 4 |
+
|
| 5 |
+
from src.regression.PL import FullModelPL, EncoderPL, DecoderPL
|
| 6 |
+
from src.regression.HF.configs import FullModelConfigHF
|
| 7 |
+
|
| 8 |
+
from config import DEVICE
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class FullModelHF(PreTrainedModel):
    """HF wrapper that reassembles the PL regression model from config.

    Construction has side effects: it downloads the tokenizer and BERT
    backbone from the Hub and fetches the decoder checkpoint as a wandb
    artifact (requires wandb to be available and logged in).
    """

    config_class = FullModelConfigHF

    def __init__(self, config):

        super().__init__(config)

        # Backbone: take the bare distilbert out of the masked-LM model
        # (drops the MLM head).
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_ckpt)
        mlm_bert = AutoModelForMaskedLM.from_pretrained(config.bert_ckpt)
        self.bert = mlm_bert.distilbert

        encoder = EncoderPL(tokenizer=self.tokenizer, bert=self.bert).to(DEVICE)

        # wandb is used only to resolve and download the decoder artifact.
        wandb_logger = WandbLogger(
            project="transformers",
            entity="sanjin_juric_fot",
            # log_model=True,
            # reinit=True,
        )

        artifact = wandb_logger.use_artifact(config.decoder_ckpt)
        artifact_dir = artifact.download()
        decoder = DecoderPL.load_from_checkpoint(artifact_dir + "/" + "model.ckpt").to(DEVICE)

        self.model = FullModelPL(
            encoder=encoder,
            decoder=decoder,
            layer_norm=config.layer_norm,
            nontext_features=config.nontext_features,
        ).to(DEVICE)

    def forward(self, input):
        # Delegates to the PL model's loss helper, so the "forward" output
        # is a loss/pred/act dict rather than raw logits.
        # NOTE(review): `input` is presumably a batch dict as expected by
        # FullModelPL._get_loss — confirm against FullModelPL.
        return self.model._get_loss(input)
|
src/regression/HF/models/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .FullModelHF import FullModelHF
|
src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc
ADDED
|
Binary file (1.64 kB). View file
|
|
|
src/regression/HF/models/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (233 Bytes). View file
|
|
|
src/regression/PL/DecoderPL.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import emoji
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pytorch_lightning as pl
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 9 |
+
from torchmetrics import R2Score
|
| 10 |
+
|
| 11 |
+
from src.utils import get_sentiment
|
| 12 |
+
from src.utils.neural_networks import set_layer
|
| 13 |
+
from config import DEVICE
|
| 14 |
+
|
| 15 |
+
torch.set_default_dtype(torch.float32)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DecoderPL(pl.LightningModule):
    """MLP regression head mapping an ad embedding to a scalar CTR prediction.

    Architecture: four Linear layers (input_dim -> 512 -> 264 -> 64 -> 1)
    with LeakyReLU activations and optional LayerNorm after each hidden
    layer. Individual layers can be overridden via `layer_dict` (resolved
    by `set_layer`). Optimized with AdamW + cosine-annealing LR schedule.
    """

    def __init__(
        self,
        input_dim: int = 774,
        layer_norm: bool = True,
        layer_dict: dict | None = None,
        device=DEVICE,
        T_max: int = 10,
        start_lr: float = 5 * 1e-4,
    ):
        """Build layers, weight init, optimizer and scheduler.

        Args:
            input_dim: size of the input embedding vector.
            layer_norm: insert LayerNorm after each hidden Linear layer.
            layer_dict: optional per-name layer overrides consumed by
                `set_layer`. (Was a mutable default `= {}`; now uses a
                None sentinel — behavior for callers is unchanged.)
            device: accepted for API compatibility; not used here —
                NOTE(review): Lightning manages device placement, confirm
                this parameter can be dropped.
            T_max: cosine-annealing half-period in scheduler steps.
            start_lr: initial AdamW learning rate.
        """
        super().__init__()

        # Fix: avoid a shared mutable default argument.
        if layer_dict is None:
            layer_dict = {}

        # layers — each may be replaced through layer_dict
        self.linear1 = set_layer(
            layer_dict=layer_dict,
            name="linear1",
            alternative=nn.Linear(in_features=input_dim, out_features=512),
        )

        self.linear2 = set_layer(
            layer_dict=layer_dict,
            name="linear2",
            alternative=nn.Linear(in_features=512, out_features=264),
        )

        self.linear3 = set_layer(
            layer_dict=layer_dict,
            name="linear3",
            alternative=nn.Linear(in_features=264, out_features=64),
        )

        self.linear4 = set_layer(
            layer_dict=layer_dict,
            name="linear4",
            alternative=nn.Linear(in_features=64, out_features=1),
        )

        self.activation = nn.LeakyReLU(negative_slope=0.01)

        # self.layers is a plain list describing forward order; the modules
        # themselves are registered through the attribute assignments above.
        if not layer_norm:
            self.layers = [
                self.linear1,
                self.activation,
                self.linear2,
                self.activation,
                self.linear3,
                self.activation,
                self.linear4,
            ]
        else:
            # LayerNorm shapes include the singleton sequence dim added in forward().
            self.layernorm1 = nn.LayerNorm(normalized_shape=(1, self.linear1.out_features))
            self.layernorm2 = nn.LayerNorm(normalized_shape=(1, self.linear2.out_features))
            self.layernorm3 = nn.LayerNorm(normalized_shape=(1, self.linear3.out_features))
            self.layers = [
                self.linear1,
                self.layernorm1,
                self.activation,
                self.linear2,
                self.layernorm2,
                self.activation,
                self.linear3,
                self.layernorm3,
                self.activation,
                self.linear4,
            ]

        # initialize weights (no-op for non-Linear entries)
        [self.initialize_weights(layer) for layer in self.layers]

        # optimizer and scheduler
        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=start_lr)
        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)

        # else
        self.save_hyperparameters(ignore=["model"])
        self.MSE = nn.MSELoss()
        self.R2 = R2Score()

    def initialize_weights(self, module):
        """Xavier-uniform init for Linear layers; small constant bias."""
        if isinstance(module, nn.Linear):
            logger.debug("linear weights initialized")
            torch.nn.init.xavier_uniform_(module.weight)
            module.bias.data.fill_(0.01)

    def forward(self, x: torch.Tensor):
        """Run the MLP.

        Accepts (batch, dim) or (batch, 1, dim); a singleton sequence dim
        is added when missing (the LayerNorm shapes expect it) and squeezed
        out again at the end. Returns a float32 tensor of shape (batch,).
        """
        if x.dim() == 2:
            x = x.unsqueeze(dim=1)

        for layer in self.layers:
            x = layer(x)

        x = x.squeeze()

        # squeeze() on a batch of one yields a 0-d tensor; restore shape (1,).
        if x.dim() == 0:
            x = x.unsqueeze(dim=0)

        return x.to(torch.float32)

    def training_step(self, batch):
        """One optimization step; logs epoch-level train loss."""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        pred = loss_and_metrics["pred"]
        act = loss_and_metrics["act"]
        loss = loss_and_metrics["loss"]

        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)

        return {"loss": loss, "pred": pred, "act": act}

    def configure_optimizers(self):
        """Return the AdamW optimizer and cosine LR schedule built in __init__."""
        optimizer = self.optimizer
        scheduler = self.scheduler
        return dict(optimizer=optimizer, lr_scheduler=scheduler)

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        """Manual scheduler stepping hook (PL passes `metric` for
        metric-driven schedulers such as ReduceLROnPlateau)."""
        logger.debug(scheduler)
        if metric is None:
            scheduler.step()
        else:
            scheduler.step(metric)

    def validation_step(self, batch, batch_idx):
        """used for logging metrics"""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        loss = loss_and_metrics["loss"]

        # Log loss and metric
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)

    def training_epoch_end(self, training_step_outputs):
        """Compute and log a manual epoch-level R^2 over training batches."""
        training_step_outputs = list(training_step_outputs)

        # Drop the last batch — presumably because a partial final batch
        # would make torch.stack fail on mismatched shapes; TODO confirm.
        training_step_outputs.pop()

        # list-of-dicts -> dict-of-lists
        output_dict = {k: [dic[k] for dic in training_step_outputs] for k in training_step_outputs[0]}

        pred = torch.stack(output_dict["pred"])
        act = torch.stack(output_dict["act"])

        loss = torch.sub(pred, act)
        loss_sq = torch.square(loss)

        # R^2 = 1 - RSS/TSS computed from residuals over the whole epoch.
        TSS = float(torch.var(act, unbiased=False))
        RSS = float(torch.mean(loss_sq))
        R2 = 1 - RSS / TSS

        self.log("train_R2", R2, prog_bar=True, logger=True)

    def _get_loss(self, batch, get_metrics: bool = False):
        """convenience function since train/valid/test steps are similar"""
        pred = self.forward(x=batch["embedding"]).to(torch.float32)

        act, loss = None, None

        # MSE against CTR targets when present (inference batches may omit "ctr").
        if "ctr" in batch.keys():
            act = batch["ctr"].to(torch.float32)
            loss = self.MSE(pred, act).to(torch.float32)

        return {"loss": loss, "pred": pred, "act": act}
|
src/regression/PL/EncoderPL.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import emoji
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pytorch_lightning as pl
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 9 |
+
from torchmetrics import R2Score
|
| 10 |
+
from transformers import BertModel, BertTokenizer, DistilBertModel, AutoModel, AutoTokenizer
|
| 11 |
+
from pytorch_lightning import LightningModule
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
from src.utils.neural_networks import set_layer
|
| 15 |
+
from src.utils import add_emoji_tokens, add_new_line_token, vectorise_dict
|
| 16 |
+
from config import DEVICE
|
| 17 |
+
|
| 18 |
+
torch.set_default_dtype(torch.float32)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class EncoderPL(pl.LightningModule):
    """Wraps a BERT-family transformer as a text-to-embedding encoder.

    forward() tokenizes raw text and returns either the pooled [CLS]
    embedding (``cls=True``) or an attention-mask-weighted aggregation of
    the last hidden states (``cls=False``).
    """

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        tokenizer: AutoTokenizer | None = None,
        bert: AutoModel | None = None,
        cls: bool = False,
        device=DEVICE,
    ):
        """
        Args:
            model_name: HF model id used when `tokenizer` / `bert` are not given.
            tokenizer: pre-built tokenizer; when None one is loaded from
                `model_name` and extended with emoji / newline tokens.
            bert: pre-built transformer; when None one is loaded from `model_name`.
            cls: if True, forward() returns the pooled [CLS] embedding instead
                of the mask-weighted token aggregation.
            device: device the tokenized inputs are moved to in forward().
        """
        super().__init__()

        self._device = device
        self.cls = cls
        self.model_name = model_name

        # layers
        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained(model_name)
        self.bert = bert if bert is not None else BertModel.from_pretrained(model_name)

        # Only a freshly loaded tokenizer is extended with the custom tokens;
        # a caller-supplied tokenizer is assumed to already contain them.
        if tokenizer is None:
            self.tokenizer = add_emoji_tokens(self.tokenizer)
            self.tokenizer = add_new_line_token(self.tokenizer)
            self.bert.resize_token_embeddings(len(self.tokenizer))

        # optimizer (returned by configure_optimizers); trainable params only
        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-3)

        # config tweaking
        self.bert.config.torch_dtype = "float32"

    def forward(self, text: str):
        """Encode `text` into a (batch, 1, hidden) embedding tensor."""
        # run text through bert and squash the output to get embeddings
        encoded = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(self._device)

        # DistilBERT has no token-type embeddings.
        # Fixed: `type(...) == DistilBertModel` -> isinstance (idiomatic and
        # also covers DistilBertModel subclasses).
        if isinstance(self.bert, DistilBertModel):
            encoded.pop("token_type_ids")

        bert_output = self.bert(**encoded)

        if self.cls:
            if hasattr(bert_output, "pooler_output") and bert_output.pooler_output is not None:
                embedding = bert_output.pooler_output.unsqueeze(dim=1)
            else:
                # Models without a pooler: fall back to the first token of the
                # first sequence. NOTE(review): this uses only batch element 0 —
                # confirm cls=True is only ever called with batch size 1.
                embedding = bert_output.last_hidden_state[0, 0, :].unsqueeze(dim=0).unsqueeze(dim=0)
        else:
            last_hidden_state = bert_output.last_hidden_state

            if last_hidden_state.dim() == 2:
                last_hidden_state = last_hidden_state.unsqueeze(dim=0)

            # Mask-weighted aggregation: (batch, 1, 512) @ (batch, 512, hidden)
            # -> (batch, 1, hidden). NOTE(review): this is a masked *sum* of
            # token vectors (it is not divided by the token count), and 512
            # hard-codes the tokenizer max_length — confirm both are intended.
            embedding = torch.matmul(
                encoded["attention_mask"].type(torch.float32).view(-1, 1, 512),
                last_hidden_state,
            )

        return embedding

    def configure_optimizers(self):
        """Return the AdamW optimizer built in __init__."""
        return self.optimizer
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def get_bert_embedding(
    text: str, as_list: bool = True, cls: bool = False, device=DEVICE, layer_dict: dict | None = None
) -> list:
    """Embed `text` with a throwaway EncoderPL instance.

    Args:
        text: raw text to embed.
        as_list: if True, return a plain list of floats instead of a tensor.
        cls: forwarded to EncoderPL (use the pooled [CLS] embedding).
        device: device the encoder is moved to.
        layer_dict: optional pre-built layers, e.g. {"bert": ..., "tokenizer": ...}.

    Returns:
        The embedding as a list (or the raw tensor when as_list=False).
    """
    # Fixed: EncoderPL.__init__ has no `layer_dict` parameter, so the previous
    # `EncoderPL(cls=cls, layer_dict=layer_dict)` raised TypeError on every
    # call. The dict is now unpacked into the `bert` / `tokenizer` keyword
    # arguments. (The None default also avoids a shared mutable default.)
    encoder = EncoderPL(cls=cls, device=device, **(layer_dict or {})).to(device)
    embedding = encoder.forward(text)

    if as_list:
        embedding = embedding.tolist()[0][0]

    return embedding
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def get_concat_embedding(
    text: str = None,
    bert_embedding: list | None = None,
    other_features: dict | None = None,
    cls: bool = False,
    device=DEVICE,
    layer_dict: dict | None = None,
) -> list:
    """Concatenate a BERT text embedding with vectorised non-text features.

    Either `bert_embedding` (a pre-computed embedding as a list) or `text`
    (embedded on the fly via get_bert_embedding) must be supplied.

    Args:
        text: raw text, used only when `bert_embedding` is empty.
        bert_embedding: pre-computed embedding (list of floats).
        other_features: extra scalar features to append after the embedding.
        cls / device / layer_dict: forwarded to get_bert_embedding.

    Returns:
        bert embedding + vectorised other features, as one flat list.

    Raises:
        ValueError: when both `text` and `bert_embedding` are empty.
    """
    # Fixed: mutable default arguments ([] / {}) replaced with None sentinels;
    # behaviour for callers that omit the arguments is unchanged.
    bert_embedding = bert_embedding or []
    other_features = other_features or {}
    layer_dict = layer_dict or {}

    if not len(bert_embedding):
        if text is None:
            raise ValueError("both text and embedding are empty!")
        bert_embedding = get_bert_embedding(text=text, cls=cls, device=device, layer_dict=layer_dict)

    other_features = vectorise_dict(other_features, as_list=True)

    concat_vec = bert_embedding + other_features

    return concat_vec
|
src/regression/PL/FullModelPL.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import emoji
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pytorch_lightning as pl
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from loguru import logger
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 9 |
+
from torchmetrics import R2Score
|
| 10 |
+
from transformers import BertModel, BertTokenizerFast
|
| 11 |
+
|
| 12 |
+
from src.utils import get_sentiment, vectorise_dict
|
| 13 |
+
from src.utils.neural_networks import set_layer
|
| 14 |
+
from config import DEVICE
|
| 15 |
+
|
| 16 |
+
from .DecoderPL import DecoderPL
|
| 17 |
+
from .EncoderPL import EncoderPL
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
torch.set_default_dtype(torch.float32)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class FullModelPL(pl.LightningModule):
    """End-to-end CTR regressor: text encoder -> feature concat -> decoder.

    forward() embeds the batch texts with an EncoderPL, appends sentiment
    scores and the remaining non-text features, and regresses CTR with a
    DecoderPL head. BERT parameters are frozen in configure_optimizers().
    """

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        nontext_features: list[str] = ["aov"],
        encoder: EncoderPL | None = None,
        decoder: DecoderPL | None = None,
        layer_norm: bool = True,
        device=DEVICE,
        T_max: int = 10,
    ):
        """
        Args:
            model_name: HF model id for a freshly built encoder.
            nontext_features: names of the non-text features; only their count
                is used here, to size the decoder input (the default list is
                never mutated).
            encoder / decoder: optional pre-built submodules.
            layer_norm: forwarded to a freshly built DecoderPL.
            device: forwarded to freshly built submodules. NOTE(review): the
                submodules are then moved to `self.device` (Lightning's current
                device), not to this argument — confirm that is intended.
            T_max: cosine-annealing period for the LR scheduler.
        """
        super().__init__()

        # Submodules: use the supplied ones or build defaults. Decoder input =
        # 768 BERT dims + non-text features + 5 sentiment scores.
        if encoder is None:
            encoder = EncoderPL(model_name=model_name, device=device)
        self.encoder = encoder.to(self.device)

        if decoder is None:
            decoder = DecoderPL(
                input_dim=768 + len(nontext_features) + 5,
                layer_norm=layer_norm,
                device=device,
            )
        self.decoder = decoder.to(self.device)

        # loss / metric
        self.MSE = nn.MSELoss()
        self.R2 = R2Score()

        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=3 * 1e-4)
        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)

    def forward(self, input_dict: dict):
        """Predict CTR for a batch dict containing "text" plus non-text features.

        The input dict is shallow-copied; "text" and (if present) "ctr" are
        removed before the remaining values are treated as features.
        """
        input_dict = input_dict.copy()
        text = input_dict.pop("text")

        # Fixed: leftover debug print()s replaced with logger.debug so the
        # training loop no longer spams stdout.
        logger.debug(f"text: {text}")

        if "ctr" in input_dict:
            input_dict.pop("ctr")  # the target is not an input feature

        # encode
        sentence_embedding = self.encoder.forward(text=text)

        # sentiment scores become extra non-text features
        sentiment = get_sentiment_for_list_of_texts(text)
        input_dict = input_dict | sentiment

        input_dict = {k: v.to(self.device) for k, v in input_dict.items()}

        # concat nontext features to the embedding: (batch, 1, n_features)
        nontext_vec = vectorise_dict(input_dict)
        nontext_tensor = torch.stack(nontext_vec).T.unsqueeze(1).to(torch.float32)
        logger.debug(f"{sentence_embedding.get_device()}, {nontext_tensor.get_device()}")
        x = torch.cat((sentence_embedding, nontext_tensor), 2)

        logger.debug(f"decoder device: {self.decoder.device}, x device: {x.get_device()}")

        # decode
        result = self.decoder.forward(x)
        return result

    def training_step(self, batch):
        """One optimization step; logs the epoch-averaged train loss."""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        pred = loss_and_metrics["pred"]
        act = loss_and_metrics["act"]
        loss = loss_and_metrics["loss"]

        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)

        return {"loss": loss, "pred": pred, "act": act}

    def configure_optimizers(self):
        """Freeze all BERT parameters, then hand back optimizer + scheduler."""
        for name, param in self.named_parameters():
            if "bert" in name:
                param.requires_grad = False

        optimizer = self.optimizer
        scheduler = self.scheduler
        return dict(optimizer=optimizer, lr_scheduler=scheduler)

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        """Manual scheduler step (old-style Lightning hook signature)."""
        logger.debug(scheduler)
        if metric is None:
            scheduler.step()
        else:
            scheduler.step(metric)

    def validation_step(self, batch, batch_idx):
        """used for logging metrics"""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        loss = loss_and_metrics["loss"]

        # Log loss and metric
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)

    def training_epoch_end(self, training_step_outputs):
        """Compute and log epoch-level R^2 from the collected step outputs."""
        outputs = list(training_step_outputs)

        # The last batch may be smaller than the rest, which would break
        # torch.stack below, so it is dropped.
        outputs.pop()

        # Fixed: guard against 0/1-batch epochs, where popping leaves nothing
        # to stack (previously pop()/indexing raised on an empty list).
        if not outputs:
            return

        output_dict = {k: [dic[k] for dic in outputs] for k in outputs[0]}

        pred = torch.stack(output_dict["pred"])
        act = torch.stack(output_dict["act"])

        residual = torch.sub(pred, act)
        residual_sq = torch.square(residual)

        TSS = float(torch.var(act, unbiased=False))
        RSS = float(torch.mean(residual_sq))

        # Fixed: avoid ZeroDivisionError when the targets are constant.
        if TSS == 0:
            return

        R2 = 1 - RSS / TSS

        self.log("train_R2", R2, prog_bar=True, logger=True)

    def _get_loss(self, batch, get_metrics: bool = False):
        """convenience function since train/valid/test steps are similar"""
        pred = self.forward(input_dict=batch).to(torch.float32)

        act, loss = None, None

        if "ctr" in batch:
            act = batch["ctr"].to(torch.float32).to(self.device)
            loss = self.MSE(pred, act).to(torch.float32)

        return {"loss": loss, "pred": pred, "act": act}
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def get_sentiment_for_list_of_texts(texts: list[str]) -> dict:
    """Score each text and transpose the per-text dicts into a dict of tensors.

    Each key of get_sentiment()'s output maps to a 1-D tensor with one entry
    per input text.
    """
    per_text = [get_sentiment(text) for text in texts]
    keys = per_text[0]
    return {key: torch.Tensor([scores[key] for scores in per_text]) for key in keys}
|
src/regression/PL/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .FullModelPL import FullModelPL
|
| 2 |
+
from .DecoderPL import DecoderPL
|
| 3 |
+
from .EncoderPL import EncoderPL, get_concat_embedding, get_bert_embedding
|
src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc
ADDED
|
Binary file (5.34 kB). View file
|
|
|
src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc
ADDED
|
Binary file (3.53 kB). View file
|
|
|
src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc
ADDED
|
Binary file (5.87 kB). View file
|
|
|
src/regression/PL/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (352 Bytes). View file
|
|
|
src/regression/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .datasets import *
|
| 2 |
+
from .training_scripts import *
|
| 3 |
+
from .PL import *
|
src/regression/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (258 Bytes). View file
|
|
|
src/regression/datasets/DecoderDatasetTorch.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class DecoderDatasetTorch(Dataset):
    """Train dataset."""

    def __init__(self, df: pd.DataFrame, embedding_column: str = "my_full_mean_embedding"):
        """
        Args:
            df (pd.DataFrame): dataframe with ads
            embedding_column (str, optional): Column whose values to output in __get_item__. Defaults to 'full_mean_embedding'.
        """
        self.df = df
        self.embedding_column = embedding_column

        # Cast both columns element-wise to float32 (note: mutates the caller's df).
        target_cols = [embedding_column, "ctr"]
        df[target_cols] = df[target_cols].applymap(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return {
            "embedding": self.df.loc[idx, self.embedding_column],
            "ctr": self.df.loc[idx, "ctr"],
        }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
|
| 38 |
+
# train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)
|
src/regression/datasets/FullModelDatasetTorch.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader, Dataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class FullModelDatasetTorch(Dataset):
    """Train dataset."""

    def __init__(self, df: pd.DataFrame, nontext_features: list[str] = ["aov"]):
        """
        Args:
            df (pd.DataFrame): train dataframe
            nontext_features (list[str]): features to use in training except for text embeddings
        """
        self.df = df
        self.nontext_features = nontext_features

        # Cast features + target to float32 (note: mutates the caller's df).
        numeric_cols = nontext_features + ["ctr"]
        df[numeric_cols] = df[numeric_cols].astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {
            "text": self.df.loc[idx, "text_clean"],
            "ctr": self.df.loc[idx, "ctr"],
        }
        for feature in self.nontext_features:
            sample[feature] = self.df.loc[idx, feature]

        return sample
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
|
| 39 |
+
# train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)
|
src/regression/datasets/RegressionDataset.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from langdetect import detect
|
| 7 |
+
from loguru import logger
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from time import sleep
|
| 10 |
+
from transformers import BertModel, AutoTokenizer
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
import torch
|
| 13 |
+
from config import DEVICE
|
| 14 |
+
|
| 15 |
+
from src.utils.text_functions import clean_text, detect_language
|
| 16 |
+
from src.utils import (
|
| 17 |
+
get_sentiment,
|
| 18 |
+
detect_language,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
from src.regression.PL import (
|
| 22 |
+
get_bert_embedding,
|
| 23 |
+
get_concat_embedding,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
from src.utils.s3 import read_csv, save_csv
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class RegressionDataset:
|
| 33 |
+
def __init__(
|
| 34 |
+
self,
|
| 35 |
+
s3: bool = False,
|
| 36 |
+
bucket: str = "lebesgue-data-science",
|
| 37 |
+
folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data",
|
| 38 |
+
s3_folder: str = "transformers/data",
|
| 39 |
+
):
|
| 40 |
+
self.s3 = s3
|
| 41 |
+
self.bucket = bucket
|
| 42 |
+
|
| 43 |
+
if self.s3:
|
| 44 |
+
self.folder = s3_folder
|
| 45 |
+
else:
|
| 46 |
+
self.folder = folder
|
| 47 |
+
|
| 48 |
+
self.original_path = f"{self.folder}/original.csv"
|
| 49 |
+
self.untrimmed_path = f"{self.folder}/untrimmed.csv"
|
| 50 |
+
self.normalized_path = f"{self.folder}/normalized.csv"
|
| 51 |
+
self.trimmed_path = f"{self.folder}/trimmed.csv"
|
| 52 |
+
|
| 53 |
+
self.train_path = f"{self.folder}/train.csv"
|
| 54 |
+
self.val_path = f"{self.folder}/val.csv"
|
| 55 |
+
self.test_path = f"{self.folder}/test.csv"
|
| 56 |
+
|
| 57 |
+
self.text_types = ["primary", "title", "description"]
|
| 58 |
+
|
| 59 |
+
self.col_func_dict = {
|
| 60 |
+
"number": len,
|
| 61 |
+
"len": lambda texts: np.mean([len(text) for text in texts]),
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
def original(self) -> pd.DataFrame:
|
| 66 |
+
df = read_csv(path=self.original_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 67 |
+
return df
|
| 68 |
+
|
| 69 |
+
@property
|
| 70 |
+
def untrimmed(self) -> pd.DataFrame:
|
| 71 |
+
df = read_csv(path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 72 |
+
return df
|
| 73 |
+
|
| 74 |
+
@property
|
| 75 |
+
def normalized(self) -> pd.DataFrame:
|
| 76 |
+
df = read_csv(path=self.normalized_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 77 |
+
return df
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def trimmed(self) -> pd.DataFrame:
|
| 81 |
+
df = read_csv(path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 82 |
+
return df
|
| 83 |
+
|
| 84 |
+
@property
|
| 85 |
+
def train(self) -> pd.DataFrame:
|
| 86 |
+
df = read_csv(path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 87 |
+
return df
|
| 88 |
+
|
| 89 |
+
@property
|
| 90 |
+
def val(self) -> pd.DataFrame:
|
| 91 |
+
df = read_csv(path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 92 |
+
return df
|
| 93 |
+
|
| 94 |
+
@property
|
| 95 |
+
def test(self) -> pd.DataFrame:
|
| 96 |
+
df = read_csv(path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 97 |
+
return df
|
| 98 |
+
|
| 99 |
+
def normalize_untrimmed(self, group_cols: list[str] = ["text", "target", "shop_id"]) -> pd.DataFrame:
|
| 100 |
+
df = self.untrimmed
|
| 101 |
+
grouped = df.groupby(group_cols)
|
| 102 |
+
|
| 103 |
+
filters_df = grouped.agg({"impr": "sum", "spend": "sum"}).reset_index()
|
| 104 |
+
ctr = grouped.apply(lambda df: df.link_clicks.sum() / df.impr.sum())
|
| 105 |
+
ctr_df = pd.DataFrame(ctr, columns=["ctr"]).reset_index()
|
| 106 |
+
normalised = filters_df.merge(ctr_df, on=group_cols)
|
| 107 |
+
|
| 108 |
+
merged = df.merge(normalised, on=group_cols, validate="m:1", suffixes=["___", None])
|
| 109 |
+
merged.drop(list([col for col in merged.columns if "___" in col]), inplace=True, axis=1)
|
| 110 |
+
final = merged.drop_duplicates(group_cols)
|
| 111 |
+
save_csv(
|
| 112 |
+
df=final,
|
| 113 |
+
path=self.normalized_path,
|
| 114 |
+
s3=self.s3,
|
| 115 |
+
s3_args={"bucket": self.bucket},
|
| 116 |
+
)
|
| 117 |
+
return df
|
| 118 |
+
|
| 119 |
+
def expand_untrimmed(self, update_existing_columns: bool = False) -> pd.DataFrame:
|
| 120 |
+
|
| 121 |
+
df = self.untrimmed
|
| 122 |
+
|
| 123 |
+
# normalise target by adset
|
| 124 |
+
# df["ctr_norm"] = (
|
| 125 |
+
# df.groupby(["shop_id", "adset_id"])
|
| 126 |
+
# .ctr.transform(lambda x: (x - x.mean()) / x.std())
|
| 127 |
+
# .count()
|
| 128 |
+
# )
|
| 129 |
+
|
| 130 |
+
new_col_func_dict = self.col_func_dict
|
| 131 |
+
|
| 132 |
+
if not update_existing_columns:
|
| 133 |
+
new_col_func_dict = {
|
| 134 |
+
col: fun for col, fun in new_col_func_dict.items() if "primary_" + col not in df.columns
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
# get extra columns
|
| 138 |
+
for col, func in new_col_func_dict.items():
|
| 139 |
+
logger.debug(col)
|
| 140 |
+
for text_type in self.text_types:
|
| 141 |
+
df[f"{text_type}_{col}"] = df[text_type].apply(func)
|
| 142 |
+
|
| 143 |
+
df["has_text"] = df.apply(
|
| 144 |
+
lambda df: bool(df.primary_number + df.title_number + df.description_number),
|
| 145 |
+
axis=1,
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# text columns
|
| 149 |
+
df = df.apply(_get_text, axis=1)
|
| 150 |
+
df = df.apply(_get_concatinated_text, axis=1)
|
| 151 |
+
|
| 152 |
+
df["language"] = df.text.apply(detect_language)
|
| 153 |
+
df = df[df.language == "en"]
|
| 154 |
+
df = df[df.ctr.notna()]
|
| 155 |
+
|
| 156 |
+
save_csv(df=df, path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 157 |
+
|
| 158 |
+
return df
|
| 159 |
+
|
| 160 |
+
def trim(self, min_impr: int = 900, min_spend: float = 90) -> pd.DataFrame:
|
| 161 |
+
df = self.normalized
|
| 162 |
+
df = df[(df.impr >= min_impr) & (df.spend >= min_spend)]
|
| 163 |
+
df = df[df.target == "acquisition"]
|
| 164 |
+
df = df[df.aov.notna()]
|
| 165 |
+
|
| 166 |
+
df = df[df.has_text == True]
|
| 167 |
+
|
| 168 |
+
save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 169 |
+
|
| 170 |
+
return df
|
| 171 |
+
|
| 172 |
+
def expand_trimmed(
|
| 173 |
+
self, bert: BertModel = None, tokenizer: AutoTokenizer = None, add_bert_embeddings_bool: bool = False
|
| 174 |
+
) -> pd.DataFrame:
|
| 175 |
+
df = self.trimmed
|
| 176 |
+
|
| 177 |
+
# clean text
|
| 178 |
+
for col in ["text", "concat_text"]:
|
| 179 |
+
df[f"{col}_clean"] = df[col].apply(clean_text)
|
| 180 |
+
|
| 181 |
+
df["text_clean_sentiment"] = df.text_clean.apply(get_sentiment)
|
| 182 |
+
|
| 183 |
+
if add_bert_embeddings_bool:
|
| 184 |
+
if tokenizer is None or bert is None:
|
| 185 |
+
raise ValueError("tokenizer or bert is None")
|
| 186 |
+
layer_dict = {"bert": bert, "tokenizer": tokenizer}
|
| 187 |
+
df = add_bert_embeddings(df=df, save_path=self.trimmed_path, layer_dict=layer_dict)
|
| 188 |
+
|
| 189 |
+
df = df.apply(add_concat_embeddings, axis=1)
|
| 190 |
+
|
| 191 |
+
save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 192 |
+
return df
|
| 193 |
+
|
| 194 |
+
def split_into_train_and_test(
|
| 195 |
+
self,
|
| 196 |
+
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
| 197 |
+
df = self.trimmed
|
| 198 |
+
train, test = train_test_split(df, train_size=0.9, random_state=42)
|
| 199 |
+
train, val = train_test_split(train, train_size=0.85, random_state=42)
|
| 200 |
+
save_csv(df=train, path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 201 |
+
save_csv(df=val, path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 202 |
+
save_csv(df=test, path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
|
| 203 |
+
return train, val, test
|
| 204 |
+
|
| 205 |
+
def expand_normalise_trim_split(
|
| 206 |
+
self,
|
| 207 |
+
update_existing_columns: bool = False,
|
| 208 |
+
group_cols=["text", "target", "shop_id"],
|
| 209 |
+
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
| 210 |
+
self.expand_untrimmed(update_existing_columns=update_existing_columns)
|
| 211 |
+
self.normalize_untrimmed(group_cols=group_cols)
|
| 212 |
+
self.trim()
|
| 213 |
+
self.expand_trimmed()
|
| 214 |
+
train, val, test = self.split_into_train_and_test()
|
| 215 |
+
return train, val, test
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _get_text(ad: pd.Series) -> pd.Series:
    """Pick the ad's representative text: first primary, else first
    description, else first title; None when the ad has no text at all.

    Stores the result in ad["text"] and returns the mutated series.
    """
    if ad.primary_number > 0:
        chosen = ad.primary[0]
    elif ad.description_number > 0:
        chosen = ad.description[0]
    elif ad.title_number > 0:
        chosen = ad.title[0]
    else:
        chosen = None

    ad["text"] = chosen
    return ad
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _get_concatinated_text(ad: pd.Series) -> pd.Series:
    """Concatenate the ad's first primary, description and title texts (in
    that order, skipping absent ones) into ad["concat_text"]."""
    parts = []

    if ad.primary_number > 0:
        parts.append(ad.primary[0])

    if ad.description_number > 0:
        parts.append(ad.description[0])

    if ad.title_number > 0:
        parts.append(ad.title[0])

    ad["concat_text"] = "".join(parts)

    return ad
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
# Module-level convenience singletons (local-disk and S3-backed variants).
# NOTE(review): these run RegressionDataset.__init__ at import time, and the
# local variant's default folder depends on GLOBAL_PATH_TO_REPO — importing
# this module fails when that env var is unset; confirm whether lazy
# construction would be preferable.
regression_dataset = RegressionDataset()

regression_dataset_s3 = RegressionDataset(s3=True)
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
def add_bert_embeddings(df: pd.DataFrame, save_path: str, layer_dict: dict = {}, device=DEVICE) -> pd.DataFrame:
    """Add "my_bert_cls_embedding" / "my_bert_mean_embedding" columns to df.

    On CUDA the whole frame is embedded in one pass. On CPU, rows are filled
    incrementally (with throttling sleeps) and checkpointed to `save_path`
    roughly every 50 embeddings so a long run can be resumed.

    Args:
        df: frame with a "text_clean" column; mutated in place and returned.
        save_path: CSV checkpoint path for the incremental (CPU) branch.
        layer_dict: pre-built layers forwarded to get_bert_embedding (read
            only; the mutable default is kept for interface compatibility).
        device: device the embeddings are computed on.
    """
    if device == torch.device("cuda"):
        df["my_bert_cls_embedding"] = df.text_clean.apply(
            lambda text: get_bert_embedding(text=text, cls=True, layer_dict=layer_dict)
        )
        df["my_bert_mean_embedding"] = df.text_clean.apply(
            lambda text: get_bert_embedding(text=text, cls=False, layer_dict=layer_dict)
        )
        return df

    if "my_bert_cls_embedding" not in df.columns:
        df["my_bert_cls_embedding"] = None

    if "my_bert_mean_embedding" not in df.columns:
        df["my_bert_mean_embedding"] = None

    counter = 0

    # object dtype so list-valued embeddings can be stored per cell
    df["my_bert_cls_embedding"] = df["my_bert_cls_embedding"].astype(object)
    df["my_bert_mean_embedding"] = df["my_bert_mean_embedding"].astype(object)

    # NOTE(review): df.at[i, ...] assumes a clean RangeIndex (0..n-1) — confirm
    # callers reset the index. Also, cells that round-trip through CSV as NaN
    # (rather than None) will not be detected as missing by `is None`.
    for i in tqdm(range(len(df))):

        # Fixed: resume condition was `is not None`, which skipped exactly the
        # rows that still needed an embedding and recomputed finished ones.
        if df.at[i, "my_bert_cls_embedding"] is None:
            # Fixed: cls/mean flags were swapped here relative to the CUDA
            # branch (the cls column was computed with cls=False and the mean
            # column with cls=True).
            df.at[i, "my_bert_cls_embedding"] = get_bert_embedding(
                text=df.at[i, "text_clean"], cls=True, layer_dict=layer_dict
            )
            counter = counter + 1
            sleep(0.5)  # throttle

        if df.at[i, "my_bert_mean_embedding"] is None:
            df.at[i, "my_bert_mean_embedding"] = get_bert_embedding(
                text=df.at[i, "text_clean"], cls=False, layer_dict=layer_dict
            )
            counter = counter + 1
            sleep(0.5)

        # periodic checkpoint
        if counter % 50 in [0, 1]:
            df.to_csv(save_path, index=False)

    df.to_csv(save_path, index=False)

    return df
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def add_concat_embeddings(series: pd.DataFrame) -> pd.Series:
    """For each embedding variant ("cls", "mean"), concatenate the row's BERT
    embedding with its aov + sentiment features into a my_full_*_embedding
    column; returns the mutated series."""
    extra_features = {"aov": series["aov"]} | series["text_clean_sentiment"]

    for variant in ("cls", "mean"):
        variant_embedding = series[f"my_bert_{variant}_embedding"]
        series[f"my_full_{variant}_embedding"] = get_concat_embedding(
            bert_embedding=variant_embedding, other_features=extra_features
        )

    return series
|
src/regression/datasets/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .FullModelDatasetTorch import FullModelDatasetTorch
|
| 2 |
+
from .DecoderDatasetTorch import DecoderDatasetTorch
|
| 3 |
+
from .RegressionDataset import RegressionDataset, regression_dataset, regression_dataset_s3
|
src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc
ADDED
|
Binary file (1.56 kB). View file
|
|
|
src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc
ADDED
|
Binary file (1.64 kB). View file
|
|
|
src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc
ADDED
|
Binary file (9.28 kB). View file
|
|
|
src/regression/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (359 Bytes). View file
|
|
|
src/regression/datasets/__pycache__/dataset.cpython-310.pyc
ADDED
|
Binary file (1.71 kB). View file
|
|
|
src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc
ADDED
|
Binary file (1.73 kB). View file
|
|
|
src/regression/training_scripts/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .train_full_model_PL import train_full_model_PL
|
| 2 |
+
from .train_decoder_PL import train_decoder_PL
|
src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (261 Bytes). View file
|
|
|
src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc
ADDED
|
Binary file (2.44 kB). View file
|
|
|
src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc
ADDED
|
Binary file (2.6 kB). View file
|
|
|