sanjin7 commited on
Commit
cea4a4b
·
1 Parent(s): 82c0c38

Upload src/ with huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. src/MLM/__init__.py +2 -0
  2. src/MLM/__pycache__/__init__.cpython-310.pyc +0 -0
  3. src/MLM/datasets/MLMDataset.py +193 -0
  4. src/MLM/datasets/__init__.py +2 -0
  5. src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc +0 -0
  6. src/MLM/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  7. src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc +0 -0
  8. src/MLM/datasets/preprocess_dataset.py +39 -0
  9. src/MLM/mask_and_unmask.py +32 -0
  10. src/MLM/training_scripts/__init__.py +1 -0
  11. src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc +0 -0
  12. src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc +0 -0
  13. src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc +0 -0
  14. src/MLM/training_scripts/train_with_trainer.py +50 -0
  15. src/MLM/training_scripts/utils.py +9 -0
  16. src/regression/.gitignore +1 -0
  17. src/regression/HF/__init__.py +2 -0
  18. src/regression/HF/__pycache__/__init__.cpython-310.pyc +0 -0
  19. src/regression/HF/configs/FullModelConfigHF.py +25 -0
  20. src/regression/HF/configs/__init__.py +1 -0
  21. src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc +0 -0
  22. src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc +0 -0
  23. src/regression/HF/models/FullModelHF.py +43 -0
  24. src/regression/HF/models/__init__.py +1 -0
  25. src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc +0 -0
  26. src/regression/HF/models/__pycache__/__init__.cpython-310.pyc +0 -0
  27. src/regression/PL/DecoderPL.py +180 -0
  28. src/regression/PL/EncoderPL.py +116 -0
  29. src/regression/PL/FullModelPL.py +166 -0
  30. src/regression/PL/__init__.py +3 -0
  31. src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc +0 -0
  32. src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc +0 -0
  33. src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc +0 -0
  34. src/regression/PL/__pycache__/__init__.cpython-310.pyc +0 -0
  35. src/regression/__init__.py +3 -0
  36. src/regression/__pycache__/__init__.cpython-310.pyc +0 -0
  37. src/regression/datasets/DecoderDatasetTorch.py +38 -0
  38. src/regression/datasets/FullModelDatasetTorch.py +39 -0
  39. src/regression/datasets/RegressionDataset.py +313 -0
  40. src/regression/datasets/__init__.py +3 -0
  41. src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc +0 -0
  42. src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc +0 -0
  43. src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc +0 -0
  44. src/regression/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  45. src/regression/datasets/__pycache__/dataset.cpython-310.pyc +0 -0
  46. src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc +0 -0
  47. src/regression/training_scripts/__init__.py +2 -0
  48. src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc +0 -0
  49. src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc +0 -0
  50. src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc +0 -0
src/MLM/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .training_scripts.train_with_trainer import train_with_trainer
2
+ from .datasets.preprocess_dataset import preprocess_dataset
src/MLM/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (332 Bytes). View file
 
src/MLM/datasets/MLMDataset.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import pandas as pd
4
+ from sklearn.model_selection import train_test_split
5
+ from transformers import BertTokenizerFast, AutoTokenizer
6
+ from datasets import Dataset, DatasetDict, load_dataset
7
+
8
+ from src.utils import (
9
+ detect_language,
10
+ add_emoji_tokens,
11
+ add_new_line_token,
12
+ user_id,
13
+ )
14
+ from src.utils.text_functions import clean_text
15
+ from src.utils.s3 import read_csv, save_csv
16
+
17
+ load_dotenv()
18
+
19
+
20
class MLMDataset:
    """Pipeline around the ad-copy MLM pre-training data.

    Reads and writes CSVs either locally or on S3, merges/cleans/filters the
    ad copies, trains and pushes a tokenizer, and publishes the
    train/val/test ``DatasetDict`` to the Hugging Face Hub.
    """

    def __init__(
        self,
        s3: bool = False,
        bucket: str = "lebesgue-data-science",
        folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data/pretrain",
        s3_folder: str = "transformers/data/pretrain",
    ):
        """
        Args:
            s3: read/write from the S3 bucket instead of the local folder.
            bucket: S3 bucket name (only used when ``s3=True``).
            folder: local data folder. NOTE(review): the default is evaluated
                at class-definition time and requires GLOBAL_PATH_TO_REPO to
                be set, or the import itself fails — confirm.
            s3_folder: key prefix inside the bucket (only used when ``s3=True``).
        """
        self.s3 = s3
        self.bucket = bucket

        self.folder = s3_folder if self.s3 else folder

        self.primaries_path = f"{self.folder}/primaries.csv"
        self.competitors_path = f"{self.folder}/competitor_ads.csv"
        self.ad_copies_path = f"{self.folder}/ad_copies.csv"
        self.english_copies_path = f"{self.folder}/english_copies.csv"
        self.train_path = f"{self.folder}/train.csv"
        self.val_path = f"{self.folder}/val.csv"
        self.test_path = f"{self.folder}/test.csv"

        self.tokenizer_id = f"{user_id}/lebesgue_ad_tokenizer"

        self.hub_datasetdict_id = f"{user_id}/lebesgue_ad_datasets"

    @property
    def primaries(self) -> pd.DataFrame:
        """Primary ad copies; the ``value`` column may hold lists of texts."""
        return read_csv(self.primaries_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def competitors(self) -> pd.DataFrame:
        """Competitor ads with an ``ad_text`` column."""
        return read_csv(self.competitors_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def ad_copies(self) -> pd.DataFrame:
        """Deduplicated ad copies produced by :meth:`concat_and_remove_duplicates`."""
        return read_csv(self.ad_copies_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def english_copies(self) -> pd.DataFrame:
        """English-only copies; needs an explicit line terminator to parse cleanly."""
        args = {"lineterminator": "\n"}
        return read_csv(
            self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket} | args,
            pd_args=args,
        )

    @property
    def train(self) -> pd.DataFrame:
        """Training split as saved by :meth:`split_into_train_and_test`."""
        return read_csv(self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def val(self) -> pd.DataFrame:
        """Validation split as saved by :meth:`split_into_train_and_test`."""
        return read_csv(self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def test(self) -> pd.DataFrame:
        """Test split as saved by :meth:`split_into_train_and_test`."""
        return read_csv(self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})

    @property
    def datasets(self) -> DatasetDict:
        """The published DatasetDict, fetched from the Hub."""
        return load_dataset(self.hub_datasetdict_id)

    def tokenizer(self, checkpoint: str = "bert-base-uncased") -> AutoTokenizer:
        """Load the checkpoint-specific tokenizer pushed by :meth:`train_tokenizer`."""
        return AutoTokenizer.from_pretrained(f"{self.tokenizer_id}_{checkpoint}")

    def concat_and_remove_duplicates(self) -> pd.DataFrame:
        """Merge primary and competitor copies, deduplicate, save ad_copies.csv."""
        comp = self.competitors
        prim = self.primaries

        # keep only rows whose `value` actually holds a list of copies
        primaries = [primary for primary in prim.value.to_list() if isinstance(primary, list)]

        list_of_primaries = []
        for primary in primaries:
            list_of_primaries.extend(primary)

        competitors = comp.ad_text.to_list()

        ad_copies = pd.Series(list_of_primaries + competitors).drop_duplicates()
        ad_copies = pd.DataFrame(ad_copies, columns=["text"])
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        # BUGFIX: the method is annotated `-> pd.DataFrame` but returned None.
        return ad_copies

    def get_language(self) -> pd.DataFrame:
        """Annotate each copy with its detected language and persist the result."""
        ad_copies = self.ad_copies
        ad_copies["language"] = ad_copies.text.apply(detect_language)
        save_csv(
            df=ad_copies,
            path=self.ad_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return ad_copies

    def filter_english(self) -> pd.DataFrame:
        """Keep only rows detected as English and persist english_copies.csv."""
        ad_copies = self.ad_copies
        english = ad_copies[ad_copies.language == "en"]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def clean_english(self) -> pd.DataFrame:
        """Add a cleaned-text column, drop rows that become empty, and persist."""
        english = self.english_copies
        english["text_clean"] = english.text.apply(clean_text)

        # remove copies that are empty after cleaning
        english = english[english.text_clean.apply(len) != 0]
        save_csv(
            df=english,
            path=self.english_copies_path,
            s3=self.s3,
            s3_args={"bucket": self.bucket},
        )
        return english

    def train_tokenizer(self, checkpoint: str = "bert-base-uncased"):
        """Extend the *checkpoint* tokenizer with emoji/newline tokens and push it.

        BUGFIX: previously always loaded "bert-base-uncased" regardless of
        *checkpoint*, while pushing under the checkpoint-specific repo id.
        """
        tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
        tokenizer = add_emoji_tokens(tokenizer=tokenizer)
        tokenizer = add_new_line_token(tokenizer=tokenizer)

        tokenizer.push_to_hub(f"{self.tokenizer_id}_{checkpoint}")

    def get_tokenizer(self):
        # NOTE(review): this loads the bare tokenizer_id while tokenizer() and
        # train_tokenizer() use f"{tokenizer_id}_{checkpoint}" — confirm which
        # Hub repo actually exists before relying on this accessor.
        return BertTokenizerFast.from_pretrained(self.tokenizer_id)

    def split_into_train_and_test(
        self,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Split 90/10 into train/test, then 85/15 into train/val.

        Saves all three CSVs and pushes the DatasetDict to the Hub.

        Returns:
            (train, val, test) dataframes.
        """
        df = self.english_copies
        train, test = train_test_split(df, train_size=0.9, random_state=42)
        train, val = train_test_split(train, train_size=0.85, random_state=42)

        dataset_dict = DatasetDict()

        # BUGFIX: the test split was previously written to self.train_path,
        # silently overwriting train.csv and never producing test.csv.
        for split_df, local_path, dataset_dict_key in zip(
            [train, val, test],
            [self.train_path, self.val_path, self.test_path],
            ["train", "val", "test"],
        ):
            save_csv(df=split_df, path=local_path, s3=self.s3, s3_args={"bucket": self.bucket})
            dataset_dict[dataset_dict_key] = Dataset.from_pandas(split_df, preserve_index=False)

        dataset_dict.push_to_hub(self.hub_datasetdict_id)

        return train, val, test


# Ready-made accessors for the local and S3-backed data.
mlm_dataset = MLMDataset()

mlm_dataset_s3 = MLMDataset(s3=True)
src/MLM/datasets/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .preprocess_dataset import preprocess_dataset
2
+ from .MLMDataset import MLMDataset, mlm_dataset, mlm_dataset_s3
src/MLM/datasets/__pycache__/MLMDataset.cpython-310.pyc ADDED
Binary file (6.11 kB). View file
 
src/MLM/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (314 Bytes). View file
 
src/MLM/datasets/__pycache__/preprocess_dataset.cpython-310.pyc ADDED
Binary file (1.91 kB). View file
 
src/MLM/datasets/preprocess_dataset.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset, DatasetDict
2
+ from transformers import AutoTokenizer
3
+
4
def preprocess_dataset(dataset: Dataset | DatasetDict, tokenizer: AutoTokenizer) -> Dataset | DatasetDict:
    """Tokenize the raw text columns, then regroup tokens into fixed-size chunks."""

    def _tokenize_batch(examples):
        return tokenize_function(examples, tokenizer)

    tokenized = dataset.map(
        _tokenize_batch,
        batched=True,
        remove_columns=["text", "text_clean", "language"],
    )

    return tokenized.map(group_texts, batched=True)
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+ def tokenize_function(examples, tokenizer: AutoTokenizer):
19
+
20
+ result = tokenizer(examples["text"])
21
+
22
+ if tokenizer.is_fast:
23
+ result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
24
+
25
+ return result
26
+
27
+
28
def group_texts(examples, chunk_size: int = 128):
    """Re-chunk a tokenized batch into fixed-length blocks for MLM training.

    Concatenates every field across the batch, truncates to a multiple of
    *chunk_size*, slices into *chunk_size*-long pieces, and mirrors the
    input ids into "labels" (the data collator applies the actual masking).

    Args:
        examples: batch dict mapping field name -> list of per-example token lists.
        chunk_size: block length in tokens.

    Returns:
        dict with the same keys plus "labels", each a list of chunks.
    """
    # Flatten with a comprehension: the previous sum(lists, []) idiom is
    # O(n^2) in the total token count.
    concatenated_examples = {
        k: [token for seq in examples[k] for token in seq] for k in examples.keys()
    }

    total_length = len(concatenated_examples["input_ids"])
    # drop the trailing partial chunk
    total_length = (total_length // chunk_size) * chunk_size

    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()

    return result
39
+
src/MLM/mask_and_unmask.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizerFast, DataCollatorForLanguageModeling
2
+ import torch
3
+
4
+
5
def mask_and_unmask(
    text: str,
    tokenizer: AutoTokenizer | BertTokenizerFast,
    model: AutoModelForMaskedLM,
    data_collator: DataCollatorForLanguageModeling,
) -> dict:
    """Randomly mask *text* via the collator, then report the model's top-5 fill-ins.

    Returns:
        dict with key "masked_text" (the decoded masked sentence) plus one
        entry per masked position mapping the original token's text to its
        five highest-scoring predicted tokens.

    NOTE: the return annotation was `-> str` but the function has always
    returned a dict; fixed.
    """
    collator_input = tokenizer(text)
    collator_input["labels"] = collator_input["input_ids"].copy()
    collator_output = data_collator([collator_input])
    masked_text = tokenizer.decode(collator_output["input_ids"][0])

    pred_dict = {"masked_text": masked_text}

    inputs = tokenizer(masked_text, return_tensors="pt", padding="max_length", truncation=True)
    token_logits = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]).logits
    all_masked_token_index = torch.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)
    if all_masked_token_index.size()[0] != 0:
        for masked_index_token in all_masked_token_index[:, 1]:
            masked_token_logits = token_logits[0, masked_index_token, :]
            top_5_tokens = torch.argsort(masked_token_logits, descending=True)[:5].tolist()
            # NOTE(review): re-tokenizing the decoded text prepends a fresh [CLS],
            # shifting every position by +1 relative to collator_output — the
            # "- 1" presumably compensates for that; confirm with a unit test.
            value = tokenizer.decode(collator_output["labels"][0, masked_index_token - 1])
            pred_dict[value] = [tokenizer.decode(token) for token in top_5_tokens]

    return pred_dict
src/MLM/training_scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .train_with_trainer import train_with_trainer
src/MLM/training_scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (240 Bytes). View file
 
src/MLM/training_scripts/__pycache__/train_with_trainer.cpython-310.pyc ADDED
Binary file (1.48 kB). View file
 
src/MLM/training_scripts/__pycache__/utils.cpython-310.pyc ADDED
Binary file (564 Bytes). View file
 
src/MLM/training_scripts/train_with_trainer.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForMaskedLM, AutoTokenizer, TrainingArguments, Trainer
2
+ from datasets import Dataset, DatasetDict
3
+ from transformers import DataCollatorForLanguageModeling
4
+
5
+ from src.MLM.datasets.preprocess_dataset import preprocess_dataset
6
+ from src.MLM.training_scripts.utils import get_new_model_name
7
+
8
+
9
def train_with_trainer(
    model_checkpoint: str,
    tokenizer: AutoTokenizer,
    dataset: DatasetDict,
    model_name: str | None = None,
    data_collator=None,
    num_epochs: int = 3,
):
    """Fine-tune a masked-LM checkpoint on *dataset* and push the result to the Hub.

    When *data_collator* is None a DataCollatorForLanguageModeling with 15%
    masking probability is used. Metrics are reported to wandb under the
    (possibly auto-bumped) model name.
    """
    mlm_model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

    run_name = get_new_model_name(model_checkpoint=model_checkpoint, model_name=model_name)

    tokenized = preprocess_dataset(dataset=dataset, tokenizer=tokenizer)

    collator = data_collator
    if collator is None:
        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        run_name,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        push_to_hub=True,
        report_to="wandb",
        run_name=run_name,
        num_train_epochs=num_epochs,
        save_total_limit=1,
        save_strategy="epoch",
    )

    print(f"device: {training_args.device}")

    Trainer(
        model=mlm_model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["val"],
        data_collator=collator,
    ).train()
src/MLM/training_scripts/utils.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ def get_new_model_name(model_checkpoint: str, model_name: str = None) -> str:
2
+ if model_name is None:
3
+ old_version_number = int(model_checkpoint[-2:])
4
+ new_version_number = str(old_version_number + 1).zfill(2)
5
+ model_name = f"{model_checkpoint[:-2]}{new_version_number}"
6
+ elif not model_name[-2:].isnumeric():
7
+ model_name = model_name + "_00"
8
+
9
+ return model_name
src/regression/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ runs/
src/regression/HF/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .configs import *
2
+ from .models import *
src/regression/HF/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (236 Bytes). View file
 
src/regression/HF/configs/FullModelConfigHF.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+ from src.regression.PL import EncoderPL, DecoderPL
3
+ from typing import List
4
+
5
+
6
class FullModelConfigHF(PretrainedConfig):
    """HF config for the composed encoder/decoder regression model.

    Stores the Hub checkpoints needed to rebuild the tokenizer, BERT backbone
    and decoder, plus decoder options (layer norm, non-text feature names).
    """

    model_type = "full_model"

    def __init__(
        self,
        tokenizer_ckpt: str = "",
        bert_ckpt: str = "",
        decoder_ckpt: str = "",
        layer_norm: bool = True,
        nontext_features: List[str] | None = None,
        **kwargs,
    ):
        self.tokenizer_ckpt = tokenizer_ckpt
        self.bert_ckpt = bert_ckpt
        self.decoder_ckpt = decoder_ckpt
        # BUGFIX: `["aov"]` was a mutable default argument shared across every
        # config instance; build a fresh list per instance instead.
        self.nontext_features = ["aov"] if nontext_features is None else nontext_features
        self.layer_norm = layer_norm
        super().__init__(**kwargs)
src/regression/HF/configs/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .FullModelConfigHF import FullModelConfigHF
src/regression/HF/configs/__pycache__/FullModelConfigHF.cpython-310.pyc ADDED
Binary file (1.02 kB). View file
 
src/regression/HF/configs/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (240 Bytes). View file
 
src/regression/HF/models/FullModelHF.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PreTrainedModel
2
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
3
+ from pytorch_lightning.loggers import WandbLogger
4
+
5
+ from src.regression.PL import FullModelPL, EncoderPL, DecoderPL
6
+ from src.regression.HF.configs import FullModelConfigHF
7
+
8
+ from config import DEVICE
9
+
10
+
11
class FullModelHF(PreTrainedModel):
    """Hugging Face wrapper that rebuilds the Lightning FullModelPL from a config.

    Loads the tokenizer and an MLM-pretrained backbone from the Hub, pulls the
    decoder checkpoint from a wandb artifact, and composes them into FullModelPL.
    """

    config_class = FullModelConfigHF

    def __init__(self, config):

        super().__init__(config)

        # Tokenizer + MLM model; only the DistilBERT body of the MLM model is kept.
        self.tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_ckpt)
        mlm_bert = AutoModelForMaskedLM.from_pretrained(config.bert_ckpt)
        self.bert = mlm_bert.distilbert

        encoder = EncoderPL(tokenizer=self.tokenizer, bert=self.bert).to(DEVICE)

        # Logger is created here only to fetch the decoder artifact below.
        wandb_logger = WandbLogger(
            project="transformers",
            entity="sanjin_juric_fot",
            # log_model=True,
            # reinit=True,
        )

        # Download the trained decoder checkpoint from wandb and restore it.
        artifact = wandb_logger.use_artifact(config.decoder_ckpt)
        artifact_dir = artifact.download()
        decoder = DecoderPL.load_from_checkpoint(artifact_dir + "/" + "model.ckpt").to(DEVICE)

        self.model = FullModelPL(
            encoder=encoder,
            decoder=decoder,
            layer_norm=config.layer_norm,
            nontext_features=config.nontext_features,
        ).to(DEVICE)

    def forward(self, input):
        # NOTE(review): delegates to FullModelPL._get_loss, so the forward pass
        # returns a {"loss", "pred", "act"} dict rather than logits — confirm
        # callers expect that shape.
        return self.model._get_loss(input)
src/regression/HF/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .FullModelHF import FullModelHF
src/regression/HF/models/__pycache__/FullModelHF.cpython-310.pyc ADDED
Binary file (1.64 kB). View file
 
src/regression/HF/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (233 Bytes). View file
 
src/regression/PL/DecoderPL.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import emoji
2
+ import numpy as np
3
+ import pytorch_lightning as pl
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from loguru import logger
7
+ from torch import nn
8
+ from torch.optim.lr_scheduler import CosineAnnealingLR
9
+ from torchmetrics import R2Score
10
+
11
+ from src.utils import get_sentiment
12
+ from src.utils.neural_networks import set_layer
13
+ from config import DEVICE
14
+
15
+ torch.set_default_dtype(torch.float32)
16
+
17
+
18
class DecoderPL(pl.LightningModule):
    """Feed-forward regression head.

    Maps a (batch, 1, input_dim) embedding through four Linear layers with
    LeakyReLU activations (and optional LayerNorm) down to a scalar
    prediction, trained with MSE and a cosine-annealed AdamW optimizer.
    """

    def __init__(
        self,
        input_dim: int = 774,
        layer_norm: bool = True,
        layer_dict: dict | None = None,
        device=DEVICE,
        T_max: int = 10,
        start_lr: float = 5 * 1e-4,
    ):
        """
        Args:
            input_dim: width of the input embedding.
            layer_norm: insert LayerNorm after each hidden Linear layer.
            layer_dict: optional per-name layer overrides consumed by set_layer.
            device: kept for interface compatibility; not used directly here.
            T_max: CosineAnnealingLR period.
            start_lr: initial AdamW learning rate.
        """
        super().__init__()

        # BUGFIX: `layer_dict: dict = {}` was a mutable default shared across
        # all instances; use a fresh dict per instance.
        layer_dict = {} if layer_dict is None else layer_dict

        # layers — each may be overridden by name via layer_dict
        self.linear1 = set_layer(
            layer_dict=layer_dict,
            name="linear1",
            alternative=nn.Linear(in_features=input_dim, out_features=512),
        )

        self.linear2 = set_layer(
            layer_dict=layer_dict,
            name="linear2",
            alternative=nn.Linear(in_features=512, out_features=264),
        )

        self.linear3 = set_layer(
            layer_dict=layer_dict,
            name="linear3",
            alternative=nn.Linear(in_features=264, out_features=64),
        )

        self.linear4 = set_layer(
            layer_dict=layer_dict,
            name="linear4",
            alternative=nn.Linear(in_features=64, out_features=1),
        )

        self.activation = nn.LeakyReLU(negative_slope=0.01)

        # self.layers is a plain list describing forward order; the modules
        # themselves are registered via the attributes above.
        if not layer_norm:
            self.layers = [
                self.linear1,
                self.activation,
                self.linear2,
                self.activation,
                self.linear3,
                self.activation,
                self.linear4,
            ]
        else:
            self.layernorm1 = nn.LayerNorm(normalized_shape=(1, self.linear1.out_features))
            self.layernorm2 = nn.LayerNorm(normalized_shape=(1, self.linear2.out_features))
            self.layernorm3 = nn.LayerNorm(normalized_shape=(1, self.linear3.out_features))
            self.layers = [
                self.linear1,
                self.layernorm1,
                self.activation,
                self.linear2,
                self.layernorm2,
                self.activation,
                self.linear3,
                self.layernorm3,
                self.activation,
                self.linear4,
            ]

        # initialize weights (only Linear layers are touched, see initialize_weights)
        [self.initialize_weights(layer) for layer in self.layers]

        # optimizer and scheduler
        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=start_lr)
        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)

        # else
        self.save_hyperparameters(ignore=["model"])
        self.MSE = nn.MSELoss()
        self.R2 = R2Score()

    def initialize_weights(self, module):
        """Xavier-init Linear weights and set biases to a small constant."""
        if isinstance(module, nn.Linear):
            logger.debug("linear weights initialized")
            torch.nn.init.xavier_uniform_(module.weight)
            module.bias.data.fill_(0.01)

    def forward(self, x: torch.Tensor):
        """Run the layer stack; accepts (batch, dim) or (batch, 1, dim) input."""
        if x.dim() == 2:
            x = x.unsqueeze(dim=1)

        for layer in self.layers:
            x = layer(x)

        x = x.squeeze()

        # squeeze() on a batch of 1 yields a 0-d tensor; restore one dim
        if x.dim() == 0:
            x = x.unsqueeze(dim=0)

        return x.to(torch.float32)

    def training_step(self, batch):

        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        pred = loss_and_metrics["pred"]
        act = loss_and_metrics["act"]
        loss = loss_and_metrics["loss"]

        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)

        return {"loss": loss, "pred": pred, "act": act}

    def configure_optimizers(self):

        optimizer = self.optimizer
        scheduler = self.scheduler
        return dict(optimizer=optimizer, lr_scheduler=scheduler)

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        """Step the scheduler, passing the monitored metric when one exists."""
        logger.debug(scheduler)
        if metric is None:
            scheduler.step()
        else:
            scheduler.step(metric)

    def validation_step(self, batch, batch_idx):
        """used for logging metrics"""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        loss = loss_and_metrics["loss"]

        # Log loss and metric
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)

    def training_epoch_end(self, training_step_outputs):
        """Compute an epoch-level R2 from the collected train-step outputs."""
        training_step_outputs = list(training_step_outputs)

        # NOTE(review): drops the last batch — presumably because a partial
        # final batch cannot be stacked with the rest; confirm.
        training_step_outputs.pop()

        output_dict = {k: [dic[k] for dic in training_step_outputs] for k in training_step_outputs[0]}

        pred = torch.stack(output_dict["pred"])
        act = torch.stack(output_dict["act"])

        loss = torch.sub(pred, act)
        loss_sq = torch.square(loss)

        # R2 = 1 - RSS/TSS computed from raw residuals
        TSS = float(torch.var(act, unbiased=False))
        RSS = float(torch.mean(loss_sq))
        R2 = 1 - RSS / TSS

        self.log("train_R2", R2, prog_bar=True, logger=True)

    def _get_loss(self, batch, get_metrics: bool = False):
        """convenience function since train/valid/test steps are similar"""
        pred = self.forward(x=batch["embedding"]).to(torch.float32)

        act, loss = None, None

        # inference batches may lack the target; loss stays None then
        if "ctr" in batch.keys():
            act = batch["ctr"].to(torch.float32)
            loss = self.MSE(pred, act).to(torch.float32)

        return {"loss": loss, "pred": pred, "act": act}
src/regression/PL/EncoderPL.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import emoji
2
+ import numpy as np
3
+ import pytorch_lightning as pl
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from loguru import logger
7
+ from torch import nn
8
+ from torch.optim.lr_scheduler import CosineAnnealingLR
9
+ from torchmetrics import R2Score
10
+ from transformers import BertModel, BertTokenizer, DistilBertModel, AutoModel, AutoTokenizer
11
+ from pytorch_lightning import LightningModule
12
+
13
+
14
+ from src.utils.neural_networks import set_layer
15
+ from src.utils import add_emoji_tokens, add_new_line_token, vectorise_dict
16
+ from config import DEVICE
17
+
18
+ torch.set_default_dtype(torch.float32)
19
+
20
+
21
class EncoderPL(pl.LightningModule):
    """Text encoder: tokenizer + BERT backbone producing a sentence embedding.

    When no tokenizer is supplied, a BertTokenizer is built from *model_name*,
    extended with emoji/newline tokens, and the backbone's embedding table is
    resized to match.
    """

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        tokenizer: AutoTokenizer | None = None,
        bert: AutoModel | None = None,
        cls: bool = False,
        device=DEVICE,
    ):
        super().__init__()

        # _device: LightningModule already exposes a `device` property, so the
        # constructor arg is stored under a private name.
        self._device = device
        self.cls = cls
        self.model_name = model_name

        # layers

        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained(model_name)

        self.bert = bert if bert is not None else BertModel.from_pretrained(model_name)

        # Only the self-built tokenizer gets the extra tokens; a supplied
        # tokenizer is assumed to already match the supplied backbone.
        if tokenizer is None:
            self.tokenizer = add_emoji_tokens(self.tokenizer)
            self.tokenizer = add_new_line_token(self.tokenizer)
            self.bert.resize_token_embeddings(len(self.tokenizer))

        # optimizer and scheduler
        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=1e-3)

        # config tweaking
        self.bert.config.torch_dtype = "float32"

    def forward(self, text: str):
        """Encode *text* into a (batch, 1, hidden) sentence embedding."""

        # run text through bert and squash the output to get embeddings
        encoded = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(self._device)

        # DistilBERT takes no token_type_ids
        if type(self.bert) == DistilBertModel:
            encoded.pop("token_type_ids")

        bert_output = self.bert(**encoded)

        if self.cls:
            # [CLS]-based embedding: pooler output when the model has one,
            # otherwise the raw [CLS] hidden state.
            if hasattr(bert_output, "pooler_output") and bert_output.pooler_output is not None:
                embedding = bert_output.pooler_output.unsqueeze(dim=1)
            else:
                embedding = bert_output.last_hidden_state[0, 0, :].unsqueeze(dim=0).unsqueeze(dim=0)
        else:
            last_hidden_state = bert_output.last_hidden_state

            if last_hidden_state.dim() == 2:
                last_hidden_state = last_hidden_state.unsqueeze(dim=0)

            # (B,1,512) mask @ (B,512,H) hidden -> attention-masked SUM of
            # token states. NOTE(review): the 512 here assumes the tokenizer's
            # model_max_length (padding="max_length") is exactly 512 — confirm.
            embedding = torch.matmul(
                encoded["attention_mask"].type(torch.float32).view(-1, 1, 512),
                last_hidden_state,
            )

        return embedding

    def configure_optimizers(self):
        return self.optimizer
83
+
84
+
85
def get_bert_embedding(
    text: str, as_list: bool = True, cls: bool = False, device=DEVICE, layer_dict: dict | None = None
) -> list:
    """Embed *text* with a freshly constructed EncoderPL.

    Args:
        text: input text.
        as_list: return a plain Python list instead of a tensor.
        cls: use the [CLS]/pooler embedding instead of the masked sum.
        device: device the encoder runs on.
        layer_dict: kept for backward compatibility but IGNORED —
            EncoderPL.__init__ takes no such parameter, and forwarding it
            (as the previous code did) raised TypeError on every call.
    """
    encoder = EncoderPL(cls=cls).to(device)
    embedding = encoder.forward(text)

    if as_list:
        embedding = embedding.tolist()[0][0]

    return embedding
95
+
96
+
97
def get_concat_embedding(
    text: str | None = None,
    bert_embedding: list | None = None,
    other_features: dict | None = None,
    cls: bool = False,
    device=DEVICE,
    layer_dict: dict | None = None,
) -> list:
    """Concatenate a BERT sentence embedding with vectorised non-text features.

    When *bert_embedding* is empty/absent, it is computed from *text*.

    Raises:
        ValueError: if both *text* and *bert_embedding* are empty.
    """
    # BUGFIX: [] and {} defaults were mutable objects shared across calls;
    # use None sentinels and build fresh containers per call.
    bert_embedding = [] if bert_embedding is None else bert_embedding
    other_features = {} if other_features is None else other_features
    layer_dict = {} if layer_dict is None else layer_dict

    if not len(bert_embedding):

        if text is None:
            raise ValueError("both text and embedding are empty!")
        bert_embedding = get_bert_embedding(text=text, cls=cls, device=device, layer_dict=layer_dict)

    other_features = vectorise_dict(other_features, as_list=True)

    concat_vec = bert_embedding + other_features

    return concat_vec
src/regression/PL/FullModelPL.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import emoji
2
+ import numpy as np
3
+ import pytorch_lightning as pl
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from loguru import logger
7
+ from torch import nn
8
+ from torch.optim.lr_scheduler import CosineAnnealingLR
9
+ from torchmetrics import R2Score
10
+ from transformers import BertModel, BertTokenizerFast
11
+
12
+ from src.utils import get_sentiment, vectorise_dict
13
+ from src.utils.neural_networks import set_layer
14
+ from config import DEVICE
15
+
16
+ from .DecoderPL import DecoderPL
17
+ from .EncoderPL import EncoderPL
18
+
19
+
20
+ torch.set_default_dtype(torch.float32)
21
+
22
+
23
class FullModelPL(pl.LightningModule):
    """End-to-end regression model.

    Encodes text with EncoderPL, appends sentiment scores and the non-text
    features from the batch, and decodes the concatenated vector to a scalar
    (CTR) with DecoderPL. The BERT backbone is frozen in configure_optimizers.
    """

    def __init__(
        self,
        model_name: str = "bert-base-uncased",
        nontext_features: list[str] | None = None,
        encoder: EncoderPL | None = None,
        decoder: DecoderPL | None = None,
        layer_norm: bool = True,
        device=DEVICE,
        T_max: int = 10,
    ):
        """
        Args:
            model_name: backbone checkpoint used when *encoder* is None.
            nontext_features: names of extra scalar features in the batch
                (annotation fixed from the mutable `["aov"]` default).
            encoder: pre-built encoder; built from *model_name* when None.
            decoder: pre-built decoder; built to match the input width when None.
            layer_norm: forwarded to the default DecoderPL.
            device: target device for the submodules.
            T_max: CosineAnnealingLR period.
        """
        super().__init__()

        # BUGFIX: `["aov"]` was a mutable default shared across instances.
        nontext_features = ["aov"] if nontext_features is None else nontext_features

        # layers
        self.encoder = (
            encoder.to(self.device)
            if encoder is not None
            else EncoderPL(model_name=model_name, device=device).to(self.device)
        )
        self.decoder = (
            decoder.to(self.device)
            if decoder is not None
            # 768 BERT dims + non-text features + 5 sentiment scores
            # (NOTE(review): the 5 must match get_sentiment's output — confirm)
            else DecoderPL(
                input_dim=768 + len(nontext_features) + 5,
                layer_norm=layer_norm,
                device=device,
            ).to(self.device)
        )

        # else
        self.MSE = nn.MSELoss()
        self.R2 = R2Score()

        self.optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, self.parameters()), lr=3 * 1e-4)
        self.scheduler = CosineAnnealingLR(self.optimizer, T_max=T_max)

    def forward(self, input_dict: dict):
        """Encode text, append sentiment + non-text features, decode to a scalar."""
        input_dict = input_dict.copy()
        text = input_dict.pop("text")

        # the target must not leak into the feature vector
        if "ctr" in input_dict:
            input_dict.pop("ctr")

        # encode
        sentence_embedding = self.encoder.forward(text=text)

        # sentiment
        sentiment = get_sentiment_for_list_of_texts(text)
        input_dict = input_dict | sentiment

        input_dict = {k: v.to(self.device) for k, v in input_dict.items()}

        # concat nontext features to embedding
        nontext_vec = vectorise_dict(input_dict)
        nontext_tensor = torch.stack(nontext_vec).T.unsqueeze(1).to(torch.float32)
        # (debug prints of text and device placement removed; use logger instead)
        logger.debug(f"devices: {sentence_embedding.get_device()}, {nontext_tensor.get_device()}")
        x = torch.cat((sentence_embedding, nontext_tensor), 2)

        # decode
        result = self.decoder.forward(x)
        return result

    def training_step(self, batch):

        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        pred = loss_and_metrics["pred"]
        act = loss_and_metrics["act"]
        loss = loss_and_metrics["loss"]

        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, logger=True)

        return {"loss": loss, "pred": pred, "act": act}

    def configure_optimizers(self):
        # freeze the BERT backbone: only encoder head / decoder weights train
        for name, param in self.named_parameters():
            if "bert" in name:
                param.requires_grad = False

        optimizer = self.optimizer
        scheduler = self.scheduler
        return dict(optimizer=optimizer, lr_scheduler=scheduler)

    def lr_scheduler_step(self, scheduler, optimizer_idx, metric):
        """Step the scheduler, passing the monitored metric when one exists."""
        logger.debug(scheduler)
        if metric is None:
            scheduler.step()
        else:
            scheduler.step(metric)

    def validation_step(self, batch, batch_idx):
        """used for logging metrics"""
        loss_and_metrics = self._get_loss(batch, get_metrics=True)
        loss = loss_and_metrics["loss"]

        # Log loss and metric
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)

    def training_epoch_end(self, training_step_outputs):
        """Compute an epoch-level R2 from the collected train-step outputs."""
        training_step_outputs = list(training_step_outputs)

        # NOTE(review): drops the last batch — presumably because a partial
        # final batch cannot be stacked with the rest; confirm.
        training_step_outputs.pop()

        output_dict = {k: [dic[k] for dic in training_step_outputs] for k in training_step_outputs[0]}

        pred = torch.stack(output_dict["pred"])
        act = torch.stack(output_dict["act"])

        loss = torch.sub(pred, act)
        loss_sq = torch.square(loss)

        # R2 = 1 - RSS/TSS computed from raw residuals
        TSS = float(torch.var(act, unbiased=False))
        RSS = float(torch.mean(loss_sq))
        R2 = 1 - RSS / TSS

        self.log("train_R2", R2, prog_bar=True, logger=True)

    def _get_loss(self, batch, get_metrics: bool = False):
        """convenience function since train/valid/test steps are similar"""
        pred = self.forward(input_dict=batch).to(torch.float32)

        act, loss = None, None

        # inference batches may lack the target; loss stays None then
        if "ctr" in batch.keys():
            act = batch["ctr"].to(torch.float32).to(self.device)
            loss = self.MSE(pred, act).to(torch.float32)

        return {"loss": loss, "pred": pred, "act": act}
161
+
162
+
163
def get_sentiment_for_list_of_texts(texts: list[str]) -> dict:
    """Run sentiment analysis per text and stack each score key into a tensor."""
    per_text = [get_sentiment(t) for t in texts]
    return {key: torch.Tensor([scores[key] for scores in per_text]) for key in per_text[0]}
src/regression/PL/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .FullModelPL import FullModelPL
2
+ from .DecoderPL import DecoderPL
3
+ from .EncoderPL import EncoderPL, get_concat_embedding, get_bert_embedding
src/regression/PL/__pycache__/DecoderPL.cpython-310.pyc ADDED
Binary file (5.34 kB). View file
 
src/regression/PL/__pycache__/EncoderPL.cpython-310.pyc ADDED
Binary file (3.53 kB). View file
 
src/regression/PL/__pycache__/FullModelPL.cpython-310.pyc ADDED
Binary file (5.87 kB). View file
 
src/regression/PL/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (352 Bytes). View file
 
src/regression/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .datasets import *
2
+ from .training_scripts import *
3
+ from .PL import *
src/regression/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (258 Bytes). View file
 
src/regression/datasets/DecoderDatasetTorch.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+
6
+
7
class DecoderDatasetTorch(Dataset):
    """Regression dataset serving precomputed ad embeddings and CTR targets."""

    def __init__(self, df: pd.DataFrame, embedding_column: str = "my_full_mean_embedding"):
        """
        Args:
            df (pd.DataFrame): dataframe with ads
            embedding_column (str, optional): column whose values to output in
                __getitem__. Defaults to 'my_full_mean_embedding'.
                (The original docstring misstated the default as
                'full_mean_embedding'.)
        """
        self.df = df
        self.embedding_column = embedding_column

        # Cast both columns to float32 element-wise; applymap is used because
        # the embedding column may hold array-valued cells.
        # NOTE: this mutates the caller's DataFrame in place.
        df[[embedding_column, "ctr"]] = df[[embedding_column, "ctr"]].applymap(lambda x: np.float32(x))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        embedding = self.df.loc[idx, self.embedding_column]
        ctr = self.df.loc[idx, "ctr"]

        return {"embedding": embedding, "ctr": ctr}
35
+
36
+
37
+ # tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
38
+ # train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)
src/regression/datasets/FullModelDatasetTorch.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ from torch.utils.data import DataLoader, Dataset
5
+
6
+
7
class FullModelDatasetTorch(Dataset):
    """Regression dataset serving raw ad text, CTR target and extra numeric features."""

    def __init__(self, df: pd.DataFrame, nontext_features: list[str] = None):
        """
        Args:
            df (pd.DataFrame): train dataframe
            nontext_features (list[str], optional): numeric features to expose
                alongside the text embeddings. Defaults to ["aov"].
                (None is used as the sentinel to avoid the shared
                mutable-default-argument pitfall of the original.)
        """
        if nontext_features is None:
            nontext_features = ["aov"]

        self.df = df
        self.nontext_features = nontext_features

        # Cast targets and numeric features to float32 in place
        # (mutates the caller's DataFrame).
        df[nontext_features + ["ctr"]] = df[nontext_features + ["ctr"]].astype(np.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        text = self.df.loc[idx, "text_clean"]
        ctr = self.df.loc[idx, "ctr"]

        nontext = {feature: self.df.loc[idx, feature] for feature in self.nontext_features}

        return {"text": text, "ctr": ctr} | nontext
36
+
37
+
38
+ # tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
39
+ # train_dataset = AdDataset(df=dataset.train, tokenizer=tokenizer)
src/regression/datasets/RegressionDataset.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from dotenv import load_dotenv
6
+ from langdetect import detect
7
+ from loguru import logger
8
+ from sklearn.model_selection import train_test_split
9
+ from time import sleep
10
+ from transformers import BertModel, AutoTokenizer
11
+ from tqdm import tqdm
12
+ import torch
13
+ from config import DEVICE
14
+
15
+ from src.utils.text_functions import clean_text, detect_language
16
+ from src.utils import (
17
+ get_sentiment,
18
+ detect_language,
19
+ )
20
+
21
+ from src.regression.PL import (
22
+ get_bert_embedding,
23
+ get_concat_embedding,
24
+ )
25
+
26
+ from src.utils.s3 import read_csv, save_csv
27
+
28
+
29
+ load_dotenv()
30
+
31
+
32
+ class RegressionDataset:
33
+ def __init__(
34
+ self,
35
+ s3: bool = False,
36
+ bucket: str = "lebesgue-data-science",
37
+ folder: str = os.getenv("GLOBAL_PATH_TO_REPO") + "/data",
38
+ s3_folder: str = "transformers/data",
39
+ ):
40
+ self.s3 = s3
41
+ self.bucket = bucket
42
+
43
+ if self.s3:
44
+ self.folder = s3_folder
45
+ else:
46
+ self.folder = folder
47
+
48
+ self.original_path = f"{self.folder}/original.csv"
49
+ self.untrimmed_path = f"{self.folder}/untrimmed.csv"
50
+ self.normalized_path = f"{self.folder}/normalized.csv"
51
+ self.trimmed_path = f"{self.folder}/trimmed.csv"
52
+
53
+ self.train_path = f"{self.folder}/train.csv"
54
+ self.val_path = f"{self.folder}/val.csv"
55
+ self.test_path = f"{self.folder}/test.csv"
56
+
57
+ self.text_types = ["primary", "title", "description"]
58
+
59
+ self.col_func_dict = {
60
+ "number": len,
61
+ "len": lambda texts: np.mean([len(text) for text in texts]),
62
+ }
63
+
64
+ @property
65
+ def original(self) -> pd.DataFrame:
66
+ df = read_csv(path=self.original_path, s3=self.s3, s3_args={"bucket": self.bucket})
67
+ return df
68
+
69
+ @property
70
+ def untrimmed(self) -> pd.DataFrame:
71
+ df = read_csv(path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
72
+ return df
73
+
74
+ @property
75
+ def normalized(self) -> pd.DataFrame:
76
+ df = read_csv(path=self.normalized_path, s3=self.s3, s3_args={"bucket": self.bucket})
77
+ return df
78
+
79
+ @property
80
+ def trimmed(self) -> pd.DataFrame:
81
+ df = read_csv(path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
82
+ return df
83
+
84
+ @property
85
+ def train(self) -> pd.DataFrame:
86
+ df = read_csv(path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
87
+ return df
88
+
89
+ @property
90
+ def val(self) -> pd.DataFrame:
91
+ df = read_csv(path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
92
+ return df
93
+
94
+ @property
95
+ def test(self) -> pd.DataFrame:
96
+ df = read_csv(path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
97
+ return df
98
+
99
+ def normalize_untrimmed(self, group_cols: list[str] = ["text", "target", "shop_id"]) -> pd.DataFrame:
100
+ df = self.untrimmed
101
+ grouped = df.groupby(group_cols)
102
+
103
+ filters_df = grouped.agg({"impr": "sum", "spend": "sum"}).reset_index()
104
+ ctr = grouped.apply(lambda df: df.link_clicks.sum() / df.impr.sum())
105
+ ctr_df = pd.DataFrame(ctr, columns=["ctr"]).reset_index()
106
+ normalised = filters_df.merge(ctr_df, on=group_cols)
107
+
108
+ merged = df.merge(normalised, on=group_cols, validate="m:1", suffixes=["___", None])
109
+ merged.drop(list([col for col in merged.columns if "___" in col]), inplace=True, axis=1)
110
+ final = merged.drop_duplicates(group_cols)
111
+ save_csv(
112
+ df=final,
113
+ path=self.normalized_path,
114
+ s3=self.s3,
115
+ s3_args={"bucket": self.bucket},
116
+ )
117
+ return df
118
+
119
+ def expand_untrimmed(self, update_existing_columns: bool = False) -> pd.DataFrame:
120
+
121
+ df = self.untrimmed
122
+
123
+ # normalise target by adset
124
+ # df["ctr_norm"] = (
125
+ # df.groupby(["shop_id", "adset_id"])
126
+ # .ctr.transform(lambda x: (x - x.mean()) / x.std())
127
+ # .count()
128
+ # )
129
+
130
+ new_col_func_dict = self.col_func_dict
131
+
132
+ if not update_existing_columns:
133
+ new_col_func_dict = {
134
+ col: fun for col, fun in new_col_func_dict.items() if "primary_" + col not in df.columns
135
+ }
136
+
137
+ # get extra columns
138
+ for col, func in new_col_func_dict.items():
139
+ logger.debug(col)
140
+ for text_type in self.text_types:
141
+ df[f"{text_type}_{col}"] = df[text_type].apply(func)
142
+
143
+ df["has_text"] = df.apply(
144
+ lambda df: bool(df.primary_number + df.title_number + df.description_number),
145
+ axis=1,
146
+ )
147
+
148
+ # text columns
149
+ df = df.apply(_get_text, axis=1)
150
+ df = df.apply(_get_concatinated_text, axis=1)
151
+
152
+ df["language"] = df.text.apply(detect_language)
153
+ df = df[df.language == "en"]
154
+ df = df[df.ctr.notna()]
155
+
156
+ save_csv(df=df, path=self.untrimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
157
+
158
+ return df
159
+
160
+ def trim(self, min_impr: int = 900, min_spend: float = 90) -> pd.DataFrame:
161
+ df = self.normalized
162
+ df = df[(df.impr >= min_impr) & (df.spend >= min_spend)]
163
+ df = df[df.target == "acquisition"]
164
+ df = df[df.aov.notna()]
165
+
166
+ df = df[df.has_text == True]
167
+
168
+ save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
169
+
170
+ return df
171
+
172
+ def expand_trimmed(
173
+ self, bert: BertModel = None, tokenizer: AutoTokenizer = None, add_bert_embeddings_bool: bool = False
174
+ ) -> pd.DataFrame:
175
+ df = self.trimmed
176
+
177
+ # clean text
178
+ for col in ["text", "concat_text"]:
179
+ df[f"{col}_clean"] = df[col].apply(clean_text)
180
+
181
+ df["text_clean_sentiment"] = df.text_clean.apply(get_sentiment)
182
+
183
+ if add_bert_embeddings_bool:
184
+ if tokenizer is None or bert is None:
185
+ raise ValueError("tokenizer or bert is None")
186
+ layer_dict = {"bert": bert, "tokenizer": tokenizer}
187
+ df = add_bert_embeddings(df=df, save_path=self.trimmed_path, layer_dict=layer_dict)
188
+
189
+ df = df.apply(add_concat_embeddings, axis=1)
190
+
191
+ save_csv(df=df, path=self.trimmed_path, s3=self.s3, s3_args={"bucket": self.bucket})
192
+ return df
193
+
194
+ def split_into_train_and_test(
195
+ self,
196
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
197
+ df = self.trimmed
198
+ train, test = train_test_split(df, train_size=0.9, random_state=42)
199
+ train, val = train_test_split(train, train_size=0.85, random_state=42)
200
+ save_csv(df=train, path=self.train_path, s3=self.s3, s3_args={"bucket": self.bucket})
201
+ save_csv(df=val, path=self.val_path, s3=self.s3, s3_args={"bucket": self.bucket})
202
+ save_csv(df=test, path=self.test_path, s3=self.s3, s3_args={"bucket": self.bucket})
203
+ return train, val, test
204
+
205
+ def expand_normalise_trim_split(
206
+ self,
207
+ update_existing_columns: bool = False,
208
+ group_cols=["text", "target", "shop_id"],
209
+ ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
210
+ self.expand_untrimmed(update_existing_columns=update_existing_columns)
211
+ self.normalize_untrimmed(group_cols=group_cols)
212
+ self.trim()
213
+ self.expand_trimmed()
214
+ train, val, test = self.split_into_train_and_test()
215
+ return train, val, test
216
+
217
+
218
+ def _get_text(ad: pd.Series) -> pd.Series:
219
+
220
+ if ad.primary_number > 0:
221
+ ad["text"] = ad.primary[0]
222
+
223
+ elif ad.description_number > 0:
224
+ ad["text"] = ad.description[0]
225
+
226
+ elif ad.title_number > 0:
227
+ ad["text"] = ad.title[0]
228
+
229
+ else:
230
+ ad["text"] = None
231
+
232
+ return ad
233
+
234
+
235
+ def _get_concatinated_text(ad: pd.Series) -> pd.Series:
236
+
237
+ concat_text = ""
238
+
239
+ if ad.primary_number > 0:
240
+ concat_text = concat_text + ad.primary[0]
241
+
242
+ if ad.description_number > 0:
243
+ concat_text = concat_text + ad.description[0]
244
+
245
+ if ad.title_number > 0:
246
+ concat_text = concat_text + ad.title[0]
247
+
248
+ ad["concat_text"] = concat_text
249
+
250
+ return ad
251
+
252
+
253
# Ready-made accessors: local-disk (default paths) and S3-backed variants.
regression_dataset = RegressionDataset()

regression_dataset_s3 = RegressionDataset(s3=True)
256
+
257
+
258
def add_bert_embeddings(df: pd.DataFrame, save_path: str, layer_dict: dict = None, device=DEVICE) -> pd.DataFrame:
    """Add "my_bert_cls_embedding" and "my_bert_mean_embedding" columns
    computed from `text_clean`.

    On CUDA the whole column is embedded in one pass. On CPU the loop fills
    only the missing rows, throttles with short sleeps, and checkpoints the
    frame to `save_path` periodically so a long run can resume.

    Fixes vs original: the resume check used `is not None` (recomputing
    finished rows and skipping missing ones), the `cls` flags were swapped
    between the two columns relative to the CUDA path, and `layer_dict`
    was a mutable default argument.
    """
    if layer_dict is None:
        layer_dict = {}

    if device == torch.device("cuda"):
        df["my_bert_cls_embedding"] = df.text_clean.apply(
            lambda text: get_bert_embedding(text=text, cls=True, layer_dict=layer_dict)
        )
        df["my_bert_mean_embedding"] = df.text_clean.apply(
            lambda text: get_bert_embedding(text=text, cls=False, layer_dict=layer_dict)
        )
        return df

    # CPU path: make sure both columns exist and can hold array objects.
    for col in ("my_bert_cls_embedding", "my_bert_mean_embedding"):
        if col not in df.columns:
            df[col] = None
        df[col] = df[col].astype(object)

    counter = 0

    for i in tqdm(range(len(df))):

        if df.at[i, "my_bert_cls_embedding"] is None:
            df.at[i, "my_bert_cls_embedding"] = get_bert_embedding(
                text=df.at[i, "text_clean"], cls=True, layer_dict=layer_dict
            )
            counter = counter + 1
            sleep(0.5)  # throttle between embeddings

        if df.at[i, "my_bert_mean_embedding"] is None:
            df.at[i, "my_bert_mean_embedding"] = get_bert_embedding(
                text=df.at[i, "text_clean"], cls=False, layer_dict=layer_dict
            )
            counter = counter + 1
            sleep(0.5)

        if counter % 50 in [0, 1]:  # periodic checkpoint (~every 50 embeddings)
            df.to_csv(save_path, index=False)

    df.to_csv(save_path, index=False)

    return df
302
+
303
+
304
def add_concat_embeddings(series: pd.Series) -> pd.Series:
    """Build "my_full_{cls,mean}_embedding" for one row by concatenating each
    BERT embedding with the non-text features (aov + sentiment scores).

    Meant to be applied row-wise via df.apply(..., axis=1); the original
    annotation said pd.DataFrame, but the argument is a row Series.
    """
    other_features = {"aov": series["aov"]} | series["text_clean_sentiment"]

    # (renamed loop var: the original shadowed the builtin `type`)
    for kind in ("cls", "mean"):
        bert_embedding = series[f"my_bert_{kind}_embedding"]
        series[f"my_full_{kind}_embedding"] = get_concat_embedding(
            bert_embedding=bert_embedding, other_features=other_features
        )

    return series
src/regression/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .FullModelDatasetTorch import FullModelDatasetTorch
2
+ from .DecoderDatasetTorch import DecoderDatasetTorch
3
+ from .RegressionDataset import RegressionDataset, regression_dataset, regression_dataset_s3
src/regression/datasets/__pycache__/DecoderDatasetTorch.cpython-310.pyc ADDED
Binary file (1.56 kB). View file
 
src/regression/datasets/__pycache__/FullModelDatasetTorch.cpython-310.pyc ADDED
Binary file (1.64 kB). View file
 
src/regression/datasets/__pycache__/RegressionDataset.cpython-310.pyc ADDED
Binary file (9.28 kB). View file
 
src/regression/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (359 Bytes). View file
 
src/regression/datasets/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (1.71 kB). View file
 
src/regression/datasets/__pycache__/dataset_decoder.cpython-310.pyc ADDED
Binary file (1.73 kB). View file
 
src/regression/training_scripts/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .train_full_model_PL import train_full_model_PL
2
+ from .train_decoder_PL import train_decoder_PL
src/regression/training_scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (261 Bytes). View file
 
src/regression/training_scripts/__pycache__/littrain.cpython-310.pyc ADDED
Binary file (2.44 kB). View file
 
src/regression/training_scripts/__pycache__/littrain_decoder.cpython-310.pyc ADDED
Binary file (2.6 kB). View file