Sina1138 committed on
Commit ·
ea770c8
1
Parent(s): 66efe3b
fix model training scripts
Browse files
- alternative_polarity/deberta/{deberta_v3_large_polarity.py → deberta_v3_base_polarity.py} +1 -1
- alternative_polarity/deberta/{deberta_v3_large_polarity_train.py → deberta_v3_base_polarity_train.py} +23 -23
- alternative_polarity/scideberta/scideberta_full_polarity_train.py +20 -9
- scibert/scibert_polarity/scibert_polarity_train.py +1 -1
alternative_polarity/deberta/{deberta_v3_large_polarity.py → deberta_v3_base_polarity.py}
RENAMED
|
@@ -16,7 +16,7 @@ from glimpse.glimpse.data_loading.Glimpse_tokenizer import glimpse_tokenizer
|
|
| 16 |
|
| 17 |
# === CONFIGURATION ===
|
| 18 |
|
| 19 |
-
MODEL_DIR = BASE_DIR / "alternative_polarity" / "deberta" / "
|
| 20 |
DATA_DIR = BASE_DIR / "glimpse" / "data" / "processed"
|
| 21 |
OUTPUT_DIR = BASE_DIR / "data" / "polarity_scored"
|
| 22 |
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 16 |
|
| 17 |
# === CONFIGURATION ===
|
| 18 |
|
| 19 |
+
MODEL_DIR = BASE_DIR / "alternative_polarity" / "deberta" / "deberta_v3_base_polarity_final_model"
|
| 20 |
DATA_DIR = BASE_DIR / "glimpse" / "data" / "processed"
|
| 21 |
OUTPUT_DIR = BASE_DIR / "data" / "polarity_scored"
|
| 22 |
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
alternative_polarity/deberta/{deberta_v3_large_polarity_train.py → deberta_v3_base_polarity_train.py}
RENAMED
|
@@ -8,37 +8,20 @@ from torch.nn import functional as F
|
|
| 8 |
|
| 9 |
from transformers import Trainer
|
| 10 |
|
| 11 |
-
class WeightedTrainer(Trainer):
|
| 12 |
-
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
|
| 13 |
-
labels = inputs.pop("labels")
|
| 14 |
-
outputs = model(**inputs)
|
| 15 |
-
logits = outputs.logits
|
| 16 |
-
weights = class_weights.to(logits.device)
|
| 17 |
-
loss = F.cross_entropy(logits, labels, weight=weights)
|
| 18 |
-
return (loss, outputs) if return_outputs else loss
|
| 19 |
-
|
| 20 |
-
|
| 21 |
|
| 22 |
# Load data
|
| 23 |
train_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_train.csv")
|
| 24 |
dev_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_dev.csv")
|
| 25 |
test_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_test.csv")
|
| 26 |
|
| 27 |
-
# Compute class weights (inverse frequency)
|
| 28 |
-
neg_weight = 1.0
|
| 29 |
-
pos_weight = train_df['label'].value_counts()[0] / train_df['label'].value_counts()[1]
|
| 30 |
-
class_weights = torch.tensor([neg_weight, pos_weight], dtype=torch.float32)
|
| 31 |
-
|
| 32 |
# Convert to HuggingFace Datasets
|
| 33 |
train_ds = Dataset.from_pandas(train_df)
|
| 34 |
dev_ds = Dataset.from_pandas(dev_df)
|
| 35 |
test_ds = Dataset.from_pandas(test_df)
|
| 36 |
|
| 37 |
# Tokenize
|
| 38 |
-
model_name = "microsoft/deberta-v3-large"
|
| 39 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 40 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
|
| 41 |
-
|
| 42 |
def tokenize(batch):
|
| 43 |
return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
|
| 44 |
|
|
@@ -52,13 +35,30 @@ dev_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"]
|
|
| 52 |
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
|
| 53 |
|
| 54 |
# Load model
|
| 55 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
# Metrics
|
| 58 |
def compute_metrics(eval_pred):
|
| 59 |
logits, labels = eval_pred
|
| 60 |
preds = np.argmax(logits, axis=1)
|
| 61 |
-
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="
|
| 62 |
acc = accuracy_score(labels, preds)
|
| 63 |
return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
|
| 64 |
|
|
@@ -66,7 +66,7 @@ def compute_metrics(eval_pred):
|
|
| 66 |
args = TrainingArguments(
|
| 67 |
output_dir="./alternative_polarity/deberta/checkpoints",
|
| 68 |
eval_strategy="epoch",
|
| 69 |
-
save_strategy="
|
| 70 |
learning_rate=2e-5,
|
| 71 |
per_device_train_batch_size=4,
|
| 72 |
per_device_eval_batch_size=8,
|
|
@@ -94,5 +94,5 @@ results = trainer.evaluate(test_ds)
|
|
| 94 |
print("Test results:", results)
|
| 95 |
|
| 96 |
# Save the model and tokenizer
|
| 97 |
-
model.save_pretrained("./alternative_polarity/deberta/
|
| 98 |
-
tokenizer.save_pretrained("./alternative_polarity/deberta/
|
|
|
|
| 8 |
|
| 9 |
from transformers import Trainer
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Load data
|
| 13 |
train_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_train.csv")
|
| 14 |
dev_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_dev.csv")
|
| 15 |
test_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_test.csv")
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Convert to HuggingFace Datasets
|
| 18 |
train_ds = Dataset.from_pandas(train_df)
|
| 19 |
dev_ds = Dataset.from_pandas(dev_df)
|
| 20 |
test_ds = Dataset.from_pandas(test_df)
|
| 21 |
|
| 22 |
# Tokenize
|
| 23 |
+
model_name = "microsoft/deberta-v3-base"
|
| 24 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
|
|
|
| 25 |
def tokenize(batch):
|
| 26 |
return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
|
| 27 |
|
|
|
|
| 35 |
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
|
| 36 |
|
| 37 |
# Load model
|
| 38 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
|
| 39 |
+
|
| 40 |
+
# Compute class weights
|
| 41 |
+
label_counts = train_df['label'].value_counts()
|
| 42 |
+
total_samples = len(train_df)
|
| 43 |
+
class_weights = torch.tensor([total_samples / (len(label_counts) * count) for count in label_counts.sort_index().values])
|
| 44 |
+
class_weights = class_weights.to(dtype=torch.float32)
|
| 45 |
+
print("Class weights:", class_weights)
|
| 46 |
+
|
| 47 |
+
class WeightedTrainer(Trainer):
|
| 48 |
+
def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
|
| 49 |
+
labels = inputs.pop("labels")
|
| 50 |
+
outputs = model(**inputs)
|
| 51 |
+
logits = outputs.logits
|
| 52 |
+
weights = class_weights.to(logits.device)
|
| 53 |
+
loss = F.cross_entropy(logits, labels, weight=weights)
|
| 54 |
+
return (loss, outputs) if return_outputs else loss
|
| 55 |
+
|
| 56 |
|
| 57 |
# Metrics
|
| 58 |
def compute_metrics(eval_pred):
|
| 59 |
logits, labels = eval_pred
|
| 60 |
preds = np.argmax(logits, axis=1)
|
| 61 |
+
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
|
| 62 |
acc = accuracy_score(labels, preds)
|
| 63 |
return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
|
| 64 |
|
|
|
|
| 66 |
args = TrainingArguments(
|
| 67 |
output_dir="./alternative_polarity/deberta/checkpoints",
|
| 68 |
eval_strategy="epoch",
|
| 69 |
+
save_strategy="epoch",
|
| 70 |
learning_rate=2e-5,
|
| 71 |
per_device_train_batch_size=4,
|
| 72 |
per_device_eval_batch_size=8,
|
|
|
|
| 94 |
print("Test results:", results)
|
| 95 |
|
| 96 |
# Save the model and tokenizer
|
| 97 |
+
model.save_pretrained("./alternative_polarity/deberta/deberta_v3_base_polarity_final_model")
|
| 98 |
+
tokenizer.save_pretrained("./alternative_polarity/deberta/deberta_v3_base_polarity_final_model")
|
alternative_polarity/scideberta/scideberta_full_polarity_train.py
CHANGED
|
@@ -24,11 +24,6 @@ train_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity
|
|
| 24 |
dev_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_dev.csv")
|
| 25 |
test_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_test.csv")
|
| 26 |
|
| 27 |
-
# Compute class weights (inverse frequency)
|
| 28 |
-
neg_weight = 1.0
|
| 29 |
-
pos_weight = train_df['label'].value_counts()[0] / train_df['label'].value_counts()[1]
|
| 30 |
-
class_weights = torch.tensor([neg_weight, pos_weight], dtype=torch.float32)
|
| 31 |
-
|
| 32 |
# Convert to HuggingFace Datasets
|
| 33 |
train_ds = Dataset.from_pandas(train_df)
|
| 34 |
dev_ds = Dataset.from_pandas(dev_df)
|
|
@@ -36,7 +31,6 @@ test_ds = Dataset.from_pandas(test_df)
|
|
| 36 |
|
| 37 |
model_name = "KISTI-AI/Scideberta-full"
|
| 38 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 39 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
|
| 40 |
|
| 41 |
def tokenize(batch):
|
| 42 |
return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
|
|
@@ -51,13 +45,30 @@ dev_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"]
|
|
| 51 |
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
|
| 52 |
|
| 53 |
# Load model
|
| 54 |
-
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# Metrics
|
| 57 |
def compute_metrics(eval_pred):
|
| 58 |
logits, labels = eval_pred
|
| 59 |
preds = np.argmax(logits, axis=1)
|
| 60 |
-
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="
|
| 61 |
acc = accuracy_score(labels, preds)
|
| 62 |
return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
|
| 63 |
|
|
@@ -65,7 +76,7 @@ def compute_metrics(eval_pred):
|
|
| 65 |
args = TrainingArguments(
|
| 66 |
output_dir="./alternative_polarity/scideberta/checkpoints",
|
| 67 |
eval_strategy="epoch",
|
| 68 |
-
save_strategy="
|
| 69 |
learning_rate=2e-5,
|
| 70 |
per_device_train_batch_size=4,
|
| 71 |
per_device_eval_batch_size=8,
|
|
|
|
| 24 |
dev_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_dev.csv")
|
| 25 |
test_df = pd.read_csv("./data/DISAPERE-main/SELFExtractedData/disapere_polarity_test.csv")
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Convert to HuggingFace Datasets
|
| 28 |
train_ds = Dataset.from_pandas(train_df)
|
| 29 |
dev_ds = Dataset.from_pandas(dev_df)
|
|
|
|
| 31 |
|
| 32 |
model_name = "KISTI-AI/Scideberta-full"
|
| 33 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
| 34 |
|
| 35 |
def tokenize(batch):
|
| 36 |
return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
|
|
|
|
| 45 |
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
|
| 46 |
|
| 47 |
# Load model
|
| 48 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
|
| 49 |
+
|
| 50 |
+
# Compute class weights
|
| 51 |
+
label_counts = train_df['label'].value_counts()
|
| 52 |
+
total_samples = len(train_df)
|
| 53 |
+
class_weights = torch.tensor([total_samples / (len(label_counts) * count) for count in label_counts.sort_index().values])
|
| 54 |
+
class_weights = class_weights.to(dtype=torch.float32)
|
| 55 |
+
print("Class weights:", class_weights)
|
| 56 |
+
|
| 57 |
+
class WeightedTrainer(Trainer):
|
| 58 |
+
def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
|
| 59 |
+
labels = inputs.pop("labels")
|
| 60 |
+
outputs = model(**inputs)
|
| 61 |
+
logits = outputs.logits
|
| 62 |
+
weights = class_weights.to(logits.device)
|
| 63 |
+
loss = F.cross_entropy(logits, labels, weight=weights)
|
| 64 |
+
return (loss, outputs) if return_outputs else loss
|
| 65 |
+
|
| 66 |
|
| 67 |
# Metrics
|
| 68 |
def compute_metrics(eval_pred):
|
| 69 |
logits, labels = eval_pred
|
| 70 |
preds = np.argmax(logits, axis=1)
|
| 71 |
+
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
|
| 72 |
acc = accuracy_score(labels, preds)
|
| 73 |
return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
|
| 74 |
|
|
|
|
| 76 |
args = TrainingArguments(
|
| 77 |
output_dir="./alternative_polarity/scideberta/checkpoints",
|
| 78 |
eval_strategy="epoch",
|
| 79 |
+
save_strategy="epoch",
|
| 80 |
learning_rate=2e-5,
|
| 81 |
per_device_train_batch_size=4,
|
| 82 |
per_device_eval_batch_size=8,
|
scibert/scibert_polarity/scibert_polarity_train.py
CHANGED
|
@@ -67,7 +67,7 @@ def compute_metrics(eval_pred):
|
|
| 67 |
args = TrainingArguments(
|
| 68 |
output_dir="./scibert/scibert_polarity/checkpoints",
|
| 69 |
eval_strategy="epoch",
|
| 70 |
-
save_strategy="
|
| 71 |
learning_rate=2e-5,
|
| 72 |
per_device_train_batch_size=8,
|
| 73 |
per_device_eval_batch_size=16,
|
|
|
|
| 67 |
args = TrainingArguments(
|
| 68 |
output_dir="./scibert/scibert_polarity/checkpoints",
|
| 69 |
eval_strategy="epoch",
|
| 70 |
+
save_strategy="epoch",
|
| 71 |
learning_rate=2e-5,
|
| 72 |
per_device_train_batch_size=8,
|
| 73 |
per_device_eval_batch_size=16,
|