import os

import pandas as pd
import torch
from datasets import Dataset, DatasetDict, ClassLabel
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Step 1: Configuration
DATA_FILE = "df.csv"

MODEL_NAME = "mediawatch-el-climate"
MODEL_CHECKPOINT = os.getenv("MODEL_CHECKPOINT", "cvcio/roberta-el-news")
OUTPUT_DIR = MODEL_NAME + "/" + MODEL_CHECKPOINT.replace("/", "-") + "-finetuned"

NUM_EPOCHS = 4
BATCH_SIZE = 64
# Step 2: Load and prepare the dataset
print("Step 2: Loading and preparing the dataset...")

df = pd.read_csv(DATA_FILE)

# Rename the source columns to 'text' and 'label' here if they differ,
# then drop rows with missing values.
df = df.rename(columns={'text': 'text', 'label': 'label'})
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)

dataset = Dataset.from_pandas(df)
|
|
# Build the label <-> id mappings from the unique label strings.
unique_labels = df['label'].unique().tolist()

label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}

num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")

class_label_feature = ClassLabel(names=unique_labels)


# Convert the string labels to integer ids; with batched=True each call
# receives a list of labels, which ClassLabel.str2int handles directly.
def map_labels(batch):
    batch['label'] = class_label_feature.str2int(batch['label'])
    return batch


dataset = dataset.map(map_labels, batched=True)
# Cast the now-integer column to the ClassLabel feature so the dataset schema
# matches label2id/id2label; re-encoding it with class_encode_column would
# rebuild the mapping and could disagree with the dictionaries above.
dataset = dataset.cast_column("label", class_label_feature)
|
|
# Split into train/test (80/20).
train_test_split = dataset.train_test_split(test_size=0.2)

raw_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test'],
})

print("Dataset prepared and split.")
print(raw_datasets)
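
# Optional: since 'label' is now a ClassLabel column, the split could also be
# stratified by class, e.g. dataset.train_test_split(test_size=0.2,
# stratify_by_column="label"), to keep label proportions similar in both splits.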
|
|
|
|
# Step 3: Tokenization
print("\nStep 3: Tokenizing the text data...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=512)


# Pad/truncate every example to the model's maximum length (512 tokens).
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

print("Tokenization complete.")
|
|
# Step 4: Model setup and training
print("\nStep 4: Setting up and training the model...")

# Load the checkpoint with a fresh classification head sized to our labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


# Compute accuracy and weighted F1 on the evaluation set.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
    }
|
|
# Evaluate and save once per epoch so that load_best_model_at_end can restore
# the best checkpoint (selected by eval loss by default; set
# metric_for_best_model="f1_weighted" to select by F1 instead).
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
|
|
print("Starting training...")
trainer.train()
print("Training finished.")
|
|
# Persist the fine-tuned model to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")
|
|
|
|
# Step 5: Example prediction
print("\nStep 5: Running an example prediction...")

# Sample Greek headline about water scarcity ("Water scarcity: the water level
# in Pineios and Mornos at a worrying level – EYDAP warning about reserves:
# we have less than half of 2019's").
new_text = "Λειψυδρία: Σε ανησυχητικό επίπεδο η στάθμη του νερού σε Πηνειό και Μόρνο – Καμπανάκι ΕΥΔΑΠ για τα αποθέματα : Έχουμε λιγότερο από τα μισά του 2019"

inputs = tokenizer(new_text, return_tensors="pt", truncation=True)

# Run inference on GPU if available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
predicted_label = id2label[predicted_class_id]

print(f"\nText: '{new_text}'")
print(f"Predicted Label: {predicted_label}")