# fine_tune_classifier.py
import os
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
# --- 1. Configuration ---
DATA_FILE = "df.csv"
MODEL_NAME = "mediawatch-el-climate"
MODEL_CHECKPOINT = os.getenv("MODEL_CHECKPOINT", "cvcio/roberta-el-news")
OUTPUT_DIR = MODEL_NAME + "/" + MODEL_CHECKPOINT.replace("/", "-") + "-finetuned"
NUM_EPOCHS = 4
BATCH_SIZE = 64
# --- 2. Load and Prepare the Dataset ---
print("Step 2: Loading and preparing the dataset...")
# Load your data from the CSV file
df = pd.read_csv(DATA_FILE)
# If your CSV uses different names, adjust this mapping so the DataFrame ends
# up with 'text' and 'label' columns (the rename below is a no-op when the
# columns already match)
df = df.rename(columns={'text': 'text', 'label': 'label'})
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)
# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)
# Get the list of unique labels
unique_labels = df['label'].unique().tolist()
# Create label-to-ID and ID-to-label mappings
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")
# Create a ClassLabel feature to map string labels to integer IDs
class_label_feature = ClassLabel(names=unique_labels)
# Map string labels to integer IDs (ClassLabel.str2int also accepts lists,
# so this function works with batched=True)
def map_labels(example):
    example['label'] = class_label_feature.str2int(example['label'])
    return example

dataset = dataset.map(map_labels, batched=True)
# Attach the ClassLabel feature to the encoded column. cast_column preserves
# the label order used for label2id/id2label above, whereas class_encode_column
# would re-derive sorted string names and could scramble that mapping.
dataset = dataset.cast_column("label", class_label_feature)
# Split the dataset into training (80%) and testing (20%) sets. With the
# ClassLabel column in place, a stratified split also works:
#   dataset.train_test_split(test_size=0.2, stratify_by_column="label")
train_test_split = dataset.train_test_split(test_size=0.2)
# Create a DatasetDict
raw_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test'],
})
print("Dataset prepared and split.")
print(raw_datasets)
# --- 3. Tokenization ---
print("\nStep 3: Tokenizing the text data...")
# Load the tokenizer associated with the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=512)
# Create a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
# Apply the tokenization to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("Tokenization complete.")
# --- 4. Model Training ---
print("\nStep 4: Setting up and training the model...")
# Load the pre-trained model, configured for our number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,  # Pass the mappings to the model
    label2id=label2id,
)
# Define a function to compute metrics during evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
    }
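# Note (an optional addition): with imbalanced classes, macro-averaged F1 is
# often more informative; it could be added to the dict above as
#   "f1_macro": f1_score(labels, predictions, average="macro"),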
# Define the training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",        # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
)
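# Optional tweaks (assumptions, not in the original script): fp16=True enables
# mixed-precision training on CUDA GPUs, and seed=42 fixes the random seed for
# reproducible runs; both are standard TrainingArguments fields.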
# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Start the training
print("Starting training...")
trainer.train()
print("Training finished.")
# Save the final model and tokenizer
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")
# --- 5. Example Prediction ---
print("\nStep 5: Running an example prediction...")
# The trainer also saves the label mapping in the model's config, so it can be
# recovered later with: id2label = model.config.id2label
# Text to classify (Greek headline; roughly: "Drought: water levels in Pineios
# and Mornos at a worrying level – EYDAP sounds the alarm over reserves: we
# have less than half of 2019's")
new_text = "Λειψυδρία: Σε ανησυχητικό επίπεδο η στάθμη του νερού σε Πηνειό και Μόρνο – Καμπανάκι ΕΥΔΑΠ για τα αποθέματα : Έχουμε λιγότερο από τα μισά του 2019"
# Tokenize the new text (truncate to the model's 512-token limit)
inputs = tokenizer(new_text, return_tensors="pt", truncation=True)
# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Get predictions (switch to eval mode to disable dropout)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
# Find the label with the highest probability
predicted_class_id = logits.argmax().item()
predicted_label = id2label[predicted_class_id]
print(f"\nText: '{new_text}'")
print(f"Predicted Label: {predicted_label}")