# fine_tune_classifier.py
import os
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
# --- 1. Configuration ---
DATA_FILE = "df.csv"
MODEL_NAME = "mediawatch-el-climate"
MODEL_CHECKPOINT = os.getenv("MODEL_CHECKPOINT", "cvcio/roberta-el-news")
OUTPUT_DIR = MODEL_NAME + "/" + MODEL_CHECKPOINT.replace("/", "-") + "-finetuned"
NUM_EPOCHS = 4
BATCH_SIZE = 64
# --- 2. Load and Prepare the Dataset ---
print("Step 2: Loading and preparing the dataset...")
# Load your data from the CSV file
df = pd.read_csv(DATA_FILE)
# If your CSV uses different names, adjust this mapping so the DataFrame ends
# up with 'text' and 'label' columns (the rename below is a no-op when the
# columns already match)
df = df.rename(columns={'text': 'text', 'label': 'label'})
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)
# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)
# Get the list of unique labels
unique_labels = df['label'].unique().tolist()
# Create label-to-ID and ID-to-label mappings
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")
# Create a ClassLabel feature to map string labels to integer IDs
class_label_feature = ClassLabel(names=unique_labels)
# Map string labels to integer IDs (ClassLabel.str2int also accepts lists,
# so this function works with batched=True)
def map_labels(example):
    example['label'] = class_label_feature.str2int(example['label'])
    return example

dataset = dataset.map(map_labels, batched=True)
# Attach the ClassLabel feature to the encoded column. cast_column preserves
# the label order used for label2id/id2label above, whereas class_encode_column
# would re-derive sorted string names and could scramble that mapping.
dataset = dataset.cast_column("label", class_label_feature)
# Split the dataset into training (80%) and testing (20%) sets. With the
# ClassLabel column in place, a stratified split also works:
#   dataset.train_test_split(test_size=0.2, stratify_by_column="label")
train_test_split = dataset.train_test_split(test_size=0.2)
# Create a DatasetDict
raw_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test'],
})
print("Dataset prepared and split.")
print(raw_datasets)
# --- 3. Tokenization ---
print("\nStep 3: Tokenizing the text data...")
# Load the tokenizer associated with the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, model_max_length=512)
# Create a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
# Apply the tokenization to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("Tokenization complete.")
# --- 4. Model Training ---
print("\nStep 4: Setting up and training the model...")
# Load the pre-trained model, configured for our number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,  # Pass the mappings to the model
    label2id=label2id,
)
# Define a function to compute metrics during evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
    }
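# Note (an optional addition): with imbalanced classes, macro-averaged F1 is
# often more informative; it could be added to the dict above as
#   "f1_macro": f1_score(labels, predictions, average="macro"),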
# Define the training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",        # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save a checkpoint at the end of each epoch
    load_best_model_at_end=True,  # Reload the best checkpoint when training ends
)
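# Optional tweaks (assumptions, not in the original script): fp16=True enables
# mixed-precision training on CUDA GPUs, and seed=42 fixes the random seed for
# reproducible runs; both are standard TrainingArguments fields.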
# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Start the training
print("Starting training...")
trainer.train()
print("Training finished.")
# Save the final model and tokenizer
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")
# --- 5. Example Prediction ---
print("\nStep 5: Running an example prediction...")
# The trainer also saves the label mapping in the model's config, so it can be
# recovered later with: id2label = model.config.id2label
# Text to classify (Greek headline; roughly: "Drought: water levels in Pineios
# and Mornos at a worrying level – EYDAP sounds the alarm over reserves: we
# have less than half of 2019's")
new_text = "Λειψυδρία: Σε ανησυχητικό επίπεδο η στάθμη του νερού σε Πηνειό και Μόρνο – Καμπανάκι ΕΥΔΑΠ για τα αποθέματα : Έχουμε λιγότερο από τα μισά του 2019"
# Tokenize the new text (truncate to the model's 512-token limit)
inputs = tokenizer(new_text, return_tensors="pt", truncation=True)
# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}
# Get predictions (switch to eval mode to disable dropout)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
# Find the label with the highest probability
predicted_class_id = logits.argmax().item()
predicted_label = id2label[predicted_class_id]
print(f"\nText: '{new_text}'")
print(f"Predicted Label: {predicted_label}")