# SocialMediaFoci/finetune.py
# Ensure you've run: pip install transformers datasets torch numpy tf-keras
# PyTorch should already be installed (the standard macOS arm64 wheel, e.g. 2.4.0, ships with MPS support)
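# Optional: some ops are not yet implemented on MPS. Setting this *before*
# importing torch lets those ops fall back to CPU instead of raising
# NotImplementedError (hedged: only relevant if you actually hit such an op).
import os
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")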
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, TextClassificationPipeline
from datasets import load_dataset
import numpy as np
# Check device: Use MPS if available (Apple Silicon), else CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")
# Step 1: Load the pre-trained model and tokenizer
model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
# Step 2: Load and prepare the tweet_eval sentiment dataset
dataset = load_dataset("tweet_eval", "sentiment")
# Remap labels: tweet_eval (0=negative, 1=neutral, 2=positive) to our model (0=positive, 1=neutral, 2=negative)
def remap_labels(example):
    label_map = {0: 2, 1: 1, 2: 0}  # tweet_eval negative(0) -> 2, neutral(1) -> 1, positive(2) -> 0
    example["label"] = label_map[example["label"]]
    return example
dataset = dataset.map(remap_labels)
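# Sanity check: the remap above assumes this checkpoint's id2label is
# {0: 'positive', 1: 'neutral', 2: 'negative'}; print it to confirm, along
# with one remapped example.
print(model.config.id2label)
print(dataset["train"][0])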
# Tokenize the dataset
def tokenize_function(examples):
    # Tweets are short; max_length=512 is safe but generous. A smaller value
    # (e.g. 128) would cut padding waste and speed up training considerably.
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
# Split into train and eval datasets
train_dataset = tokenized_dataset["train"]  # ~45,615 examples
eval_dataset = tokenized_dataset["test"]    # ~12,284 examples
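# Optional: for a quick smoke test on an 8GB machine, subsample first
# (a sketch using the standard datasets shuffle/select API; sizes are arbitrary):
# train_dataset = train_dataset.shuffle(seed=42).select(range(5000))
# eval_dataset = eval_dataset.select(range(1000))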
# Step 3: Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}
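# Quick self-check of the metric with toy logits: argmax picks classes 2 and 0,
# matching the labels, so accuracy should come out to exactly 1.0.
assert compute_metrics((np.array([[0.1, 0.2, 0.7], [0.9, 0.05, 0.05]]),
                        np.array([2, 0])))["accuracy"] == 1.0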
# Step 4: Set up training arguments
training_args = TrainingArguments(
    output_dir="./fine-tuned-sentiment-large",
    num_train_epochs=3,
    per_device_train_batch_size=4,  # small batches to fit in 8GB RAM
    per_device_eval_batch_size=4,   # small batches to fit in 8GB RAM
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="epoch",  # renamed from evaluation_strategy in recent transformers
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=False,  # fp16 mixed precision is not supported on MPS
    # Note: no_cuda is deprecated and maps to use_cpu=True, which would force
    # CPU and defeat the MPS device selection above. Recent Trainer versions
    # pick MPS automatically when available, so no device flag is needed here.
)
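# Optional: with batch size 4, gradient accumulation can simulate a larger
# effective batch at no extra memory cost (a sketch; 4 steps ~ effective batch 16):
# training_args.gradient_accumulation_steps = 4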
# Step 5: Initialize and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
print("Starting training...")
trainer.train()
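# If the run is interrupted, training can resume from the latest checkpoint in
# output_dir (standard Trainer API):
# trainer.train(resume_from_checkpoint=True)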
# Step 6: Save the fine-tuned model
model.save_pretrained("./fine-tuned-sentiment-large")
tokenizer.save_pretrained("./fine-tuned-sentiment-large")
print("Model saved to ./fine-tuned-sentiment-large")
# Step 7: Evaluate the model on the test set
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")
# Step 8: Test on your specific examples
classifier = TextClassificationPipeline(
    model=AutoModelForSequenceClassification.from_pretrained("./fine-tuned-sentiment-large").to(device),
    tokenizer=AutoTokenizer.from_pretrained("./fine-tuned-sentiment-large"),
    device=device,  # pipelines accept a torch.device; a bare int would be read as a CUDA ordinal
    # return_all_scores is deprecated; the default already returns only the top label per text
)
texts = ["Great service!", "It's okay, nothing special.", "Terrible experience."]
results = classifier(texts)
print("\nTesting on custom examples:")
for text, result in zip(texts, results):
    print(f"Text: {text} -> Sentiment: {result['label']}")