Spaces:
Sleeping
Sleeping
| import torch | |
| import pandas as pd | |
| from sklearn.preprocessing import LabelEncoder | |
| from datasets import Dataset | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer | |
| from transformers import RobertaConfig, RobertaForSequenceClassification | |
| from transformers import AdamW | |
| from newhead import NewClassificationHead | |
def preprocess_data(df):
    """
    Return a cleaned copy of the raw data ready for label encoding.

    The 'Comment' and 'Emotion' columns are renamed to 'text' and 'label',
    rows containing any missing value are dropped, and in both columns tabs
    become spaces, runs of spaces collapse to a single space, and surrounding
    whitespace is stripped. The DataFrame passed in is not modified.
    """
    def _tidy(column):
        # Same three-step clean-up for every string column.
        without_tabs = column.str.replace('\t', ' ')
        single_spaced = without_tabs.str.replace(' +', ' ', regex=True)
        return single_spaced.str.strip()

    cleaned = df.rename(columns={'Comment': 'text', 'Emotion': 'label'}).dropna()
    for column_name in ('text', 'label'):
        cleaned[column_name] = _tidy(cleaned[column_name])
    return cleaned
def encode_label(df):
    """
    Replace the string labels in df['label'] with integer class ids.

    A fresh LabelEncoder is fitted on the 'label' column and the column is
    overwritten in place with the encoded values; the same DataFrame object
    is returned. The fitted encoder itself is not kept.
    """
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(df['label'])
    df['label'] = encoded_labels
    return df
def generate_dataset(df, test_size=0.2):
    """
    Convert the DataFrame into train/test Datasets that can be used with transformers.

    Args:
        df: DataFrame to convert (the script passes one with 'text' and
            'label' columns).
        test_size: Size of the held-out split, forwarded to
            Dataset.train_test_split.

    Returns:
        The result of Dataset.train_test_split, which exposes the 'train' and
        'test' splits that the training code indexes.
    """
    # preserve_index=False: after dropna() the pandas index is no longer a
    # plain range and would otherwise be stored as an extra dataset column.
    dataset = Dataset.from_pandas(df, preserve_index=False)
    # Previously test_size was ignored and an unsplit Dataset was returned,
    # so ds['train'] / ds['test'] could not work.
    return dataset.train_test_split(test_size=test_size)
def tokenize(batch):
    """
    Tokenize the 'text' entries of a batch with the module-level tokenizer.

    The tokenizer is called with padding='max_length' and truncation enabled,
    and its output is returned unchanged.
    """
    texts = batch['text']
    return tokenizer(texts, padding='max_length', truncation=True)
def compute_metrics(pred):
    """
    Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores; the argmax over the last axis is the predicted class).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = weighted[2]
    metrics['precision'] = weighted[0]
    metrics['recall'] = weighted[1]
    return metrics
# Load and prepare the data first so the number of labels can be derived from it
df = pd.read_csv('Emotion_classify_Data.csv')
df = preprocess_data(df)
df = encode_label(df)
ds = generate_dataset(df)
# The Trainer below reads ds['train'] and ds['test']. A plain Dataset has no
# such splits, so split here if generate_dataset returned an unsplit Dataset.
if isinstance(ds, Dataset):
    ds = ds.train_test_split(test_size=0.2)

# Define model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# One output per distinct encoded label instead of a hard-coded 3
num_labels = int(df['label'].nunique())
config = RobertaConfig.from_pretrained(model_name, num_labels=num_labels)
# ignore_mismatched_sizes: the checkpoint's head may have a different label count
model = RobertaForSequenceClassification.from_pretrained(model_name, config=config, ignore_mismatched_sizes=True)
# Swap in the project's own classification head
model.classifier = NewClassificationHead(config)

# Tokenize after the tokenizer exists (tokenize() reads the module-level tokenizer)
ds = ds.map(tokenize, batched=True)
### Transfer learning, phase 1: train only the classification head
# Turn gradients off everywhere, then back on for the classifier alone
model.requires_grad_(False)
model.classifier.requires_grad_(True)

# Learning rates: the head gets the higher rate, the base layers a fifth of it
head_lr = 3e-4
base_lr = head_lr/5

# One optimizer group per learning rate; parameters are assigned by name
classifier_params = model.classifier.parameters()
backbone_params = [p for n, p in model.named_parameters() if 'classifier' not in n]
optimizer_grouped_parameters = [
    {'params': classifier_params, 'lr': head_lr},
    {'params': backbone_params, 'lr': base_lr},
]
optimizer = AdamW(optimizer_grouped_parameters)

## Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    save_strategy="no",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
)

# NOTE: passing None as the scheduler does not disable scheduling; per the
# Trainer documentation a default scheduler is then built from training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

## Train the head of the model
trainer.train()
## Phase 2: unfreeze all layers and fine-tune the entire model
for param in model.parameters():
    param.requires_grad = True

head_lr = 1e-4  # Slightly lower learning rate for the head
base_lr = 5e-6  # Much lower learning rate for the base layers
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': head_lr},
    {'params': [p for n, p in model.named_parameters() if 'classifier' not in n], 'lr': base_lr},
]
optimizer = AdamW(optimizer_grouped_parameters)

training_args.num_train_epochs = 5  # Set the number of additional epochs

# The existing Trainer holds on to the optimizer it was constructed with, so
# rebinding the `optimizer` name alone would leave the new learning rates
# unused. Build a fresh Trainer so this phase really runs with the new
# optimizer (and a scheduler created for it).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics,
)

## Train the entire model
trainer.train()

# Persist the fine-tuned model together with its tokenizer
model.save_pretrained('transferLearningResults')
tokenizer.save_pretrained('transferLearningResults')