# Exported from a Colab notebook; extraction/build artifacts removed.
| # -*- coding: utf-8 -*- | |
| """Roberta sentiment Analysis | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/10L1VfVMZLa62qTFdUIOURELW194TjJ4e | |
| """ | |
| # Install required libraries | |
| !pip install datasets transformers huggingface_hub -q | |
| # Import key libraries and packages | |
| import numpy as np | |
| import os | |
| import pandas as pd | |
| from datasets import load_dataset, load_metric | |
| from huggingface_hub import notebook_login | |
| from sklearn.model_selection import train_test_split | |
| from transformers import AutoTokenizer, TrainingArguments, Trainer | |
| from google.colab import files | |
| from google.colab import drive | |
| # Disable Weights & Biases | |
| os.environ["WANDB_DISABLED"] = "true" | |
| drive.mount('/content/drive') | |
# Load the raw datasets from Drive. Training rows with any missing value
# are dropped; test rows keep missing text as empty strings instead, so
# every test tweet can still be scored.
train_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/hugging.csv").dropna(axis=0)
test_df = pd.read_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/Testhugging.csv").fillna("")

# Quick sanity checks on the loaded frames (notebook-style inspection)
train_df.head()
test_df.head()
train_df.isnull().sum()
test_df.isnull().sum()
"""Fine-tuning the roberta model"""

# Hold out 20% of the training data for evaluation, keeping the label
# distribution identical in both splits via stratification.
# NOTE: the eval split was previously bound to the name `eval`, which
# shadows the `eval` builtin — renamed to `eval_df`.
train_df, eval_df = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df['label']
)
print(f"new dataframe shapes: train is {train_df.shape}, eval is {eval_df.shape}")

# Save splitted subsets so they can be reloaded as a datasets.DatasetDict
train_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv", index=False)
eval_df.to_csv("/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv", index=False)

dataset = load_dataset(
    'csv',
    data_files={
        'train': '/content/drive/MyDrive/PostBAP_ASSESSMENT/train_subset.csv',
        'eval': '/content/drive/MyDrive/PostBAP_ASSESSMENT/eval_subset.csv',
    },
    encoding="ISO-8859-1",
)
# Instantiate the tokenizer matching the pretrained checkpoint.
# NOTE: `num_labels` is a model-config argument, not a tokenizer one, so it
# is not passed here; the classification head below receives num_labels=3.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")
# Define helper functions
## Map raw sentiment labels onto contiguous class ids
def transform_labels(label):
    """Convert a raw label (-1 negative, 0 neutral, 1 positive) to a class id.

    Any value outside {-1, 0, 1} falls back to class 0 (negative), matching
    the original if/elif chain's default.
    """
    raw = label['label']
    class_map = {-1: 0, 0: 1, 1: 2}
    return {'labels': class_map.get(raw, 0)}
## Tokenize one (batched) example dict, padding/truncating to 256 tokens
def tokenize_data(example):
    """Tokenize the 'safe_text' field with fixed-length padding."""
    return tokenizer(
        example['safe_text'],
        padding='max_length',
        truncation=True,
        max_length=256,
    )
# Tokenize every split of the dataset in batches
dataset = dataset.map(tokenize_data, batched=True)

# Re-encode the labels and drop the raw columns the model does not need
remove_columns = ['tweet_id', 'label', 'safe_text', 'agreement']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)
# Configure the training run: 4 epochs, evaluating and checkpointing once
# per epoch so the best checkpoint can be restored when training ends.
training_args = TrainingArguments(
    "covid_tweets_sentiment_analysis_model",
    num_train_epochs=4,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Load the pretrained checkpoint with a fresh 3-way classification head
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment",
    num_labels=3,
)
# Accuracy is the single evaluation metric for this task
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from an (logits, labels) evaluation tuple."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)
# Shuffle both splits with a fixed seed for reproducibility
train_dataset = dataset["train"].shuffle(seed=24)
eval_dataset = dataset["eval"].shuffle(seed=24)

# Dynamic-padding collator: converts features to PyTorch tensors and pads
# each batch to its longest member
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Instantiate the trainer and launch fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
# Rebuild the trainer — this time with the tokenizer and data collator
# attached — before running the final evaluation pass
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch the final evaluation
trainer.evaluate()
# Authenticate with the Hugging Face Hub, then publish both the fine-tuned
# model and its tokenizer under the same repo id
notebook_login()

repo_id = "MavisAJ/Sentiment_analysis_roberta_model"
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)