Spaces:
Runtime error
Runtime error
milestone-3
Browse files- app.py +22 -21
- milestone_3.py → train.py +5 -6
app.py
CHANGED
|
@@ -1,30 +1,31 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from transformers import AutoTokenizer, RobertaForSequenceClassification
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
if analyze_button:
|
| 19 |
-
if selected_model=="Model 1":
|
| 20 |
-
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
|
| 21 |
-
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
|
| 22 |
-
else:
|
| 23 |
-
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
| 24 |
-
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
|
| 25 |
-
inputs = tokenizer(text, return_tensors="pt")
|
| 26 |
-
with torch.no_grad():
|
| 27 |
-
logits = model(**inputs).logits
|
| 28 |
-
prediction_id = logits.argmax().item()
|
| 29 |
-
results = model.config.id2label[prediction_id]
|
| 30 |
-
st.write(results)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
| 3 |
import numpy as np
|
| 4 |
import torch
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
|
| 8 |
+
model_name = "unitary/toxic-bert"
|
| 9 |
+
|
| 10 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 11 |
+
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
df = pd.DataFrame(columns=("Tweet", "Toxicity", "Probability"))
|
| 15 |
|
| 16 |
+
sample_tweets = ["Ask Sityush to clean up his behavior than issue me nonsensical warnings...", "be a man and lets discuss it-maybe over the phone?", "Don't look, come or think of comming back! Tosser."]
|
| 17 |
|
| 18 |
+
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
|
| 19 |
+
results = classifier(sample_tweets)
|
| 20 |
+
|
| 21 |
+
batch = tokenizer(sample_tweets, padding=True, truncation=True, max_length=512, return_tensors="pt")
|
| 22 |
+
|
| 23 |
+
# assignment 3
|
| 24 |
+
st.title("CS482 Project Sentiment Analysis")
|
| 25 |
|
| 26 |
+
st.markdown("**:red[unitary/toxic-bert]**")
|
| 27 |
|
| 28 |
+
for i in range(len(sample_tweets)):
|
| 29 |
+
df.loc[len(df.index)] = [sample_tweets[i], results[i]["label"], results[i]["score"]]
|
| 30 |
|
| 31 |
+
st.table(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
milestone_3.py → train.py
RENAMED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from transformers import
|
| 2 |
import torch
|
| 3 |
from torch.utils.data import Dataset
|
| 4 |
# from torch.optim import AdamW
|
|
@@ -7,7 +7,7 @@ from sklearn.model_selection import train_test_split
|
|
| 7 |
|
| 8 |
|
| 9 |
# assignment 3
|
| 10 |
-
model_name = "
|
| 11 |
|
| 12 |
class ToxicDataset(Dataset):
|
| 13 |
|
|
@@ -18,7 +18,6 @@ class ToxicDataset(Dataset):
|
|
| 18 |
def __getitem__(self, idx):
|
| 19 |
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
|
| 20 |
item["labels"] = torch.tensor(self.labels[idx])
|
| 21 |
-
print(item)
|
| 22 |
return item
|
| 23 |
|
| 24 |
def __len__(self):
|
|
@@ -35,7 +34,7 @@ train_texts, val_texts, train_labels, val_labels = train_test_split(toxic_data.t
|
|
| 35 |
|
| 36 |
|
| 37 |
print("Data split. Tokenizing data...")
|
| 38 |
-
tokenizer =
|
| 39 |
|
| 40 |
train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
|
| 41 |
val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
|
|
@@ -59,7 +58,7 @@ training_args = TrainingArguments(
|
|
| 59 |
|
| 60 |
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 61 |
|
| 62 |
-
model =
|
| 63 |
|
| 64 |
trainer = Trainer(
|
| 65 |
model=model,
|
|
@@ -101,7 +100,7 @@ trainer.train()
|
|
| 101 |
|
| 102 |
print("Training complete. Saving model...")
|
| 103 |
|
| 104 |
-
save_directory = "
|
| 105 |
model.save_pretrained(save_directory)
|
| 106 |
|
| 107 |
print("Model saved.")
|
|
|
|
| 1 |
+
from transformers import BertTokenizerFast, BertModel, BertForSequenceClassification, Trainer, TrainingArguments
|
| 2 |
import torch
|
| 3 |
from torch.utils.data import Dataset
|
| 4 |
# from torch.optim import AdamW
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
# assignment 3
|
| 10 |
+
model_name = "bert-base-uncased"
|
| 11 |
|
| 12 |
class ToxicDataset(Dataset):
|
| 13 |
|
|
|
|
| 18 |
def __getitem__(self, idx):
|
| 19 |
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
|
| 20 |
item["labels"] = torch.tensor(self.labels[idx])
|
|
|
|
| 21 |
return item
|
| 22 |
|
| 23 |
def __len__(self):
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
print("Data split. Tokenizing data...")
|
| 37 |
+
tokenizer = BertTokenizerFast.from_pretrained(model_name)
|
| 38 |
|
| 39 |
train_encodings = tokenizer.batch_encode_plus(train_texts, truncation=True, padding=True, return_tensors='pt')
|
| 40 |
val_encodings = tokenizer.batch_encode_plus(val_texts, truncation=True, padding=True, return_tensors='pt')
|
|
|
|
| 58 |
|
| 59 |
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
| 60 |
|
| 61 |
+
# Load BERT with a sequence-classification head (6 output labels).
# The bare BertModel encoder has no classifier and returns no loss, so the
# Trainer below could not optimise it; the classification variant computes
# a loss whenever the batch contains "labels".
# NOTE(review): if the six labels are independent multi-hot flags, also pass
# problem_type="multi_label_classification" and supply float labels — the
# label construction is not visible here, confirm before changing.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)
|
| 62 |
|
| 63 |
trainer = Trainer(
|
| 64 |
model=model,
|
|
|
|
| 100 |
|
| 101 |
print("Training complete. Saving model...")
|
| 102 |
|
| 103 |
+
save_directory = "./results/model"
|
| 104 |
model.save_pretrained(save_directory)
|
| 105 |
|
| 106 |
print("Model saved.")
|