Spaces:
Sleeping
Sleeping
Teslim Olunlade
committed on
Commit
·
75c005d
1
Parent(s):
25ac0e5
Successfully trained
Browse files
- app/main.py +3 -3
- app/train.py +54 -0
- requirements.txt +1 -0
app/main.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from transformers import AutoTokenizer
|
| 3 |
from transformers import (
|
|
@@ -15,9 +17,7 @@ model_name = st.selectbox(
|
|
| 15 |
"Select the model you want to use below.",
|
| 16 |
(
|
| 17 |
"distilbert-base-uncased-finetuned-sst-2-english",
|
| 18 |
-
"
|
| 19 |
-
"finiteautomata/bertweet-base-sentiment-analysis",
|
| 20 |
-
"ProsusAI/finbert",
|
| 21 |
),
|
| 22 |
)
|
| 23 |
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
import streamlit as st
|
| 4 |
from transformers import AutoTokenizer
|
| 5 |
from transformers import (
|
|
|
|
| 17 |
"Select the model you want to use below.",
|
| 18 |
(
|
| 19 |
"distilbert-base-uncased-finetuned-sst-2-english",
|
| 20 |
+
"roberta-large-mnli",
|
|
|
|
|
|
|
| 21 |
),
|
| 22 |
)
|
| 23 |
|
app/train.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import tensorflow as tf
|
| 7 |
+
from transformers import (
|
| 8 |
+
AutoTokenizer,
|
| 9 |
+
TFAutoModelForSequenceClassification,
|
| 10 |
+
TFTrainer,
|
| 11 |
+
TFTrainingArguments,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
|
| 15 |
+
label2id = {label: id for id, label in enumerate(labels)}
|
| 16 |
+
id2label = {id: label for id, label in enumerate(labels)}
|
| 17 |
+
|
| 18 |
+
data = pd.read_csv("./train.csv")
|
| 19 |
+
|
| 20 |
+
batch_encodings = defaultdict(list)
|
| 21 |
+
batch_labels = list()
|
| 22 |
+
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
|
| 23 |
+
|
| 24 |
+
for _, row in data.sample(n=3).iterrows():
|
| 25 |
+
text = row["comment_text"]
|
| 26 |
+
text_labels = {k: row[k] for k in row.keys() if k in labels}
|
| 27 |
+
|
| 28 |
+
encoding = tokenizer(text, padding="max_length", truncation=True)
|
| 29 |
+
|
| 30 |
+
batch_labels.append([text_labels[id2label[id]] for id in range(len(labels))])
|
| 31 |
+
|
| 32 |
+
for key in encoding.keys():
|
| 33 |
+
batch_encodings[key].append(encoding[key])
|
| 34 |
+
|
| 35 |
+
model = TFAutoModelForSequenceClassification.from_pretrained(
|
| 36 |
+
"bert-base-cased",
|
| 37 |
+
problem_type="multi_label_classification",
|
| 38 |
+
num_labels=len(labels),
|
| 39 |
+
label2id=label2id,
|
| 40 |
+
id2label=id2label,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
training_args = TFTrainingArguments(output_dir="test_trainer")
|
| 44 |
+
train_dataset = tf.data.Dataset.from_tensor_slices(
|
| 45 |
+
(dict(batch_encodings), batch_labels)
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
trainer = TFTrainer(
|
| 49 |
+
model=model,
|
| 50 |
+
args=training_args,
|
| 51 |
+
train_dataset=train_dataset,
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
trainer.train()
|
requirements.txt
CHANGED
|
@@ -7,6 +7,7 @@ cachetools==5.3.0
|
|
| 7 |
certifi==2022.12.7
|
| 8 |
charset-normalizer==3.1.0
|
| 9 |
click==8.1.3
|
|
|
|
| 10 |
decorator==5.1.1
|
| 11 |
entrypoints==0.4
|
| 12 |
filelock==3.10.7
|
|
|
|
| 7 |
certifi==2022.12.7
|
| 8 |
charset-normalizer==3.1.0
|
| 9 |
click==8.1.3
|
| 10 |
+
datasets==2.11.0
|
| 11 |
decorator==5.1.1
|
| 12 |
entrypoints==0.4
|
| 13 |
filelock==3.10.7
|