Spaces:

ogtega
/

datamining-project

Sleeping

App Files Files Community

Teslim Olunlade committed on May 1, 2023

Commit

bb01739

1 Parent(s): 7070a94

Updated demo url and added video demo

Browse files

Files changed (3) hide show

README.md +1 -1
app/main.py +5 -4
app/train.py +8 -0

README.md CHANGED Viewed

@@ -8,7 +8,7 @@ pinned: false
 ## Datamining Project
-Milestone 3 - [Demo](https://huggingface.co/spaces/ogtega/datamining-project)
 ## Requirements

 ## Datamining Project
+Milestone 4 - [Demo](https://sites.google.com/njit.edu/toxic-tweet) | [Video Demo](https://youtu.be/j03gtVWdiNs)
 ## Requirements

app/main.py CHANGED Viewed

@@ -14,15 +14,16 @@ demo = """AI never ceases to amaze me! On one hand, it's incredible to see the p
 text = ""
 submit = False
 model_name = ""
-col1, col2, col3 = st.columns([2,1,1])
-with st.container():
     model_name = st.selectbox(
         "Select the model you want to use below.",
         ("ogtega/tweet-toxicity-classifier",),
     )
     submit = st.button("Submit", type="primary", use_container_width=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 clf = pipeline(
@@ -40,9 +41,9 @@ with col3:
     st.subheader("Probability")
-input = tokenizer(text, return_tensors="tf")
-if submit:
     results = dict(d.values() for d in clf(text)[0])
     classes = {k: results[k] for k in results.keys() if not k == "toxic"}

 text = ""
 submit = False
 model_name = ""
+col1, col2, col3 = st.columns([2,1,1]) # Initiate columns
+with st.container(): # Container for model selection and submit button
     model_name = st.selectbox(
         "Select the model you want to use below.",
         ("ogtega/tweet-toxicity-classifier",),
     )
     submit = st.button("Submit", type="primary", use_container_width=True)
+# Obtain the pre-trained model
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 clf = pipeline(
     st.subheader("Probability")
+input = tokenizer(text, return_tensors="tf") # Run input text through tokenizer
+if submit: # Code that runs after the submit button is clicked
     results = dict(d.values() for d in clf(text)[0])
     classes = {k: results[k] for k in results.keys() if not k == "toxic"}

app/train.py CHANGED Viewed

@@ -19,10 +19,12 @@ labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate
 label2id = {label: id for id, label in enumerate(labels)}
 id2label = {id: label for id, label in enumerate(labels)}
 dataset = load_dataset("csv", data_files="train.csv")
 tokenizer = AutoTokenizer.from_pretrained(base_model)
 def process_data(row):
     text = row["comment_text"]
     labels_batch = {k: row[k] for k in row.keys() if k in labels}
@@ -40,6 +42,7 @@ def process_data(row):
     return encoding
 model = TFAutoModelForSequenceClassification.from_pretrained(
     base_model,
     problem_type="multi_label_classification",
@@ -48,25 +51,30 @@ model = TFAutoModelForSequenceClassification.from_pretrained(
     id2label=id2label,
 )
 encoded = dataset.map(
     process_data,
     remove_columns=["id", "comment_text"],
     num_proc=int(multiprocessing.cpu_count()),
 )
 tf_dataset = model.prepare_tf_dataset(
     encoded["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
 )
 cp_callback = tf.keras.callbacks.ModelCheckpoint(
     filepath=checkpoint_path, save_weights_only=True, verbose=1
 )
 push_to_hub_callback = PushToHubCallback(
     output_dir=output_dir,
     tokenizer=tokenizer,
     hub_model_id="ogtega/tweet-toxicity-classifier",
 )
 model.compile(optimizer=Adam(3e-5), loss="BinaryCrossentropy")
 model.fit(tf_dataset, callbacks=[cp_callback, push_to_hub_callback])

 label2id = {label: id for id, label in enumerate(labels)}
 id2label = {id: label for id, label in enumerate(labels)}
+# Load csv file of tweets in the script's directory
 dataset = load_dataset("csv", data_files="train.csv")
 tokenizer = AutoTokenizer.from_pretrained(base_model)
+# Process a single data row (mapped over the dataset in parallel worker processes below)
 def process_data(row):
     text = row["comment_text"]
     labels_batch = {k: row[k] for k in row.keys() if k in labels}
     return encoding
+# Initialize the model
 model = TFAutoModelForSequenceClassification.from_pretrained(
     base_model,
     problem_type="multi_label_classification",
     id2label=id2label,
 )
+# Start processing and encoding data rows using available cores
 encoded = dataset.map(
     process_data,
     remove_columns=["id", "comment_text"],
     num_proc=int(multiprocessing.cpu_count()),
 )
+# Build a batched, shuffled TensorFlow dataset from the encoded training split
 tf_dataset = model.prepare_tf_dataset(
     encoded["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
 )
+# Checkpoint callback: saves the model weights (only) to checkpoint_path
 cp_callback = tf.keras.callbacks.ModelCheckpoint(
     filepath=checkpoint_path, save_weights_only=True, verbose=1
 )
+# Callback to push our model to the Hugging Face Hub
 push_to_hub_callback = PushToHubCallback(
     output_dir=output_dir,
     tokenizer=tokenizer,
     hub_model_id="ogtega/tweet-toxicity-classifier",
 )
+# Compile and train the model
 model.compile(optimizer=Adam(3e-5), loss="BinaryCrossentropy")
 model.fit(tf_dataset, callbacks=[cp_callback, push_to_hub_callback])