Teslim Olunlade committed on
Commit
bb01739
·
1 Parent(s): 7070a94

Updated demo url and added video demo

Browse files
Files changed (3) hide show
  1. README.md +1 -1
  2. app/main.py +5 -4
  3. app/train.py +8 -0
README.md CHANGED
@@ -8,7 +8,7 @@ pinned: false
8
 
9
  ## Datamining Project
10
 
11
- Milestone 3 - [Demo](https://huggingface.co/spaces/ogtega/datamining-project)
12
 
13
  ## Requirements
14
 
 
8
 
9
  ## Datamining Project
10
 
11
+ Milestone 4 - [Demo](https://sites.google.com/njit.edu/toxic-tweet) | [Video Demo](https://youtu.be/j03gtVWdiNs)
12
 
13
  ## Requirements
14
 
app/main.py CHANGED
@@ -14,15 +14,16 @@ demo = """AI never ceases to amaze me! On one hand, it's incredible to see the p
14
  text = ""
15
  submit = False
16
  model_name = ""
17
- col1, col2, col3 = st.columns([2,1,1])
18
 
19
- with st.container():
20
  model_name = st.selectbox(
21
  "Select the model you want to use below.",
22
  ("ogtega/tweet-toxicity-classifier",),
23
  )
24
  submit = st.button("Submit", type="primary", use_container_width=True)
25
 
 
26
  tokenizer = AutoTokenizer.from_pretrained(model_name)
27
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
28
  clf = pipeline(
@@ -40,9 +41,9 @@ with col3:
40
  st.subheader("Probability")
41
 
42
 
43
- input = tokenizer(text, return_tensors="tf")
44
 
45
- if submit:
46
  results = dict(d.values() for d in clf(text)[0])
47
  classes = {k: results[k] for k in results.keys() if not k == "toxic"}
48
 
 
14
  text = ""
15
  submit = False
16
  model_name = ""
17
+ col1, col2, col3 = st.columns([2,1,1]) # Initiate columns
18
 
19
+ with st.container(): # Container for model selection and submit button
20
  model_name = st.selectbox(
21
  "Select the model you want to use below.",
22
  ("ogtega/tweet-toxicity-classifier",),
23
  )
24
  submit = st.button("Submit", type="primary", use_container_width=True)
25
 
26
+ # Load the pre-trained tokenizer and model for the selected model name
27
  tokenizer = AutoTokenizer.from_pretrained(model_name)
28
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
29
  clf = pipeline(
 
41
  st.subheader("Probability")
42
 
43
 
44
+ input = tokenizer(text, return_tensors="tf") # Run input text through tokenizer
45
 
46
+ if submit: # Code ran after submit button is clicked
47
  results = dict(d.values() for d in clf(text)[0])
48
  classes = {k: results[k] for k in results.keys() if not k == "toxic"}
49
 
app/train.py CHANGED
@@ -19,10 +19,12 @@ labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate
19
  label2id = {label: id for id, label in enumerate(labels)}
20
  id2label = {id: label for id, label in enumerate(labels)}
21
 
 
22
  dataset = load_dataset("csv", data_files="train.csv")
23
  tokenizer = AutoTokenizer.from_pretrained(base_model)
24
 
25
 
 
26
  def process_data(row):
27
  text = row["comment_text"]
28
  labels_batch = {k: row[k] for k in row.keys() if k in labels}
@@ -40,6 +42,7 @@ def process_data(row):
40
  return encoding
41
 
42
 
 
43
  model = TFAutoModelForSequenceClassification.from_pretrained(
44
  base_model,
45
  problem_type="multi_label_classification",
@@ -48,25 +51,30 @@ model = TFAutoModelForSequenceClassification.from_pretrained(
48
  id2label=id2label,
49
  )
50
 
 
51
  encoded = dataset.map(
52
  process_data,
53
  remove_columns=["id", "comment_text"],
54
  num_proc=int(multiprocessing.cpu_count()),
55
  )
56
 
 
57
  tf_dataset = model.prepare_tf_dataset(
58
  encoded["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
59
  )
60
 
 
61
  cp_callback = tf.keras.callbacks.ModelCheckpoint(
62
  filepath=checkpoint_path, save_weights_only=True, verbose=1
63
  )
64
 
 
65
  push_to_hub_callback = PushToHubCallback(
66
  output_dir=output_dir,
67
  tokenizer=tokenizer,
68
  hub_model_id="ogtega/tweet-toxicity-classifier",
69
  )
70
 
 
71
  model.compile(optimizer=Adam(3e-5), loss="BinaryCrossentropy")
72
  model.fit(tf_dataset, callbacks=[cp_callback, push_to_hub_callback])
 
19
  label2id = {label: id for id, label in enumerate(labels)}
20
  id2label = {id: label for id, label in enumerate(labels)}
21
 
22
+ # Load the CSV file of training tweets (resolved relative to the working directory)
23
  dataset = load_dataset("csv", data_files="train.csv")
24
  tokenizer = AutoTokenizer.from_pretrained(base_model)
25
 
26
 
27
+ # Process a single data row: tokenize the text and attach multi-hot labels
28
  def process_data(row):
29
  text = row["comment_text"]
30
  labels_batch = {k: row[k] for k in row.keys() if k in labels}
 
42
  return encoding
43
 
44
 
45
+ # Instantiate the multi-label classification model
46
  model = TFAutoModelForSequenceClassification.from_pretrained(
47
  base_model,
48
  problem_type="multi_label_classification",
 
51
  id2label=id2label,
52
  )
53
 
54
+ # Start processing and encoding data rows using available cores
55
  encoded = dataset.map(
56
  process_data,
57
  remove_columns=["id", "comment_text"],
58
  num_proc=int(multiprocessing.cpu_count()),
59
  )
60
 
61
+ # Convert encoding to tensors
62
  tf_dataset = model.prepare_tf_dataset(
63
  encoded["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
64
  )
65
 
66
+ # Checkpoint callback that saves model weights during training
67
  cp_callback = tf.keras.callbacks.ModelCheckpoint(
68
  filepath=checkpoint_path, save_weights_only=True, verbose=1
69
  )
70
 
71
+ # Callback to push the trained model to the Hugging Face Hub
72
  push_to_hub_callback = PushToHubCallback(
73
  output_dir=output_dir,
74
  tokenizer=tokenizer,
75
  hub_model_id="ogtega/tweet-toxicity-classifier",
76
  )
77
 
78
+ # Compile and train the model
79
  model.compile(optimizer=Adam(3e-5), loss="BinaryCrossentropy")
80
  model.fit(tf_dataset, callbacks=[cp_callback, push_to_hub_callback])