kadabengaran committed on
Commit
a37c6c7
·
1 Parent(s): 401d6fa

Update app/main.py

Browse files
Files changed (1) hide show
  1. app/main.py +37 -140
app/main.py CHANGED
@@ -3,16 +3,18 @@ try:
3
  import pandas as pd
4
  import streamlit as st
5
  import re
6
- from transformers import BertTokenizer
7
  from model import IndoBERTBiLSTM
8
  from stqdm import stqdm
 
9
  except Exception as e:
10
  print(e)
11
 
12
  # Config
13
- MAX_SEQ_LEN = 128
14
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
15
- LABELS = {'Not Useful': 0, 'Useful': 1}
 
 
16
 
17
  def get_device():
18
  if torch.cuda.is_available():
@@ -32,7 +34,8 @@ def get_key(val, my_dict):
32
  return key
33
 
34
  def load_tokenizer(model_path):
35
- tokenizer = BertTokenizer.from_pretrained(model_path)
 
36
  return tokenizer
37
 
38
  def remove_special_characters(text):
@@ -47,71 +50,30 @@ def remove_special_characters(text):
47
  text = re.sub(r"\s+", " ", text)
48
 
49
  return text
50
-
51
- def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
52
- return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq,
53
- pad_to_max_length=True,
54
- return_attention_mask=True,
55
- return_tensors='pt'
56
- )
57
 
58
  def load_model():
59
- model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
60
- return model
 
 
 
 
 
61
 
62
  def classify_single(text, model, tokenizer, device):
63
 
64
  if device.type == 'cuda':
65
  model.cuda()
66
 
67
- # We need Token IDs and Attention Mask for inference on the new sentence
68
- test_ids = []
69
- test_attention_mask = []
70
-
71
- # Apply preprocessing to the new sentence
72
- new_sentence = remove_special_characters(text)
73
- encoding = preprocess(new_sentence, tokenizer)
74
-
75
- # Extract IDs and Attention Mask
76
- test_ids.append(encoding['input_ids'])
77
- test_attention_mask.append(encoding['attention_mask'])
78
- test_ids = torch.cat(test_ids, dim=0)
79
- test_attention_mask = torch.cat(test_attention_mask, dim=0)
80
-
81
- # Forward pass, calculate logit
82
- with torch.no_grad():
83
- outputs = model(test_ids.to(device),
84
- test_attention_mask.to(device))
85
- print("output ", outputs)
86
- result = torch.argmax(outputs, dim=-1)
87
- print("output ", result)
88
- return result.item()
89
-
90
- def classify_multiple(data, model, tokenizer, device):
91
-
92
- if device.type == 'cuda':
93
- model.cuda()
94
-
95
- input_ids = []
96
- attention_masks = []
97
- for row in data.tolist():
98
- text = remove_special_characters(row)
99
- text = preprocess(text, tokenizer)
100
- input_ids.append(text['input_ids'])
101
- attention_masks.append(text['attention_mask'])
102
-
103
- result_list = []
104
-
105
- with torch.no_grad():
106
- for i in stqdm(range(len(input_ids))):
107
- test_ids = input_ids[i]
108
- test_attention_mask = attention_masks[i]
109
- outputs = model(test_ids.to(device), test_attention_mask.to(device))
110
- result = torch.argmax(outputs, dim= -1)
111
- result_label = get_key(result.item(), LABELS)
112
- result_list.append(result_label)
113
-
114
- return result_list
115
 
116
  tab_labels = ["Single Input", "Multiple Input"]
117
  class App:
@@ -123,99 +85,34 @@ class App:
123
  self.csv_process = None
124
 
125
  def run(self):
126
- self.init_session_state() # Initialize session state
127
- tokenizer = load_tokenizer(MODELS_PATH)
128
- model = load_model()
129
- """App Review Classifier"""
130
  html_temp = """
131
- <div style="background-color:blue;padding:10px">
132
- <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
133
  </div>
134
  """
135
  st.markdown(html_temp, unsafe_allow_html=True)
136
  st.markdown("")
137
- self.render_tabs()
138
- st.divider()
139
- self.render_process_button(model, tokenizer, device)
140
-
141
- def init_session_state(self):
142
- if "tab_selected" not in st.session_state:
143
- st.session_state.tab_selected = tab_labels[0]
144
-
145
-
146
- def render_tabs(self):
147
- tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
148
- tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
149
  if USE_CUDA:
150
  st.sidebar.markdown(footer,unsafe_allow_html=True)
 
 
 
151
 
152
- if tab_selected == tab_labels[0]:
153
- self.render_single_input()
154
- elif tab_selected == tab_labels[1]:
155
- self.render_multiple_input()
156
-
157
- st.session_state.tab_selected = tab_selected
158
 
159
  def render_single_input(self):
160
  self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")
161
 
162
- def render_multiple_input(self):
163
- """
164
- Upload File
165
- """
166
- st.markdown("Upload file")
167
- file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.",
168
- type=self.fileTypes)
169
-
170
- if not file:
171
- st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
172
- return
173
-
174
- data = pd.read_csv(file)
175
-
176
- placeholder = st.empty()
177
- placeholder.dataframe(data.head(10))
178
-
179
- header_list = data.columns.tolist()
180
- header_list.insert(0, "---------- select column -------------")
181
- ques = st.radio("Select column to process", header_list, index=0)
182
-
183
- if header_list.index(ques) == 0:
184
- st.warning("Please select a column to process")
185
- return
186
-
187
- df_process = data[ques].astype(str)
188
- self.csv_input = data
189
- self.csv_process = df_process
190
-
191
  def render_process_button(self, model, tokenizer, device):
192
  if st.button("Process"):
193
- if st.session_state.tab_selected == tab_labels[0]:
194
- input_text = self.input_text
195
- if input_text:
196
- classification = classify_single(input_text, model, tokenizer, device)
197
- classification_label = get_key(classification, LABELS)
198
- st.write("Classification result:", classification_label)
199
- else:
200
- st.warning('Please enter text to process', icon="⚠️")
201
- elif st.session_state.tab_selected == tab_labels[1]:
202
- df_process = self.csv_process
203
- if df_process is not None:
204
- classification = classify_multiple(df_process, model, tokenizer, device)
205
-
206
- st.divider()
207
- st.write("Classification Result")
208
- input_file = self.csv_input
209
- input_file["classification_result"] = classification
210
- st.dataframe(input_file.head(10))
211
- st.download_button(
212
- label="Download Result",
213
- data=input_file.to_csv().encode("utf-8"),
214
- file_name="classification_result.csv",
215
- mime="text/csv",
216
- )
217
- else:
218
- st.warning('Please upload a file to process', icon="⚠️")
219
 
220
  footer="""<style>
221
  .footer {
 
3
  import pandas as pd
4
  import streamlit as st
5
  import re
6
+ from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification
7
  from model import IndoBERTBiLSTM
8
  from stqdm import stqdm
9
+ from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
10
  except Exception as e:
11
  print(e)
12
 
13
  # Config
 
14
  MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
15
+
16
+ id2label= {0: 'Other', 1: 'Problem Discovery', 2: 'Information Seeking', 3: 'Feature Request'}
17
+ label2id= {'Other': 0, 'Problem Discovery': 1, 'Information Seeking': 2, 'Feature Request': 3}
18
 
19
  def get_device():
20
  if torch.cuda.is_available():
 
34
  return key
35
 
36
  def load_tokenizer(model_path):
37
+ # create tokenizer
38
+ tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
39
  return tokenizer
40
 
41
  def remove_special_characters(text):
 
50
  text = re.sub(r"\s+", " ", text)
51
 
52
  return text
 
 
 
 
 
 
 
53
 
54
  def load_model():
55
+ config = PeftConfig.from_pretrained(MODELS_PATH)
56
+ inference_model = AutoModelForSequenceClassification.from_pretrained(
57
+ config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
58
+ )
59
+ tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
60
+ model = PeftModel.from_pretrained(inference_model, MODELS_PATH)
61
+ return model, tokenizer
62
 
63
  def classify_single(text, model, tokenizer, device):
64
 
65
  if device.type == 'cuda':
66
  model.cuda()
67
 
68
+ # tokenize text
69
+ inputs = tokenizer.encode(text, return_tensors="pt").to(device)
70
+
71
+ # compute logits
72
+ logits = model(inputs).logits
73
+ # convert logits to label
74
+ predictions = torch.argmax(logits)
75
+ return id2label[predictions.tolist()]
76
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  tab_labels = ["Single Input", "Multiple Input"]
79
  class App:
 
85
  self.csv_process = None
86
 
87
  def run(self):
88
+ model, tokenizer = load_model()
 
 
 
89
  html_temp = """
90
+ <div style="padding:10px">
91
+ <h1 style="color:white;text-align:center;">User Question Classification</h1>
92
  </div>
93
  """
94
  st.markdown(html_temp, unsafe_allow_html=True)
95
  st.markdown("")
 
 
 
 
 
 
 
 
 
 
 
 
96
  if USE_CUDA:
97
  st.sidebar.markdown(footer,unsafe_allow_html=True)
98
+ self.render_single_input()
99
+ st.divider()
100
+ self.render_process_button(model, tokenizer, device)
101
 
 
 
 
 
 
 
102
 
103
  def render_single_input(self):
104
  self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")
105
 
106
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def render_process_button(self, model, tokenizer, device):
108
  if st.button("Process"):
109
+ input_text = self.input_text
110
+ if input_text:
111
+ classification_result = classify_single(input_text, model, tokenizer, device)
112
+ st.write("Classification result:", classification_result)
113
+ else:
114
+ st.warning('Please enter text to process', icon="⚠️")
115
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
  footer="""<style>
118
  .footer {