software-maintenance-classification

Runtime error

App Files Files Community

kadabengaran committed on Dec 3, 2023

Commit

a37c6c7

1 Parent(s): 401d6fa

Update app/main.py

Browse files

Files changed (1) hide show

app/main.py +37 -140

app/main.py CHANGED Viewed

@@ -3,16 +3,18 @@ try:
     import pandas as pd
     import streamlit as st
     import re
-    from transformers import BertTokenizer
     from model import IndoBERTBiLSTM
     from stqdm import stqdm
 except Exception as e:
     print(e)
 # Config
-MAX_SEQ_LEN = 128
 MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
-LABELS = {'Not Useful': 0, 'Useful': 1}
 def get_device():
     if torch.cuda.is_available():
@@ -32,7 +34,8 @@ def get_key(val, my_dict):
             return key
 def load_tokenizer(model_path):
-    tokenizer = BertTokenizer.from_pretrained(model_path)
     return tokenizer
 def remove_special_characters(text):
@@ -47,71 +50,30 @@ def remove_special_characters(text):
     text = re.sub(r"\s+", " ", text)
     return text
-def preprocess(text, tokenizer, max_seq=MAX_SEQ_LEN):
-    return tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_seq,
-                                 pad_to_max_length=True,
-                                 return_attention_mask=True,
-                                 return_tensors='pt'
-                                 )
 def load_model():
-    model = IndoBERTBiLSTM.from_pretrained(MODELS_PATH)
-    return model
 def classify_single(text, model, tokenizer, device):
     if device.type == 'cuda':
         model.cuda()
-    # We need Token IDs and Attention Mask for inference on the new sentence
-    test_ids = []
-    test_attention_mask = []
-    # Apply preprocessing to the new sentence
-    new_sentence = remove_special_characters(text)
-    encoding = preprocess(new_sentence, tokenizer)
-    # Extract IDs and Attention Mask
-    test_ids.append(encoding['input_ids'])
-    test_attention_mask.append(encoding['attention_mask'])
-    test_ids = torch.cat(test_ids, dim=0)
-    test_attention_mask = torch.cat(test_attention_mask, dim=0)
-    # Forward pass, calculate logit
-    with torch.no_grad():
-        outputs = model(test_ids.to(device),
-                        test_attention_mask.to(device))
-    print("output ", outputs)
-    result = torch.argmax(outputs, dim=-1)
-    print("output ", result)
-    return result.item()
-def classify_multiple(data, model, tokenizer, device):
-    if device.type == 'cuda':
-        model.cuda()
-    input_ids = []
-    attention_masks = []
-    for row in data.tolist():
-        text = remove_special_characters(row)
-        text = preprocess(text, tokenizer)
-        input_ids.append(text['input_ids'])
-        attention_masks.append(text['attention_mask'])
-    result_list = []
-    with torch.no_grad():
-        for i in stqdm(range(len(input_ids))):
-            test_ids = input_ids[i]
-            test_attention_mask = attention_masks[i]
-            outputs = model(test_ids.to(device), test_attention_mask.to(device))
-            result = torch.argmax(outputs, dim= -1)
-            result_label = get_key(result.item(), LABELS)
-            result_list.append(result_label)
-    return result_list
 tab_labels = ["Single Input", "Multiple Input"]
 class App:
@@ -123,99 +85,34 @@ class App:
         self.csv_process = None
     def run(self):
-        self.init_session_state()  # Initialize session state
-        tokenizer = load_tokenizer(MODELS_PATH)
-        model = load_model()
-        """App Review Classifier"""
         html_temp = """
-        <div style="background-color:blue;padding:10px">
-        <h1 style="color:white;text-align:center;">Klasifikasi Ulasan Aplikasi yang Berguna</h1>
         </div>
         """
         st.markdown(html_temp, unsafe_allow_html=True)
         st.markdown("")
-        self.render_tabs()
-        st.divider()
-        self.render_process_button(model, tokenizer, device)
-    def init_session_state(self):
-        if "tab_selected" not in st.session_state:
-            st.session_state.tab_selected = tab_labels[0]
-    def render_tabs(self):
-        tab_selected = st.session_state.get('tab_selected', self.default_tab_selected)
-        tab_selected = st.sidebar.radio("Select Input Type", tab_labels)
         if USE_CUDA:
             st.sidebar.markdown(footer,unsafe_allow_html=True)
-        if tab_selected == tab_labels[0]:
-            self.render_single_input()
-        elif tab_selected == tab_labels[1]:
-            self.render_multiple_input()
-        st.session_state.tab_selected = tab_selected
     def render_single_input(self):
         self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")
-    def render_multiple_input(self):
-        """
-        Upload File
-        """
-        st.markdown("Upload file")
-        file = st.file_uploader("To ensure a smooth process, please use a maximum of 500 rows of data in the CSV file.",
-                                type=self.fileTypes)
-        if not file:
-            st.info("Please upload a file of type: " + ", ".join(self.fileTypes))
-            return
-        data = pd.read_csv(file)
-        placeholder = st.empty()
-        placeholder.dataframe(data.head(10))
-        header_list = data.columns.tolist()
-        header_list.insert(0, "---------- select column -------------")
-        ques = st.radio("Select column to process", header_list, index=0)
-        if header_list.index(ques) == 0:
-            st.warning("Please select a column to process")
-            return
-        df_process = data[ques].astype(str)
-        self.csv_input = data
-        self.csv_process = df_process
     def render_process_button(self, model, tokenizer, device):
         if st.button("Process"):
-            if st.session_state.tab_selected == tab_labels[0]:
-                input_text = self.input_text
-                if input_text:
-                    classification = classify_single(input_text, model, tokenizer, device)
-                    classification_label = get_key(classification, LABELS)
-                    st.write("Classification result:", classification_label)
-                else:
-                    st.warning('Please enter text to process', icon="⚠️")
-            elif st.session_state.tab_selected == tab_labels[1]:
-                df_process = self.csv_process
-                if df_process is not None:
-                    classification = classify_multiple(df_process, model, tokenizer, device)
-                    st.divider()
-                    st.write("Classification Result")
-                    input_file = self.csv_input
-                    input_file["classification_result"] = classification
-                    st.dataframe(input_file.head(10))
-                    st.download_button(
-                        label="Download Result",
-                        data=input_file.to_csv().encode("utf-8"),
-                        file_name="classification_result.csv",
-                        mime="text/csv",
-                    )
-                else:
-                    st.warning('Please upload a file to process', icon="⚠️")
 footer="""<style>
 .footer {

     import pandas as pd
     import streamlit as st
     import re
+    from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification
     from model import IndoBERTBiLSTM
     from stqdm import stqdm
+    from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
 except Exception as e:
     print(e)
 # Config
 MODELS_PATH = "kadabengaran/IndoBERT-BiLSTM-Useful-App-Review"
+id2label= {0: 'Other', 1: 'Problem Discovery', 2: 'Information Seeking', 3: 'Feature Request'}
+label2id= {'Other': 0, 'Problem Discovery': 1, 'Information Seeking': 2, 'Feature Request': 3}
 def get_device():
     if torch.cuda.is_available():
             return key
 def load_tokenizer(model_path):
+    # create tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)
     return tokenizer
 def remove_special_characters(text):
     text = re.sub(r"\s+", " ", text)
     return text
 def load_model():
+    config = PeftConfig.from_pretrained(MODELS_PATH)
+    inference_model = AutoModelForSequenceClassification.from_pretrained(
+        config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
+    )
+    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
+    model = PeftModel.from_pretrained(inference_model, MODELS_PATH)
+    return model, tokenizer
 def classify_single(text, model, tokenizer, device):
     if device.type == 'cuda':
         model.cuda()
+    # tokenize text
+    inputs = tokenizer.encode(text, return_tensors="pt").to(device)
+    # compute logits
+    logits = model(inputs).logits
+    # convert logits to label
+    predictions = torch.argmax(logits)
+    return id2label[predictions.tolist()]
 tab_labels = ["Single Input", "Multiple Input"]
 class App:
         self.csv_process = None
     def run(self):
+        model, tokenizer = load_model()
         html_temp = """
+        <div style="padding:10px">
+        <h1 style="color:white;text-align:center;">User Question Classification</h1>
         </div>
         """
         st.markdown(html_temp, unsafe_allow_html=True)
         st.markdown("")
         if USE_CUDA:
             st.sidebar.markdown(footer,unsafe_allow_html=True)
+        self.render_single_input()
+        st.divider()
+        self.render_process_button(model, tokenizer, device)
     def render_single_input(self):
         self.input_text = st.text_area("Enter Text Here", placeholder="Type Here")
     def render_process_button(self, model, tokenizer, device):
         if st.button("Process"):
+            input_text = self.input_text
+            if input_text:
+                classification_result = classify_single(input_text, model, tokenizer, device)
+                st.write("Classification result:", classification_result)
+            else:
+                st.warning('Please enter text to process', icon="⚠️")
 footer="""<style>
 .footer {