Spaces:

ferdmartin
/

GradApplicationDocsApp2

Runtime error

App Files Files Community

ferdmartin committed on Apr 29, 2023

Commit

4fc2f5a

1 Parent(s): e59445e

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -12

app.py CHANGED Viewed

@@ -49,7 +49,12 @@ def main():
                                 translate(str.maketrans('', '', string.punctuation)).strip().lstrip()
     # Define the function to classify text
-    def nb_lr(model, text):
         # Clean and format the input text
         text = format_text(text)
         # Predict using either LR or NB and get prediction probability
@@ -58,6 +63,11 @@ def main():
         return prediction, predict_proba
     def torch_pred(tokenizer, model, text):
         # DL models (BERT/DistilBERT based models)
         cleaned_text_tokens = tokenizer([text], padding='max_length', max_length=512, truncation=True)
         with torch.inference_mode():
@@ -70,7 +80,11 @@ def main():
             predict_proba = round(torch.softmax(logits, 1).cpu().squeeze().tolist()[prediction],4)
             return prediction, predict_proba
-    def pred_str(prediction):
     # Map the predicted class to string output
         if prediction == 0:
             return "Human-made 🤷‍♂️🤷‍♀️"
@@ -79,6 +93,9 @@ def main():
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_tokenizer(option):
         if option == "BERT-based model":
             tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding='max_length', max_length=512, truncation=True)
         else:
@@ -87,6 +104,9 @@ def main():
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_model(option):
         if option == "BERT-based model":
             model = HF_BertBasedModelAppDocs.from_pretrained("ferdmartin/HF_BertBasedModelAppDocs").to(device)
         else:
@@ -95,7 +115,7 @@ def main():
     # Streamlit app:
     models_available = {"Logistic Regression":"models/baseline_model_lr2.joblib",
                         "Naive Bayes": "models/baseline_model_nb2.joblib",
                         "DistilBERT-based model (BERT light)": "ferdmartin/HF_DistilBertBasedModelAppDocs",
@@ -108,11 +128,12 @@ def main():
     # Check the model to use
     def restore_prediction_state():
         if "prediction" in st.session_state:
             del st.session_state.prediction
     option = st.selectbox("Select a model to use:", models_available, on_change=restore_prediction_state)
     # Load the selected trained model
     if option in ("BERT-based model", "DistilBERT-based model (BERT light)"):
         tokenizer = load_tokenizer(option)
@@ -135,20 +156,21 @@ def main():
     # Use model
     if st.button("Let's check this text!"):
         if text.strip() == "":
             st.error("Please enter some text")
         else:
             with st.spinner("Wait for the magic 🪄🔮"):
-                # Use model
-                if option in ("Naive Bayes", "Logistic Regression"):
                     prediction, predict_proba = nb_lr(model, text)
                     st.session_state["sklearn"] = True
                 else:
-                    prediction, predict_proba = torch_pred(tokenizer, model, text)
                     st.session_state["torch"] = True
             # Store the result in session state
-            st.session_state["color_pred"] = "blue" if prediction == 0 else "red"
-            prediction = pred_str(prediction)
             st.session_state["prediction"] = prediction
             st.session_state["predict_proba"] = predict_proba
             st.session_state["text"] = text
@@ -171,15 +193,14 @@ def main():
                     html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"]))
             else:
                 with st.spinner('Wait for it 💭... BERT-based model explanations take around 4-10 minutes. In case you want to abort, refresh the page.'):
-                # TORCH EXPLAINER PRED FUNC (USES logits)
                     def f(x):
                         tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x])#.cuda()
                         outputs = model(tv).detach().cpu().numpy()
                         scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
                         val = scipy.special.logit(scores[:,1]) # use one vs rest logit units
                         return val
-                    # build an explainer using a token masker
-                    explainer = shap.Explainer(f, tokenizer)
                     shap_values = explainer([st.session_state["text"]], fixed_context=1)
                     html = shap.plots.text(shap_values, display=False)
             # Render HTML

                                 translate(str.maketrans('', '', string.punctuation)).strip().lstrip()
     # Define the function to classify text
+    def nb_lr(model, text: str) -> (int, float):
+        """
+        This function takes a previously trained Sklearn Pipeline
+        model (NaiveBayes or Logistic Regression) and the input text, then returns
+        the final prediction and its prediction probability as a tuple.
+        """
         # Clean and format the input text
         text = format_text(text)
         # Predict using either LR or NB and get prediction probability
         return prediction, predict_proba
     def torch_pred(tokenizer, model, text):
+        """
+        This function takes a pre-trained tokenizer, a previously trained transformer-based
+        model (DistilBert or Bert), and the input text, then returns the final prediction
+        and its prediction probability as a tuple.
+        """
         # DL models (BERT/DistilBERT based models)
         cleaned_text_tokens = tokenizer([text], padding='max_length', max_length=512, truncation=True)
         with torch.inference_mode():
             predict_proba = round(torch.softmax(logits, 1).cpu().squeeze().tolist()[prediction],4)
             return prediction, predict_proba
+    def pred_str(prediction:int) -> str:
+        """
+        This function takes an integer value as input and returns a string representing the type of the input's source.
+        The input is expected to be a prediction from a classification model that distinguishes between human-made and AI-generated text.
+        """
     # Map the predicted class to string output
         if prediction == 0:
             return "Human-made 🤷‍♂️🤷‍♀️"
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_tokenizer(option):
+        """
+        Load the pre-trained tokenizer and save it in cache memory.
+        """
         if option == "BERT-based model":
             tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", padding='max_length', max_length=512, truncation=True)
         else:
     @st.cache(allow_output_mutation=True, suppress_st_warning=True)
     def load_model(option):
+        """
+        Load trained Transformer-based models and save in cache memory.
+        """
         if option == "BERT-based model":
             model = HF_BertBasedModelAppDocs.from_pretrained("ferdmartin/HF_BertBasedModelAppDocs").to(device)
         else:
     # Streamlit app:
+        # List of models available
     models_available = {"Logistic Regression":"models/baseline_model_lr2.joblib",
                         "Naive Bayes": "models/baseline_model_nb2.joblib",
                         "DistilBERT-based model (BERT light)": "ferdmartin/HF_DistilBertBasedModelAppDocs",
     # Check the model to use
     def restore_prediction_state():
+        """Delete the stored prediction from session_state so it is cleared after the selected model changes."""
         if "prediction" in st.session_state:
             del st.session_state.prediction
     option = st.selectbox("Select a model to use:", models_available, on_change=restore_prediction_state)
     # Load the selected trained model
     if option in ("BERT-based model", "DistilBERT-based model (BERT light)"):
         tokenizer = load_tokenizer(option)
     # Use model
     if st.button("Let's check this text!"):
         if text.strip() == "":
+            # In case there is no input for the model
             st.error("Please enter some text")
         else:
             with st.spinner("Wait for the magic 🪄🔮"):
+                # Use models
+                if option in ("Naive Bayes", "Logistic Regression"): # Use Sklearn pipeline models
                     prediction, predict_proba = nb_lr(model, text)
                     st.session_state["sklearn"] = True
                 else:
+                    prediction, predict_proba = torch_pred(tokenizer, model, text) # Use transformers
                     st.session_state["torch"] = True
             # Store the result in session state
+            st.session_state["color_pred"] = "blue" if prediction == 0 else "red" # Set color for prediction output string
+            prediction = pred_str(prediction) # Map predictions (int => str)
             st.session_state["prediction"] = prediction
             st.session_state["predict_proba"] = predict_proba
             st.session_state["text"] = text
                     html = eli5.format_as_html(explainer.explain_prediction(target_names=["Human", "AI"]))
             else:
                 with st.spinner('Wait for it 💭... BERT-based model explanations take around 4-10 minutes. In case you want to abort, refresh the page.'):
                     def f(x):
+                        """Torch prediction function for the SHAP explainer: returns the log-odds of the class-1 softmax probability for each text in x."""
                         tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x])#.cuda()
                         outputs = model(tv).detach().cpu().numpy()
                         scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
                         val = scipy.special.logit(scores[:,1]) # use one vs rest logit units
                         return val
+                    explainer = shap.Explainer(f, tokenizer) # build explainer using masking tokens and selected transformer-based model
                     shap_values = explainer([st.session_state["text"]], fixed_context=1)
                     html = shap.plots.text(shap_values, display=False)
             # Render HTML