Spaces:

butterswords
/

nlc-explorer

Running

App Files Files Community

Nathan Butters committed on Jun 17, 2022

Commit

bdb6cd4

1 Parent(s): 9d6f821

abs_diff attempt 0

Browse files

Files changed (4) hide show

.DS_Store +0 -0
.ipynb_checkpoints/NLselector-checkpoint.py +37 -13
Assets/.DS_Store +0 -0
NLselector.py +1 -1

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.ipynb_checkpoints/NLselector-checkpoint.py CHANGED Viewed

@@ -1,7 +1,6 @@
 #Import the libraries we know we'll need for the Generator.
 import pandas as pd, spacy, nltk, numpy as np, re
 from spacy.matcher import Matcher
-#!python -m spacy download en_core_web_md #Not sure if we need this so I'm going to keep it just in case
 nlp = spacy.load("en_core_web_lg")
 import altair as alt
 import streamlit as st
@@ -14,6 +13,9 @@ import torch
 import torch.nn.functional as F
 from lime.lime_text import LimeTextExplainer
 class_names = ['negative', 'positive']
 explainer = LimeTextExplainer(class_names=class_names)
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
@@ -27,6 +29,10 @@ def predictor(texts):
 @st.experimental_singleton
 def critical_words(document, options=False):
     if type(document) is not spacy.tokens.doc.Doc:
         document = nlp(document)
     chunks = list(document.noun_chunks)
@@ -43,6 +49,31 @@ def critical_words(document, options=False):
         lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
     #Identify what we care about "parts of speech"
     for chunk in chunks:
         #The use of chunk[-1] is due to testing that it appears to always match the root
         root = chunk[-1]
@@ -58,7 +89,7 @@ def critical_words(document, options=False):
                     #creates a span for the entirety of the compound noun and adds it to the list.
                     span = -1 * (1 + len(compound))
                     pos_options.append(chunk[span:].text)
-                    cur_values + [token.text for token in chunk if token.pos_ == "ADJ"]
                 else:
                     print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
             else:
@@ -67,21 +98,14 @@ def critical_words(document, options=False):
                 pos_options.extend(cur_values)
                 print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
         elif len(chunk) >= 1:
-            cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ"]]
             if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
                 pos_options.extend(cur_values)
                 print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
         else:
             print(f"No options added for \'{chunk.text}\' ")
-        # Here I am going to try to pick up pronouns, which are people, and Adjectival Complements.
-    for token in document:
-        if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
-            #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
-            if (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj"]):
-                pos_options.append(token.text)
-            elif (token.pos_ == "PRON") and (len(token.morph) !=0):
-                if (token.morph.get("PronType") == "Prs"):
-                    pos_options.append(token.text)
     if options:
         return pos_options, lime_results
@@ -157,6 +181,7 @@ def abs_dif(df,seed):
     text2 = Nearest Prediction
     text3 = Farthest Prediction'''
     target = df[df['Words'] == seed].pred.iloc[0]
     sub_df = df[df['Words'] != seed].reset_index()
     nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
@@ -186,7 +211,6 @@ def gen_cf_country(df,_document,selection):
     df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
     return df
 def gen_cf_profession(df,_document,selection):
     category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
     df = df[df.Major == category]

 #Import the libraries we know we'll need for the Generator.
 import pandas as pd, spacy, nltk, numpy as np, re
 from spacy.matcher import Matcher
 nlp = spacy.load("en_core_web_lg")
 import altair as alt
 import streamlit as st
 import torch.nn.functional as F
 from lime.lime_text import LimeTextExplainer
+#Import WNgen.py
+from WNgen import *
 class_names = ['negative', 'positive']
 explainer = LimeTextExplainer(class_names=class_names)
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
 @st.experimental_singleton
 def critical_words(document, options=False):
+    '''This function is meant to select the critical part of a sentence. Critical, in this context, means
+    the part of the sentence that is either: A) a NOUN or PROPN from the correct entity group, B) a NOUN,
+    C) a NOUN + ADJ combination, or D) ADJ and PROPN used to modify other NOUN tokens.
+    It also checks this against what the model thinks is important if the user defines "options" as "LIME" or True.'''
     if type(document) is not spacy.tokens.doc.Doc:
         document = nlp(document)
     chunks = list(document.noun_chunks)
         lime_results = pd.DataFrame(lime_results, columns=["Word","Weight"])
     #Identify what we care about "parts of speech"
+    # Here I am going to try to pick up pronouns, which are people, and Adjectival Complements.
+    for token in document:
+        if (token.text not in pos_options) and ((token.text in lime_options) or (options == False)):
+            #print(f"executed {token.text} with {token.pos_} and {token.dep_}") #QA
+            if (token.pos_ in ["ADJ","PROPN"]) and (token.dep_ in ["compound", "amod"]) and (document[token.i - 1].dep_ in ["compound", "amod"]):
+                compound = document[token.i - 1: token.i +1].text
+                pos_options.append(compound)
+                print(f'Added {compound} based on "amod" and "compound" adjectives.')
+            elif (token.pos_ in ["NOUN"]) and (token.dep_ in ["compound", "amod", "conj"]) and (document[token.i - 1].dep_ in ["compound"]):
+                compound = document[token.i - 1: token.i +1].text
+                pos_options.append(compound)
+                print(f'Added {compound} based on "amod" and "compound" and "conj" nouns.')
+            elif (token.pos_ == "PROPN") and (token.dep_ in ["prep","amod"]):
+                pos_options.append(token.text)
+                print(f"Added '{token.text}' based on their adjectival state.")
+            elif (token.pos_ == "ADJ") and (token.dep_ in ["acomp","conj","amod"]):
+                pos_options.append(token.text)
+                print(f"Added '{token.text}' based on their adjectival state.")
+            elif (token.pos_ == "PRON") and (len(token.morph) !=0):
+                if (token.morph.get("PronType") == "Prs"):
+                    pos_options.append(token.text)
+                    print(f"Added '{token.text}' because it's a human pronoun.")
+    #Noun Chunks parsing
     for chunk in chunks:
         #The use of chunk[-1] is due to testing that it appears to always match the root
         root = chunk[-1]
                     #creates a span for the entirety of the compound noun and adds it to the list.
                     span = -1 * (1 + len(compound))
                     pos_options.append(chunk[span:].text)
+                    cur_values + [token.text for token in chunk if token.pos_ in ["ADJ","NOUN","PROPN"]]
                 else:
                     print(f"The elmenents in {compound} could not be added to the final list because they are not all relevant to the model.")
             else:
                 pos_options.extend(cur_values)
                 print(f"From {chunk.text}, {cur_values} added to pos_options due to entity recognition.") #for QA
         elif len(chunk) >= 1:
+            cur_values = [token.text for token in chunk if token.pos_ in ["NOUN","ADJ","PROPN"]]
             if (all(elem in lime_options for elem in cur_values) and (options is True)) or ((options is False)):
                 pos_options.extend(cur_values)
                 print(f"From {chunk.text}, {cur_values} added to pos_options due to wildcard.") #for QA
         else:
             print(f"No options added for \'{chunk.text}\' ")
+    pos_options = list(set(pos_options))
     if options:
         return pos_options, lime_results
     text2 = Nearest Prediction
     text3 = Farthest Prediction'''
+    #seed = process_text(seed)
     target = df[df['Words'] == seed].pred.iloc[0]
     sub_df = df[df['Words'] != seed].reset_index()
     nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]
     df['similarity'] = df.Words.apply(lambda x: nlp(selection).similarity(nlp(x)))
     return df
 def gen_cf_profession(df,_document,selection):
     category = df.loc[df['Words'] == selection, 'Major'].iloc[0]
     df = df[df.Major == category]

Assets/.DS_Store CHANGED Viewed

Binary files a/Assets/.DS_Store and b/Assets/.DS_Store differ

NLselector.py CHANGED Viewed

@@ -181,7 +181,7 @@ def abs_dif(df,seed):
     text2 = Nearest Prediction
     text3 = Farthest Prediction'''
-    seed = process_text(seed)
     target = df[df['Words'] == seed].pred.iloc[0]
     sub_df = df[df['Words'] != seed].reset_index()
     nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]

     text2 = Nearest Prediction
     text3 = Farthest Prediction'''
+    #seed = process_text(seed)
     target = df[df['Words'] == seed].pred.iloc[0]
     sub_df = df[df['Words'] != seed].reset_index()
     nearest_prediction = sub_df.pred[(sub_df.pred-target).abs().argsort()[:1]]