ana-bernal committed on
Commit
2c67c5b
·
1 Parent(s): 332a5b2

Modified app.py

Browse files
Files changed (1) hide show
  1. app.py +257 -0
app.py CHANGED
@@ -1,5 +1,262 @@
1
  import gradio as gr
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
def greet(name):
    """Return the greeting string "Hello <name>!!" for the given name."""
    parts = ("Hello ", name, "!!")
    return "".join(parts)
5
 
 
1
  import gradio as gr
2
 
3
+ # For loading files
4
+ from joblib import dump, load
5
+
6
+ # Model hub
7
+ import tensorflow_hub as hub
8
+
9
+ # Language/text
10
+ import spacy
11
+ from bs4 import BeautifulSoup
12
+ from spacy.symbols import ORTH
13
+
14
+ # for listing tags from binary sequence
15
+ from itertools import compress
16
+
17
#---------------------------------------

# Loading pre-trained artifacts from disk.
path = './trained_models/'
filename_model = 'multinomialNB-use.joblib'
filename_scaler = 'scaler.joblib'

# Pre-trained multinomial Naive Bayes tag classifier.
clf = load(path + filename_model)

# Pre-trained feature scaler applied to embeddings before prediction.
scaler = load(path + filename_scaler)

#------------------------------------------
31
+
32
# Parameters

# Probability threshold above which a tag is considered predicted.
thresh = 0.4

# Candidate tags, in the exact order the classifier outputs its
# probability columns — do not reorder.
tag_list = [
    'c#', 'java', 'javascript', 'python', 'c++', 'ios', 'android',
    '.net', 'html', 'php', 'objective-c', 'jquery', 'c', 'iphone',
    'sql', 'asp.net', 'css', 'linux', 'node.js', 'performance',
    'spring', 'windows', 'swift', 'xcode', 'ruby-on-rails', 'mysql',
    'json', 'sql-server', 'multithreading', 'asp.net-mvc', 'ruby',
    'database', 'wpf', 'unit-testing', 'macos', 'arrays', 'c++11',
    'django',
]
73
+
74
+ # --------------------------------------------
75
+
76
+ # Functions
77
+
78
def remove_code(text):
    """Strip the contents of <code> and <script> elements from HTML text.

    Each matched element is replaced with a single space so that the
    words surrounding it do not run together in the cleaned output.

    Parameters
    ----------
    text : str
        Raw HTML text.

    Returns
    -------
    str
        The HTML with every <code>/<script> element replaced by ' '.
    """
    soup = BeautifulSoup(text, 'lxml')
    # find_all is the current bs4 API (findAll is the deprecated alias);
    # one loop over both tag names replaces the two duplicated loops.
    for tag_name in ('code', 'script'):
        for element in soup.find_all(tag_name):
            element.replace_with(' ')

    return str(soup)
96
+
97
def instantiate_spacy():
    """Load the small English spaCy pipeline into the module-global `nlp`."""
    global nlp
    # Instantiating the English language model used for tokenization.
    nlp = spacy.load("en_core_web_sm")
101
+
102
def import_stopwords():
    """Extend spaCy's default stop words with the project's custom list.

    Reads one stop word per line from ./stopwords/stopwords.txt and
    unions them into ``nlp.Defaults.stop_words``.  Requires
    ``instantiate_spacy()`` to have been called first so that the
    module-global ``nlp`` exists.
    """
    # Explicit encoding avoids platform-dependent default decodings.
    with open('./stopwords/stopwords.txt', encoding='utf-8') as file:
        my_stopwords = {line.rstrip() for line in file}

    # Adding my_stopwords to spacy stopwords
    nlp.Defaults.stop_words = nlp.Defaults.stop_words.union(my_stopwords)
109
+
110
def clean(text, tokenize=False, strict=False, **kwargs):
    """Clean and lemmatize a raw question text.

    Removes <code>/<script> blocks, HTML tags, newlines, non-ASCII
    characters and digits, normalizes spellings such as 'C #' -> 'c#',
    then tokenizes with spaCy, dropping stop words, punctuation and
    whitespace tokens.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) question text.
    tokenize : bool
        If True return the list of lemmatized tokens; if False return
        the tokens joined back into a single string.
    strict : bool
        If True keep only nouns, proper nouns and verbs.
    **kwargs
        If a key 'ent' is present, the result is wrapped in a dict
        {'output': result, 'ents': doc.ents} exposing named entities.
    """
    # Removing <code>some code</code> and <script>...</script>.
    clean_txt = remove_code(text)

    # Removing remaining HTML tags.
    soup = BeautifulSoup(clean_txt, features='html.parser')
    clean_txt = soup.get_text()

    # Removing newline characters.
    clean_txt = clean_txt.replace('\n', ' ')

    # Removing non-ASCII (unicode) characters.
    clean_txt = clean_txt.encode("ascii", "ignore").decode()

    # Removing digits.
    clean_txt = ''.join(char for char in clean_txt if not char.isdigit())

    # Normalizing language-name spellings; the original code repeated
    # the 'C ++' replacement twice — once is sufficient.
    for old, new in (('c ++', 'c++'), ('c #', 'c#'),
                     ('C ++', 'c++'), ('C #', 'c#'), ('C#', 'c#')):
        clean_txt = clean_txt.replace(old, new)

    # Special-case tokenizer rules so multi-part tags survive
    # tokenization as single tokens (DRY'd from eight copies).
    for term in ("c#", ".net", "objective-c", "asp.net", "node.js",
                 "ruby-on-rails", "sql-server", "unit-testing"):
        nlp.tokenizer.add_special_case(term, [{ORTH: term}])

    # Tokenize with spaCy.
    doc = nlp(clean_txt)

    # Keep lemmas, skipping stop words / punctuation / whitespace;
    # in strict mode additionally keep only nouns, proper nouns, verbs.
    def _keep(token):
        if token.is_stop or token.is_punct or token.is_space:
            return False
        return (not strict) or token.pos_ in ('NOUN', 'PROPN', 'VERB')

    tokens = [token.lemma_.lower() for token in doc if _keep(token)]

    clean_txt = ' '.join(tokens)

    # Return either the token list or the joined text.
    result = tokens if tokenize else clean_txt

    # Option for list of entities in output.
    if 'ent' in kwargs:
        result = {'output': result, 'ents': doc.ents}

    return result
202
+
203
def my_pred(X):
    """Scale a USE embedding and predict a binary tag vector.

    The embedding X is passed through the pre-trained scaler, the
    classifier's probabilities are computed, and each probability is
    thresholded at the module-level `thresh`, yielding a flat 0/1
    vector of length len(tag_list).
    """
    # Scaling with the pre-trained scaler.
    scaled = scaler.transform(X)

    # Thresholding the predicted probabilities with the tuned cutoff.
    probabilities = clf.predict_proba(scaled)
    return (probabilities > thresh).astype(int).reshape((len(tag_list),))
218
+
219
+
220
def binary_to_tag_list(binary):
    """Convert a 0/1 sequence into the corresponding list of tag names.

    Positions of `binary` align with the module-level `tag_list`; a
    truthy entry selects the tag at the same index.
    """
    return [tag for tag, keep in zip(tag_list, binary) if keep]
228
+
229
def tag_suggestion(raw_text):
    """Suggest tags for the question text `raw_text`.

    Pipeline: clean the text, embed it with the Universal Sentence
    Encoder, classify the embedding, and map the binary prediction
    back to tag names.
    """
    # Clean the text, then embed the single-document batch with USE.
    cleaned = clean(raw_text)
    embedding = embed([cleaned])

    # Predict a tag set with the classification model.
    prediction = my_pred(embedding)
    return binary_to_tag_list(prediction)
244
+
245
# --------------------------------------------------

# Execution

# Build the spaCy pipeline first, then extend its stop-word list.
instantiate_spacy()
import_stopwords()

# Import and instantiate the Universal Sentence Encoder embedding model.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# --------------------------------------------------
257
+
258
+
259
+
260
def greet(name):
    """Return the greeting string "Hello <name>!!" for the given name."""
    parts = ("Hello ", name, "!!")
    return "".join(parts)
262