Spaces:

ana-bernal
/

StackOverflowTagSuggestion

Runtime error

File size: 8,874 Bytes

import gradio as gr

# For loading files
from joblib import dump, load

# Model hub
import tensorflow_hub as hub

# Language/text
import spacy
from bs4 import BeautifulSoup
from spacy.symbols import ORTH

# for listing tags from binary sequence
from itertools import compress

#------------------------------------------

# Location and file names of the pre-trained artifacts shipped with the app
path = './trained_models/'
filename_model = 'multinomialNB-use.joblib'
filename_scaler = 'scaler.joblib'

# Classifier used by my_pred() to produce per-tag probabilities
clf = load(f'{path}{filename_model}')

# Scaler applied to the sentence embeddings before they reach the classifier
scaler = load(f'{path}{filename_scaler}')

# Probability cut-off: a tag is suggested when its predicted probability
# is strictly greater than this value (see my_pred).
thresh = 0.4

# Candidate tags. The order matters: entry i of the binary prediction
# vector is reported as tag_list[i] (see binary_to_tag_list).
tag_list = [
    'c#', 'java', 'javascript', 'python', 'c++', 'ios', 'android', '.net',
    'html', 'php', 'objective-c', 'jquery', 'c', 'iphone', 'sql', 'asp.net',
    'css', 'linux', 'node.js', 'performance', 'spring', 'windows', 'swift',
    'xcode', 'ruby-on-rails', 'mysql', 'json', 'sql-server',
    'multithreading', 'asp.net-mvc', 'ruby', 'database', 'wpf',
    'unit-testing', 'macos', 'arrays', 'c++11', 'django',
]

# Instantiating language model, english.
# The pipeline is loaded exactly once, through the installed model package.
# (It used to be loaded twice in a row, with the first result thrown away.)
import en_core_web_sm
nlp = en_core_web_sm.load()

# Importing stopwords: one stopword per line, trailing whitespace stripped.
# The encoding is given explicitly so the result does not depend on the
# locale of the machine running the app.
with open('./stopwords/stopwords.txt', encoding='utf-8') as file:
    my_stopwords = {line.rstrip() for line in file}

# Adding my_stopwords to spacy stopwords
# NOTE(review): this rebinds the class attribute to a *new* set after the
# pipeline has already been created -- confirm that token.is_stop really
# reflects the custom words. spaCy's documented way is an in-place update
# (nlp.Defaults.stop_words |= my_stopwords). Left unchanged on purpose so
# the preprocessing stays exactly what it was.
nlp.Defaults.stop_words = nlp.Defaults.stop_words.union(my_stopwords)

# Import and instantiate embedding model (Universal Sentence Encoder v4,
# fetched from TensorFlow Hub); called on a list of strings in
# tag_suggestion().
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Function definitions
def remove_code(text):
    """
    Strips embedded code from an HTML fragment.

    Every "<code> some text </code>" and "<script> some text </script>"
    element (the tag together with its content) is replaced by a single
    space, so that source code does not leak into the natural-language
    text of a question.

    Parameters
        - text : str
            HTML markup, e.g. the title and body of a question.

    Returns
        - str
            The parsed document serialized back to markup. It still
            contains the remaining HTML tags; clean() strips those
            afterwards.
    """
    soup = BeautifulSoup(text, 'lxml')

    # <code> elements are handled first, then <script> elements.
    for tag_name in ('code', 'script'):
        for element in soup.find_all(tag_name):
            # A space (rather than nothing) keeps the words on either
            # side of the removed element from being glued together.
            element.replace_with(' ')

    return str(soup)

# (old, new) pairs applied in this order: they glue back language names
# written with a space ("c ++", "C #") and lower-case "C#".
_LANGUAGE_NAME_FIXES = (
    ('c ++', 'c++'),
    ('c #', 'c#'),
    ('C ++', 'c++'),
    ('C #', 'c#'),
    ('C#', 'c#'),
)

# Tag names registered as tokenizer special cases so that each one is
# emitted as a single token.
_SPECIAL_CASE_TOKENS = (
    "c#",
    ".net",
    "objective-c",
    "asp.net",
    "node.js",
    "ruby-on-rails",
    "sql-server",
    "unit-testing",
)

# Coarse part-of-speech tags kept when strict filtering is requested.
_STRICT_POS = ('NOUN', 'PROPN', 'VERB')


def clean(text, tokenize=False, strict=False, **kwargs):
    """
    Cleans a raw question (HTML allowed) and returns its lemmatized,
    lower-cased tokens, omitting stopwords, punctuation and whitespace.

    Steps, in order: drop <code>/<script> elements, strip the remaining
    HTML tags, turn newlines into spaces, drop non-ASCII characters,
    drop digits (so "c++11" becomes "c++"), normalize spellings such as
    "C #" to "c#", then tokenize and lemmatize with spaCy.

    Parameters:
        - text: str
            Raw text of the question.
        - tokenize: bool
            If True returns the list of tokens, if False returns the
            tokens joined by single spaces into one string.
        - strict: bool
            If True only keeps tokens tagged NOUN, PROPN or VERB.
        - **kwargs:
            If a keyword named 'ent' is present (its value is ignored),
            the result is wrapped together with the entities found by
            spaCy.

    Returns:
        - str or list of str, depending on tokenize; or, when 'ent' is
          passed, a dict {'output': <str or list>, 'ents': doc.ents}.
    """
    # Removing <code>some code</code> and <script>some code</script>
    clean_txt = remove_code(text)

    # Removing HTML tags
    clean_txt = BeautifulSoup(clean_txt, features='html.parser').get_text()

    # Removing new line characters
    clean_txt = clean_txt.replace('\n', ' ')

    # Removing non-ASCII characters
    clean_txt = clean_txt.encode("ascii", "ignore").decode()

    # Removing digits
    clean_txt = ''.join(char for char in clean_txt if not char.isdigit())

    # Replacing 'c ++' and 'c #' for 'c++' and 'c#' and others
    for old, new in _LANGUAGE_NAME_FIXES:
        clean_txt = clean_txt.replace(old, new)

    # Adding special case rules so these tag names stay in one piece.
    # NOTE(review): the same rules are re-registered on every call;
    # registering them once at start-up would avoid the repeated work.
    for token_text in _SPECIAL_CASE_TOKENS:
        nlp.tokenizer.add_special_case(token_text, [{ORTH: token_text}])

    # Tokenize with spacy
    doc = nlp(clean_txt)

    # Keep content tokens only; with strict, additionally filter on the
    # part-of-speech tag.
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and (not strict or token.pos_ in _STRICT_POS)
    ]

    # Return the tokens themselves or the joined text
    result = tokens if tokenize else ' '.join(tokens)

    # Option for list of entities in output
    if 'ent' in kwargs:
        result = {'output': result, 'ents': doc.ents}

    return result

def my_pred(X):
    """
    Predicts the tag indicators for an embedding X produced by the USE
    model.

    X is first passed through the pre-trained scaler, then the
    classifier's probabilities are compared against the module-level
    threshold `thresh`.

    Returns a flat integer array with len(tag_list) entries, 1 where
    the probability is strictly above the threshold and 0 elsewhere.
    The reshape requires the probabilities to hold exactly
    len(tag_list) values (presumably one document at a time -- confirm
    against the classifier's output shape).
    """
    scaled = scaler.transform(X)

    # Boolean mask of the tags whose probability exceeds the threshold
    above_thresh = clf.predict_proba(scaled) > thresh

    return above_thresh.astype(int).reshape((len(tag_list),))

def binary_to_tag_list(binary):
    """
    Maps a sequence of 0/1 indicators to the corresponding tag names.

    The i-th tag of tag_list is kept when the i-th entry of `binary`
    is truthy. Returns the selected tags (str) as a list, in tag_list
    order.
    """
    return [tag for tag, flag in zip(tag_list, binary) if flag]

def tag_suggestion(raw_text):
    """
    Suggests tags for the question raw_text.

    The text is cleaned, embedded with the USE model as a one-document
    batch, classified, and the resulting binary vector is translated
    into the list of suggested tag names, which is returned.
    """
    # The embedding model is called on a list of documents; we have one
    embedding = embed([clean(raw_text)])

    # Binary indicators from the classifier, turned into tag names
    return binary_to_tag_list(my_pred(embedding))

# --------------------------------------------------

# Sample questions offered in the interface. Each entry is a one-element
# list holding the raw question text (title followed by the HTML body),
# i.e. the same kind of string that tag_suggestion() receives.
examples = [
    ["Jquery/Javascript Opacity animation with scroll <p>I'm looking to change the opacity on an object (and have the transition be animated) based on a users scroll.\nexample(http://davegamache.com/)</p>\n\n<p>I've searched everywhere\nlike here, but it ends up pointing me to the waypoints plugin (http://stackoverflow.com/questions/6316757/opacity-based-on-scroll-position)</p>\n\n<p>I've implemented the [waypoints][1] plugin and have the object fading once it's higher than 100px. [Using the offet attribute] but would like to basically control the opacity of an object and have the animation be visible like the above example.</p>\n\n<p>I've searched all over- this is my last resort.\nAny help is greatly appreciated.</p>\n"],
    ['Setting cross-domain cookies in Safari <p>I have to call domain A.com (which sets the cookies with http) from domain B.com.\nAll I do on domain B.com is (javascript): </p>\n\n<pre><code>var head = document.getElementsByTagName("head")[0];\nvar script = document.createElement("script");\nscript.src = "A.com/setCookie?cache=1231213123";\nhead.appendChild(script);\n</code></pre>\n\n<p>This sets the cookie on A.com on every browser I\'ve tested, except Safari.\nAmazingly this works in IE6, even without the P3P headers.</p>\n\n<p>Is there any way to make this work in Safari?</p>\n'],
    ['Database migrations for SQL Server <p>I need a database migration framework for SQL Server, capable of managing both schema changes and data migrations.</p>\n\n<p>I guess I am looking for something similar to django\'s <a href="http://south.aeracode.org/" rel="noreferrer">South</a> framework here.</p>\n\n<p>Given the fact that South is tightly coupled with django\'s ORM, and the fact that there\'s so many ORMs for SQL Server I guess having just a generic migration framework, enabling you to write and execute in controlled and sequential manner SQL data/schema change scripts should be sufficient.</p>\n'],
]

# Gradio interface: one text input (the question), one text output (the
# suggested tags), with the sample questions above offered as examples.
demo = gr.Interface(
    fn=tag_suggestion,
    inputs="text",
    outputs=["text"],
    examples=examples,
)


# Start the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()