# ana-bernal's picture
# not sure what changed
# 670b215
import gradio as gr
# For loading files
from joblib import dump, load
# Model hub
import tensorflow_hub as hub
# Language/text
import spacy
from bs4 import BeautifulSoup
from spacy.symbols import ORTH
# for listing tags from binary sequence
from itertools import compress
#------------------------------------------
# Locations of the serialized artifacts
path = './trained_models/'
filename_model = 'multinomialNB-use.joblib'
filename_scaler = 'scaler.joblib'

# Deserialize the pre-trained scaler and the pre-trained classifier.
# NOTE: joblib's load unpickles its input, so these paths must only ever
# point at trusted files.
scaler = load(f"{path}{filename_scaler}")
clf = load(f"{path}{filename_model}")
# Decision threshold: a tag is suggested when its predicted probability
# is strictly greater than this value.
thresh = 0.4

# Tags the model can suggest. The order is significant: position i of this
# list is paired with position i of the binary prediction vector.
tag_list = [
    'c#', 'java', 'javascript', 'python', 'c++',
    'ios', 'android', '.net', 'html', 'php',
    'objective-c', 'jquery', 'c', 'iphone', 'sql',
    'asp.net', 'css', 'linux', 'node.js', 'performance',
    'spring', 'windows', 'swift', 'xcode', 'ruby-on-rails',
    'mysql', 'json', 'sql-server', 'multithreading', 'asp.net-mvc',
    'ruby', 'database', 'wpf', 'unit-testing', 'macos',
    'arrays', 'c++11', 'django',
]
# Instantiating language model, english.
# The pipeline is loaded a single time from its installed package; a second,
# discarded load of the same model only slowed start-up.
import en_core_web_sm
nlp = en_core_web_sm.load()

# Importing stopwords: one stopword per line, trailing whitespace stripped.
# The encoding is explicit so the result does not depend on the platform locale.
with open('./stopwords/stopwords.txt', encoding='utf-8') as file:
    my_stopwords = {line.rstrip() for line in file}

# Adding my_stopwords to spacy stopwords
# NOTE(review): union() builds a new set and rebinds the class attribute
# instead of extending the existing set in place; confirm that the already
# loaded pipeline's token.is_stop actually reflects the added words.
nlp.Defaults.stop_words = nlp.Defaults.stop_words.union(my_stopwords)

# Import and instantiate embedding model (Universal Sentence Encoder v4)
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Function definitions
def remove_code(text):
    """
    Removes "<code> some text </code>" and "<script> some text </script>"
    elements from a text, replacing each of them with a single space.

    Parameters
    - text : str
        Markup to process; it is parsed with the 'lxml' parser.

    Returns
    - str
        The parsed document serialized back to a string, without the
        removed elements. All other markup is kept.
    """
    soup = BeautifulSoup(text, 'lxml')
    # find_all is the current name of the deprecated findAll alias; passing a
    # list of names collects both kinds of element in a single pass.
    for element in soup.find_all(['code', 'script']):
        element.replace_with(' ')
    return str(soup)
# Tokens that must be kept as single units by the tokenizer
_SPECIAL_CASE_TOKENS = (
    "c#", ".net", "objective-c", "asp.net",
    "node.js", "ruby-on-rails", "sql-server", "unit-testing",
)
# Spacing/case variants normalized before tokenization, applied in this order
_LANGUAGE_NAME_FIXES = (
    ('c ++', 'c++'),
    ('c #', 'c#'),
    ('C ++', 'c++'),
    ('C #', 'c#'),
    ('C#', 'c#'),
)
# Parts of speech kept when strict filtering is requested
_STRICT_POS = ('NOUN', 'PROPN', 'VERB')
# Tokenizer object that has already received the special-case rules
_tokenizer_with_special_cases = None


def _register_special_cases():
    """Adds the special-case rules to nlp.tokenizer, once per tokenizer."""
    global _tokenizer_with_special_cases
    tokenizer = nlp.tokenizer
    if tokenizer is _tokenizer_with_special_cases:
        return
    for token_text in _SPECIAL_CASE_TOKENS:
        tokenizer.add_special_case(token_text, [{ORTH: token_text}])
    _tokenizer_with_special_cases = tokenizer


def clean(text,tokenize=False,strict=False, **kwargs):
    """
    Cleans a text and returns its lemmatized, lower-cased tokens, omitting
    stopwords, punctuation and whitespace tokens.

    Steps: <code>/<script> elements are removed, remaining HTML tags are
    stripped, newlines become spaces, non-ASCII characters and digits are
    dropped, a few spellings of 'c++' / 'c#' are normalized, and the text
    is tokenized with spaCy.

    Parameters:
    - text: str
    - tokenize: bool
        If True returns the list of tokens, if False returns the tokens
        joined by single spaces as one string.
    - strict: bool
        If True only keeps tokens tagged NOUN, PROPN or VERB.
    - **kwargs:
        If the key 'ent' is present (whatever its value), the result is
        wrapped in a dict {'output': <str or list>, 'ents': doc.ents}.

    Returns:
    - str, list of str, or dict (see tokenize and **kwargs above).
    """
    # Removing <code>some code</code>
    clean_txt = remove_code(text)

    # Removing HTML tags
    clean_txt = BeautifulSoup(clean_txt, features='html.parser').get_text()

    # Removing new line character: \n
    clean_txt = clean_txt.replace('\n', ' ')

    # Removing unicode characters (anything that is not ASCII is dropped)
    clean_txt = clean_txt.encode("ascii", "ignore").decode()

    # Removing digits
    clean_txt = ''.join(char for char in clean_txt if not char.isdigit())

    # Replacing 'c ++' and 'c #' for 'c++' and 'c#' and others.
    # NOTE(review): the 'C ++' rule used to be listed twice (the second one
    # was a no-op), so a bare 'C++' is left untouched here; confirm whether
    # a 'C++' -> 'c++' rule was intended.
    for variant, normalized in _LANGUAGE_NAME_FIXES:
        clean_txt = clean_txt.replace(variant, normalized)

    # The special-case rules never change, so they are registered once
    # instead of on every call.
    _register_special_cases()

    # Tokenize with spacy
    doc = nlp(clean_txt)

    # Keep informative tokens only; strict additionally filters by POS tag
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and (not strict or token.pos_ in _STRICT_POS)
    ]

    # Ask if return text or tokens
    result = tokens if tokenize else ' '.join(tokens)

    # Option for list of entities in output
    if 'ent' in kwargs:
        result = {'output':result, 'ents': doc.ents}
    return result
def my_pred(X):
    """
    Predicts the tag indicators for an embedding X obtained from the USE
    model.

    X is first transformed with the pre-trained scaler, then passed to the
    pre-trained classifier. A tag is switched on when its predicted
    probability is strictly greater than thresh.

    Returns a flat 0/1 integer vector with len(tag_list) entries.
    """
    # Apply the pre-trained scaler before classifying
    scaled = scaler.transform(X)
    # Probabilities from the pre-trained classifier
    probabilities = clf.predict_proba(scaled)
    # Threshold the probabilities and encode the outcome as 0/1 integers
    above_thresh = probabilities > thresh
    flags = above_thresh.astype(int)
    # Flatten to one entry per tag. The reshape needs exactly len(tag_list)
    # values, so presumably X holds a single document - confirm with callers.
    return flags.reshape((len(tag_list),))
def binary_to_tag_list(binary):
    """
    Maps a binary sequence to the tags (str) it selects.

    Entry i of binary is paired with entry i of tag_list; the tag is kept
    when that entry is truthy. Pairing stops at the shorter of the two.
    """
    return [tag for tag, flag in zip(tag_list, binary) if flag]
def tag_suggestion(raw_text):
    """
    Suggests tags for the question raw_text and returns them as a list
    of str.
    """
    # The embedding model receives a batch holding the single cleaned question
    embedding = embed([clean(raw_text)])
    # Classify the embedding, then translate the 0/1 vector into tag names
    return binary_to_tag_list(my_pred(embedding))
# --------------------------------------------------
# Sample inputs for the interface. Each entry is a one-element list (one
# value per input component) holding a question title followed by its HTML
# body, i.e. the same raw format that tag_suggestion receives.
examples = [
["Jquery/Javascript Opacity animation with scroll <p>I'm looking to change the opacity on an object (and have the transition be animated) based on a users scroll.\nexample(http://davegamache.com/)</p>\n\n<p>I've searched everywhere\nlike here, but it ends up pointing me to the waypoints plugin (http://stackoverflow.com/questions/6316757/opacity-based-on-scroll-position)</p>\n\n<p>I've implemented the [waypoints][1] plugin and have the object fading once it's higher than 100px. [Using the offet attribute] but would like to basically control the opacity of an object and have the animation be visible like the above example.</p>\n\n<p>I've searched all over- this is my last resort.\nAny help is greatly appreciated.</p>\n"],
['Setting cross-domain cookies in Safari <p>I have to call domain A.com (which sets the cookies with http) from domain B.com.\nAll I do on domain B.com is (javascript): </p>\n\n<pre><code>var head = document.getElementsByTagName("head")[0];\nvar script = document.createElement("script");\nscript.src = "A.com/setCookie?cache=1231213123";\nhead.appendChild(script);\n</code></pre>\n\n<p>This sets the cookie on A.com on every browser I\'ve tested, except Safari.\nAmazingly this works in IE6, even without the P3P headers.</p>\n\n<p>Is there any way to make this work in Safari?</p>\n'],
['Database migrations for SQL Server <p>I need a database migration framework for SQL Server, capable of managing both schema changes and data migrations.</p>\n\n<p>I guess I am looking for something similar to django\'s <a href="http://south.aeracode.org/" rel="noreferrer">South</a> framework here.</p>\n\n<p>Given the fact that South is tightly coupled with django\'s ORM, and the fact that there\'s so many ORMs for SQL Server I guess having just a generic migration framework, enabling you to write and execute in controlled and sequential manner SQL data/schema change scripts should be sufficient.</p>\n'],
]
# Gradio interface: one text input handed to tag_suggestion, one text output,
# with the sample questions offered as examples.
demo = gr.Interface(
    fn=tag_suggestion,
    inputs="text",
    outputs=["text"],
    examples=examples,
)

# Start the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()