Spaces:

ana-bernal
/

StackOverflowTagSuggestion

Runtime error

App Files Files Community

StackOverflowTagSuggestion / app.py

ana-bernal

not sure what changed

670b215 almost 3 years ago

raw

history blame contribute delete

8.87 kB

	import gradio as gr

	# For loading files
	from joblib import dump, load

	# Model hub
	import tensorflow_hub as hub

	# Language/text
	import spacy
	from bs4 import BeautifulSoup
	from spacy.symbols import ORTH

	# for listing tags from binary sequence
	from itertools import compress

	#------------------------------------------

	# Directory and file names of the pre-trained artifacts
	path = './trained_models/'
	filename_model = 'multinomialNB-use.joblib'
	filename_scaler = 'scaler.joblib'

	# Classifier used by my_pred() to compute per-tag probabilities
	clf = load(f'{path}{filename_model}')

	# Scaler applied by my_pred() to the embedding before classification
	scaler = load(f'{path}{filename_scaler}')

	# Probability cut-off: my_pred() marks a tag as predicted when its
	# probability is strictly greater than this value.
	thresh = 0.4

	# Candidate tags. my_pred() and binary_to_tag_list() rely on this length
	# and order, so both must stay unchanged.
	tag_list = [
	    'c#', 'java', 'javascript', 'python', 'c++',
	    'ios', 'android', '.net', 'html', 'php',
	    'objective-c', 'jquery', 'c', 'iphone', 'sql',
	    'asp.net', 'css', 'linux', 'node.js', 'performance',
	    'spring', 'windows', 'swift', 'xcode', 'ruby-on-rails',
	    'mysql', 'json', 'sql-server', 'multithreading', 'asp.net-mvc',
	    'ruby', 'database', 'wpf', 'unit-testing', 'macos',
	    'arrays', 'c++11', 'django',
	]

	# Instantiating language model, english.
	# The pipeline is loaded a single time, through the installed model
	# package, so start-up does not pay for a second, discarded load.
	import en_core_web_sm
	nlp = en_core_web_sm.load()

	# Importing stopwords: one entry per line, trailing whitespace stripped.
	# The encoding is explicit so the result does not depend on the locale
	# (assumes the file is UTF-8/ASCII -- TODO confirm).
	with open('./stopwords/stopwords.txt', encoding='utf-8') as file:
	    my_stopwords = {line.rstrip() for line in file}

	# Adding my_stopwords to spacy stopwords
	# NOTE(review): this rebinds Defaults.stop_words to a new set after the
	# pipeline has been created; confirm that token.is_stop really picks up
	# the added words (spaCy may keep using the set it saw at load time).
	nlp.Defaults.stop_words = nlp.Defaults.stop_words.union(my_stopwords)

	# Import and instantiate embedding model (Universal Sentence Encoder v4)
	embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

	# Function definitions
	def remove_code(text):
	    """
	    Removes "<code> some text </code>" from a text,
	    or "<script> some text </script>".

	    Every <code> element, then every <script> element, is replaced
	    (tag and contents) by a single space.

	    Parameters
	    - text : str
	        Markup to process.

	    Returns
	    - str
	        The document serialized back to a string by BeautifulSoup
	        after the replacements.
	    """
	    soup = BeautifulSoup(text, 'lxml')

	    # Two passes, <code> first and <script> second, as separate searches.
	    for tag_name in ('code', 'script'):
	        # find_all is the current spelling of the legacy findAll alias.
	        for element in soup.find_all(tag_name):
	            element.replace_with(' ')

	    return str(soup)

	# Technology names that the tokenizer must keep as single tokens.
	_SPECIAL_CASE_TOKENS = (
	    "c#",
	    ".net",
	    "objective-c",
	    "asp.net",
	    "node.js",
	    "ruby-on-rails",
	    "sql-server",
	    "unit-testing",
	)

	# Spelling normalizations applied, in this order, before tokenizing.
	# NOTE(review): 'C ++' used to be listed twice; the second occurrence was a
	# no-op. If 'C++' -> 'c++' was intended, confirm its effect on tokenization
	# before adding it.
	_SPELLING_FIXES = (
	    ('c ++', 'c++'),
	    ('c #', 'c#'),
	    ('C ++', 'c++'),
	    ('C #', 'c#'),
	    ('C#', 'c#'),
	)

	# Part-of-speech tags kept when clean() is called with strict=True.
	_STRICT_POS = ('NOUN', 'PROPN', 'VERB')

	# Becomes True once the special cases have been added to nlp.tokenizer.
	_special_cases_registered = False


	def _register_special_cases():
	    """Add the special-case rules to nlp.tokenizer the first time only."""
	    global _special_cases_registered
	    if _special_cases_registered:
	        return
	    for token_text in _SPECIAL_CASE_TOKENS:
	        nlp.tokenizer.add_special_case(token_text, [{ORTH: token_text}])
	    _special_cases_registered = True


	def _is_content_token(token):
	    """Return True unless the token is a stopword, punctuation or space."""
	    return not (token.is_stop or token.is_punct or token.is_space)


	def clean(text, tokenize=False, strict=False, **kwargs):
	    """
	    Cleans a raw question and returns its lemmatized, lower-cased tokens,
	    omitting stopwords, punctuation and whitespace tokens.

	    Steps: remove <code>/<script> elements, strip the remaining HTML tags,
	    replace new lines by spaces, drop non-ASCII characters and digits,
	    normalize a few 'c++' / 'c#' spellings, then tokenize with spacy.

	    Parameters:
	    - text: str
	    - tokenize: bool
	        If True returns the list of tokens, if False returns the tokens
	        joined by single spaces into one string.
	    - strict: bool
	        If True only keeps tokens tagged NOUN, PROPN or VERB.
	    - **kwargs:
	        If the keyword 'ent' is present (whatever its value), the result
	        is wrapped in a dictionary, see Returns.

	    Returns:
	    - str or list of str, depending on tokenize; or, when 'ent' is given,
	      a dictionary {'output': <str or list>, 'ents': doc.ents}.
	    """
	    # Removing <code>some code</code>
	    clean_txt = remove_code(text)

	    # Removing HTML tags
	    clean_txt = BeautifulSoup(clean_txt, features='html.parser').get_text()

	    # Removing new line character: \n
	    clean_txt = clean_txt.replace('\n', ' ')

	    # Removing non-ASCII characters
	    clean_txt = clean_txt.encode("ascii", "ignore").decode()

	    # Removing digits
	    clean_txt = ''.join(char for char in clean_txt if not char.isdigit())

	    # Replacing 'c ++' and 'c #' for 'c++' and 'c#' and others
	    for old, new in _SPELLING_FIXES:
	        clean_txt = clean_txt.replace(old, new)

	    # Special-case rules are registered once instead of on every call.
	    _register_special_cases()

	    # Tokenize with spacy
	    doc = nlp(clean_txt)

	    tokens = [
	        token.lemma_.lower()
	        for token in doc
	        if (not strict or token.pos_ in _STRICT_POS)
	        and _is_content_token(token)
	    ]

	    # Return either the token list or the joined text
	    result = tokens if tokenize else ' '.join(tokens)

	    # Option for list of entities in output
	    if 'ent' in kwargs:
	        result = {'output': result, 'ents': doc.ents}

	    return result

	def my_pred(X):
	    """
	    Predicts tags for an embedding X obtained from the USE model.

	    X is first transformed with the pre-trained scaler, then the
	    classifier's probabilities are compared with thresh.

	    Returns an array of 0/1 integers of shape (len(tag_list),); the
	    reshape requires the thresholded probabilities to contain exactly
	    len(tag_list) values.
	    """
	    # Probabilities from the classifier on the scaled embedding
	    probabilities = clf.predict_proba(scaler.transform(X))

	    # 1 where the probability is strictly above the threshold, else 0
	    above_threshold = probabilities > thresh
	    return above_threshold.astype(int).reshape((len(tag_list),))

	def binary_to_tag_list(binary):
	    """
	    Returns the tags (str) of tag_list whose position holds a truthy
	    value in the binary sequence.
	    """
	    # Convert every entry to a plain bool up front
	    flags = list(map(bool, binary))

	    # Pair tags with flags position by position and keep the flagged ones
	    return [tag for tag, keep in zip(tag_list, flags) if keep]

	def tag_suggestion(raw_text):
	    """
	    Suggests tags for the question raw_text and returns them as a list.

	    Pipeline: clean the text, embed it with USE, predict a binary tag
	    vector with my_pred and convert it to tag names.
	    """
	    # The embedding model receives a one-document batch
	    embedding = embed([clean(raw_text)])

	    # Binary prediction -> tag names
	    return binary_to_tag_list(my_pred(embedding))

	# --------------------------------------------------

	# Sample inputs handed to gr.Interface through its examples= argument.
	# Each entry is a one-element list holding the text for the single input;
	# the texts appear to be a question title followed by its HTML body.
	examples = [
	["Jquery/Javascript Opacity animation with scroll <p>I'm looking to change the opacity on an object (and have the transition be animated) based on a users scroll.\nexample(http://davegamache.com/)</p>\n\n<p>I've searched everywhere\nlike here, but it ends up pointing me to the waypoints plugin (http://stackoverflow.com/questions/6316757/opacity-based-on-scroll-position)</p>\n\n<p>I've implemented the [waypoints][1] plugin and have the object fading once it's higher than 100px. [Using the offet attribute] but would like to basically control the opacity of an object and have the animation be visible like the above example.</p>\n\n<p>I've searched all over- this is my last resort.\nAny help is greatly appreciated.</p>\n"],
	['Setting cross-domain cookies in Safari <p>I have to call domain A.com (which sets the cookies with http) from domain B.com.\nAll I do on domain B.com is (javascript): </p>\n\n<pre><code>var head = document.getElementsByTagName("head")[0];\nvar script = document.createElement("script");\nscript.src = "A.com/setCookie?cache=1231213123";\nhead.appendChild(script);\n</code></pre>\n\n<p>This sets the cookie on A.com on every browser I\'ve tested, except Safari.\nAmazingly this works in IE6, even without the P3P headers.</p>\n\n<p>Is there any way to make this work in Safari?</p>\n'],
	['Database migrations for SQL Server <p>I need a database migration framework for SQL Server, capable of managing both schema changes and data migrations.</p>\n\n<p>I guess I am looking for something similar to django\'s <a href="http://south.aeracode.org/" rel="noreferrer">South</a> framework here.</p>\n\n<p>Given the fact that South is tightly coupled with django\'s ORM, and the fact that there\'s so many ORMs for SQL Server I guess having just a generic migration framework, enabling you to write and execute in controlled and sequential manner SQL data/schema change scripts should be sufficient.</p>\n'],
	]

	# Gradio interface: one text input, one text output, backed by
	# tag_suggestion and pre-filled with the sample questions.
	demo = gr.Interface(
	    fn=tag_suggestion,
	    inputs="text",
	    outputs=["text"],
	    examples=examples,
	)


	# Start the app only when the file is run as a script
	if __name__ == "__main__":
	    demo.launch()