Spaces:

felipekitamura
/

word_embeddings

Build error

App Files Files Community

word_embeddings / app.py

felipekitamura

Update app.py

34222dc verified 12 months ago

raw

history blame contribute delete

3.05 kB

	import gensim.downloader
	import gradio as gr
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.decomposition import PCA
	from sklearn.manifold import TSNE
	# Embedding model fetched through gensim's downloader when the module is loaded.
	# The rest of this file indexes it by word (model[word]) and queries it with
	# model.similar_by_vector(...).
	model = gensim.downloader.load("word2vec-google-news-300") # presumably an alternative model name that was tried: glove-wiki-gigaword-50

	# Fixed output path: plot_reduced_data() saves the figure here and inference()
	# returns this same path, so every call overwrites the previous image.
	cache = "/home/user/app/d.png"

	# Function to reduce dimensions
	def reduce_dimensions(data, method='PCA'):
	    """Project ``data`` onto two components using PCA or t-SNE.

	    Args:
	        data: Samples to project; handed unchanged to the selected
	            estimator's ``fit_transform``.
	        method: ``'PCA'`` (default) or ``'TSNE'``.

	    Returns:
	        The result of ``fit_transform(data)`` for the selected estimator,
	        configured with ``n_components=2``.

	    Raises:
	        ValueError: If ``method`` is neither ``'PCA'`` nor ``'TSNE'``.
	    """
	    # The estimator is called ``reducer`` (not ``model``) so it cannot be
	    # confused with, or shadow, the module-level embedding model.
	    if method == 'PCA':
	        reducer = PCA(n_components=2)
	    elif method == 'TSNE':
	        # NOTE(review): perplexity=4 is presumably chosen because callers pass
	        # only a handful of samples; scikit-learn requires perplexity to be
	        # smaller than the number of samples -- confirm before reusing.
	        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=4)
	    else:
	        # Previously an unknown method fell through and failed with an
	        # UnboundLocalError on the return line; fail early and clearly instead.
	        raise ValueError(
	            "Unknown method {!r}; expected 'PCA' or 'TSNE'.".format(method)
	        )
	    return reducer.fit_transform(data)

	# Plotting function
	def plot_reduced_data(reduced_data, labels, title):
	    """Draw labelled 2-D points plus two arrows and save the image to ``cache``.

	    Args:
	        reduced_data: 2-D array indexed as ``reduced_data[row, column]``;
	            columns 0 and 1 are used as the x and y coordinates.
	        labels: Iterable of strings; the i-th label annotates row ``i``.
	        title: Title placed above the plot.

	    Returns:
	        None. The figure is written to the module-level ``cache`` path.

	    The two green arrows connect row 0 -> row 1 and row 2 -> last row.
	    They are only drawn when at least three rows are present, since the
	    second arrow needs row 2.
	    """
	    # Use an explicit Figure/Axes pair instead of pyplot's implicit "current
	    # figure", and always close it: every call used to leave one more open
	    # figure behind, which accumulates in a long-running app.
	    fig, ax = plt.subplots(figsize=(10, 8))
	    try:
	        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
	        for i, label in enumerate(labels):
	            # Leading space keeps the text from touching its marker.
	            ax.annotate(" " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
	        ax.set_title(title)

	        if len(reduced_data) >= 3:
	            arrow_style = dict(arrowstyle="->", color='green', lw=3)

	            # Arrow 1: from row 0 to row 1
	            start_point = (reduced_data[0, 0], reduced_data[0, 1])  # Starting point of the arrow
	            end_point = (reduced_data[1, 0], reduced_data[1, 1])  # Ending point of the arrow
	            ax.annotate('', xy=end_point, xytext=start_point, arrowprops=arrow_style)

	            # Arrow 2: from row 2 to the last row
	            start_point = (reduced_data[2, 0], reduced_data[2, 1])  # Starting point of the arrow
	            end_point = (reduced_data[-1, 0], reduced_data[-1, 1])  # Ending point of the arrow
	            ax.annotate('', xy=end_point, xytext=start_point, arrowprops=arrow_style)

	        ax.set_xlabel('Component 1')
	        ax.set_ylabel('Component 2')
	        ax.grid(True)
	        fig.savefig(cache)  # default dpi; dpi=300 was tried previously
	    finally:
	        plt.close(fig)

	# Text shown with the app: passed to gr.Interface(description=...) further down.
	# It uses Markdown-style markup (### heading, * bullets) and states the
	# formula that inference() computes: Word3 + (Word2 - Word1).
	description = """
	### Word Embedding Demo App
	Universidade Federal de São Paulo - Escola Paulista de Medicina

	The output is Word3 + (Word2 - Word1)

	Credits:
	* Gensim
	* Word2Vec
	"""

	# Gradio components. The three textboxes are the Interface inputs, in the
	# same order as inference()'s parameters (word1, word2, word3).
	Word1 = gr.Textbox()
	Word2 = gr.Textbox()
	Word3 = gr.Textbox()
	# NOTE(review): `label` is created but not referenced anywhere else in the
	# visible code (the Interface's only output is `sp`) -- confirm whether it
	# is still needed.
	label = gr.Label(show_label=True, label="Word4")
	# Image output; the Interface receives the file path returned by inference().
	sp = gr.Image()


	def inference(word1, word2, word3):
	    """Compute ``word3 + (word2 - word1)`` and plot it with its neighbours.

	    The three input words, up to six of the nearest words to the resulting
	    vector, and the resulting vector itself are projected to 2-D with PCA
	    and drawn by ``plot_reduced_data``.

	    Args:
	        word1: Word whose vector is subtracted.
	        word2: Word whose vector is added.
	        word3: Word the difference ``word2 - word1`` is applied to.

	    Returns:
	        The module-level ``cache`` path, where the plot image was saved.

	    Raises:
	        gr.Error: If any of the three words is missing from the embedding
	            model's vocabulary.
	    """
	    try:
	        vec1, vec2, vec3 = model[word1], model[word2], model[word3]
	    except KeyError as err:
	        # Surface a readable message in the UI instead of a bare KeyError.
	        raise gr.Error(f"Word not found in the embedding vocabulary: {err}") from err

	    transform = vec3 + vec2 - vec1
	    output = model.similar_by_vector(transform)
	    print(output)

	    # Rows 0, 1 and 2 must always be word1, word2 and word3 because
	    # plot_reduced_data draws its arrows between fixed row positions.
	    # Building ordered lists (rather than a dict keyed by word) keeps those
	    # positions stable even when the same word is entered twice.
	    word_list = [word1, word2, word3]
	    neighbours = [word for word, _ in output[:6] if word not in word_list]
	    word_list = word_list + neighbours

	    vectors = [model[word] for word in word_list]
	    vectors.append(transform)  # the computed vector is always the last row
	    labels = word_list + [word3 + " + (" + word2 + " - " + word1 + ")"]

	    data = np.stack(vectors, axis=0)
	    print(data.shape)
	    reduced_data_pca = reduce_dimensions(data, method='PCA')
	    print(reduced_data_pca.shape)
	    plot_reduced_data(reduced_data_pca, labels, 'PCA Results')
	    return cache

	# Example inputs offered in the UI; each row is [word1, word2, word3],
	# matching the order of the Interface inputs below.
	examples = [
	["woman", "man", "girl"],
	["woman", "man", "granddaughter"],
	["woman", "man", "aunt"],
	]

	# Wire the three textboxes to inference() and show the returned image path
	# in the `sp` image component.
	iface = gr.Interface(
	fn=inference,
	inputs=[Word1, Word2, Word3],
	outputs=sp,
	description=description,
	examples=examples
	)

	# Starts the app as soon as this module is executed (there is no
	# `if __name__ == "__main__"` guard).
	iface.launch()