Spaces:
Build error
Build error
| import pandas as pd | |
| from numpy import floor | |
| #--- gensim --- | |
| from nltk.tokenize import word_tokenize | |
| from gensim.models.doc2vec import Doc2Vec, TaggedDocument | |
def conf_level(val):
    """Translate a probability value into a plain-English statement.

    Probability bands follow ICD 203 analytic standards:
    https://www.dni.gov/files/documents/ICD/ICD%20203%20Analytic%20Standards.pdf

    Parameters
    ----------
    val : float
        Probability, expected in [0, 1].

    Returns
    -------
    str
        The confidence label, or "undefined" when ``val`` is not
        comparable (e.g. NaN fails every comparison below).
    """
    # Each branch only needs an upper bound: earlier branches already
    # exclude smaller values, so the original's redundant
    # `val >= lo and` checks are dropped without changing behavior.
    if val < 0.05:
        return "Extremely Low Probability"
    elif val < 0.20:
        return "Very Low Probability"
    elif val < 0.45:
        return "Low Probability"
    elif val < 0.55:
        return "Middling Probability"
    elif val < 0.80:
        return "High Probability"
    elif val < 0.95:
        return "Very High Probability"
    elif val >= 0.95:
        return "Extremely High Probability"
    # Reachable only when every comparison is False (NaN input),
    # matching the original's "undefined" initial value.
    return "undefined"
def subsample_df(df=None, size=10, sample_type="Random Sample"):
    """Subsample the dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'probability' column for the non-random modes.
    size : int
        Number of rows requested.
    sample_type : str
        One of "Random Sample", "Highest Probabilities",
        "Lowest Probabilities"; anything else samples rows whose
        probability lies strictly between 0.45 and 0.55.

    Returns
    -------
    pd.DataFrame
        The sampled rows. The input frame is never mutated (the
        original sorted with inplace=True as a hidden side effect).
    """
    size = int(size)
    if sample_type == "Random Sample":
        return df.sample(size)
    elif sample_type == "Highest Probabilities":
        # nlargest returns the top rows sorted descending without
        # touching the caller's frame.
        return df.nlargest(size, "probability")
    elif sample_type == "Lowest Probabilities":
        return df.nsmallest(size, "probability")
    else:
        # sample probabilities in the middle band
        mid = df[(df["probability"] > 0.45) & (df["probability"] < 0.55)]
        # cap at the band's population so .sample never raises
        return mid.sample(min(size, len(mid)))
def down_samp(embedding):
    """Down-sample a dataframe to fit Altair's ~5000-row limit.

    Groups are sampled proportionally by 'name' so their relative
    representation is kept; every row of the user's own sentences
    ('Your Sentences') is always retained.

    Parameters
    ----------
    embedding : pd.DataFrame
        Must contain 'name', 'sentiment', 'source' and 'sort_order'
        columns.

    Returns
    -------
    pd.DataFrame
        The down-sampled rows, sorted ascending by 'sort_order'
        (the sort order affects how Altair draws the layers).
    """
    # total number of rows per (name, sentiment) pair
    total_size = embedding.groupby(['name', 'sentiment'], as_index=False).count()
    user_data = 0
    # Exact equality test instead of the original substring match on
    # str(Series), which could false-positive on names containing
    # "Your Sentences".
    if (total_size['name'] == 'Your Sentences').any():
        counts = embedding.groupby(['name'], as_index=False).count()
        # .iloc[0] instead of int(Series): int() on a Series is
        # deprecated and fails on newer pandas.
        user_data = int(
            counts.loc[counts['name'] == 'Your Sentences', 'source'].iloc[0]
        )
    max_sample = total_size.groupby('name').max()['source']
    # down sample to meet Altair's max row count
    # but keep the proportional representation of groups
    down_samp = 1 / (sum(max_sample) / (5000 - user_data))
    max_samp = max_sample.apply(lambda x: floor(x * down_samp)).astype(int).to_dict()
    max_samp['Your Sentences'] = user_data
    # Sample each name group; cap n at the group size so .sample never
    # raises when the data is already smaller than the budget.
    embedding = embedding.groupby('name').apply(
        lambda x: x.sample(n=min(len(x), max_samp.get(x.name)))
    ).reset_index(drop=True)
    return embedding.sort_values(['sort_order'], ascending=True)
def prep_embed_data(data, model):
    """Tokenize and tag each document, then infer its vector embedding.

    Parameters
    ----------
    data : iterable of str
        The raw sentences/documents to embed.
    model : object
        A model exposing ``infer_vector(words)`` (e.g. gensim Doc2Vec).

    Returns
    -------
    list
        One inferred vector per input document, in input order.
    """
    tagged = [
        TaggedDocument(words=word_tokenize(text.lower()), tags=[str(pos)])
        for pos, text in enumerate(data)
    ]
    return [model.infer_vector(doc.words) for doc in tagged]
def prep_sentence_embedding(name, source, sentence, sentiment, sort_order, embed_model, idx, type="single"):
    """Prepare custom sentence data as a dataframe row (or rows) for the embedding.

    When ``type`` is "single", ``sentence``/``sentiment`` are scalars
    and a one-row frame indexed by ``idx`` is returned. Otherwise they
    are sequences and one row per sentence is produced.
    """
    if type == "single":
        # embed the lone sentence via the model's inference step
        tagged = TaggedDocument(words=word_tokenize(sentence.lower()), tags=['source'])
        vec = embed_model.infer_vector(tagged.words)
        row = {
            'source': source,
            'name': name,
            'sort_order': sort_order,
            'sentence': sentence,
            'sentiment': sentiment,
            'x': vec[0],
            'y': vec[1],
        }
        return pd.DataFrame(row, index=[idx])
    # group mode: embed every sentence, then assemble the columns directly
    vectors = prep_embed_data(sentence, embed_model)
    count = len(sentence)
    frame = pd.DataFrame({
        'source': [source] * count,
        'name': [name] * count,
        'sentence': [sentence[i] for i in range(count)],
        'sentiment': [sentiment[i] for i in range(count)],
        'x': [vectors[i][0] for i in range(count)],
        'y': [vectors[i][1] for i in range(count)],
        # the sort order affects how Altair draws the layers
        'sort_order': [sort_order] * count,
    })
    return frame