# homework / app.py — Hugging Face Space file (author: cogcorp, commit 0bbc874 "Update app.py")
import gradio as gr
import pandas as pd
import os
import spacy
import spacy.cli
import numpy as np
import zipfile
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.text import text_to_word_sequence
import openai
import re
# --- Module-level setup -----------------------------------------------------

# OpenAI API key is read from the `API` environment variable.
openai.api_key = os.getenv('API')

# Load the medium English spaCy model, downloading it on first run.
try:
    nlp = spacy.load('en_core_web_md')
except IOError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

# Per-vendor DataFrames, keyed by spreadsheet file name (filled by load_vendor_files).
vendor_df_dict = {}
# A dictionary to store the total Trellis Score of each vendor.
# NOTE(review): module-level, so scores accumulate across successive uploads in
# the same process — confirm this is intended rather than per-request state.
total_trellis_scores = {}
def preprocess_text(text):
    """Normalize a free-text cell for TF-IDF matching.

    Lowercases, strips punctuation/whitespace control characters, and removes a
    small set of question-phrasing stop words ("does the vendor offer ...").
    Non-string inputs (NaN, numbers) are returned unchanged so pandas `.apply`
    can run over mixed columns safely.

    Args:
        text: Cell value; only `str` values are processed.

    Returns:
        The cleaned string, or `text` untouched when it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = text.lower()
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans('', '', filters))
    stop_words = ['does', 'the', 'offer', 'do', 'you', 'require']
    # The text is already lowercased and punctuation-free, so a plain
    # whitespace split is equivalent to keras's text_to_word_sequence here
    # (which would only re-apply the same filters and lowercasing).
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)
def semantic_search(query, data):
    """Find the entry in *data* most similar to *query*.

    Both the query and every document are coerced to strings, cleaned with
    `preprocess_text`, and embedded with a TF-IDF vectorizer fit on all of
    them together so they share one vocabulary.

    Args:
        query: The client requirement text.
        data: Iterable of candidate vendor texts.

    Returns:
        Integer position (into *data*) of the best cosine-similarity match.
    """
    cleaned_query = preprocess_text(str(query))
    cleaned_docs = [preprocess_text(str(doc)) for doc in data]
    matrix = TfidfVectorizer().fit_transform([cleaned_query] + cleaned_docs)
    # Row 0 is the query; compare it against every row (itself included).
    similarities = cosine_similarity(matrix[0:1], matrix).flatten()
    # Drop index 0 (the query's self-similarity) so the argmax indexes *data*.
    return np.argmax(similarities[1:])
def parse_score(score):
    """Convert a spreadsheet rating cell into a numeric score.

    Accepts the known "Level N - ..." / "N - ..." label formats, bare numbers
    (as str, int, or float), and strings containing a number (decimal commas
    are normalized to dots).

    Args:
        score: Raw cell value.

    Returns:
        int or float score; 3 (midpoint) for missing/blank cells; 0 when no
        number can be extracted (including NaN).
    """
    level_scores = {
        'Level 1 - Basic': 1,
        'Level 2 - Developing': 2,
        'Level 3 - Intermediate': 3,
        'Level 4 - Advanced': 4,
        'Level 5 - Leading': 5,
        '1 - Low': 1,
        '2 - Below average': 2,
        '3 - Average': 3,
        '4 - Above average': 4,
        '5 - High': 5,
        '1 - Very Low': 1,
        '2 - Low': 2,
        '3 - Medium': 3,
        '4 - High-Medium': 4,
        '5 - Very High': 5
    }
    # Missing/blank cells default to the midpoint score.
    if score is None or str(score).strip() == '':
        return 3
    # Bug fix: numeric cells (e.g. a plain 4) previously fell through every
    # branch and scored 0. NaN (NaN != NaN) still scores 0 as before.
    if isinstance(score, (int, float)):
        return 0 if score != score else float(score)
    if isinstance(score, str):
        score = score.replace(',', '.')  # normalize decimal comma
        if score in level_scores:
            return level_scores[score]
        # Fall back to the first number embedded in the string.
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
    return 0
def load_vendor_files(zip_file_path):
    """Extract vendor .xlsx files from a zip and populate `vendor_df_dict`.

    For each vendor sheet: rows answering 'No' in the 3rd column are dropped,
    the 3rd column is text-normalized for matching, missing scores in the 5th
    column default to '3 - Medium', and a numeric `score_vendor` column is
    derived via `parse_score`.

    Args:
        zip_file_path: Path to a zip archive of vendor .xlsx spreadsheets.

    Side effects:
        Mutates the module-level `vendor_df_dict` (existing keys are updated).
    """
    global vendor_df_dict
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Context manager guarantees cleanup even if a read below raises
        # (the original leaked the temp dir on error).
        with tempfile.TemporaryDirectory() as temp_dir:
            zip_ref.extractall(temp_dir)
            for file_name in os.listdir(temp_dir):
                if not file_name.endswith(".xlsx"):
                    continue
                df = pd.read_excel(os.path.join(temp_dir, file_name))
                # Exclude capabilities the vendor explicitly does not provide.
                df = df[df.iloc[:, 2] != 'No']
                df.iloc[:, 2] = df.iloc[:, 2].apply(preprocess_text)
                # fillna(..., inplace=True) on an iloc slice mutates a copy in
                # modern pandas (silent no-op) — assign back explicitly.
                df.iloc[:, 4] = df.iloc[:, 4].fillna('3 - Medium')
                df['score_vendor'] = (
                    df.iloc[:, 4].apply(parse_score).astype(float).fillna(0)
                )
                vendor_df_dict[file_name] = df
def process_file(client_file):
    """Match a client requirements spreadsheet against every vendor file.

    For each client requirement (rows marked 'Yes'), finds the most similar
    vendor capability via TF-IDF search, multiplies the client and vendor
    scores into a per-row "Trellis Score", writes one sheet per vendor to
    'matches.xlsx' (with a total-score row), and asks GPT-3.5-turbo for a
    summary of the top three vendors.

    Args:
        client_file: Uploaded file object (Gradio) with a `.name` path to an
            .xlsx spreadsheet.

    Returns:
        (status_message, path_to_matches_xlsx_or_None, top_three_vendors_or_None)
    """
    zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
    load_vendor_files(zip_file_path)  # Load vendor files from zip file

    client_df = pd.read_excel(client_file.name)
    # Assign back instead of inplace fillna on an iloc slice, which mutates a
    # copy (silent no-op) in modern pandas.
    client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)
    # Keep only requirements the client actually wants (2nd column == 'Yes').
    client_df = client_df[client_df.iloc[:, 1] == 'Yes']
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)

    matches_found = False  # Flag to check if any matches were found
    with pd.ExcelWriter('matches.xlsx') as writer:
        common_list = []
        for vendor_file, vendor_df in vendor_df_dict.items():
            for index, row in client_df.iterrows():
                # .iloc for positional access — integer labels on a
                # string-indexed Series (row[0]) are deprecated/ambiguous.
                most_similar_index = semantic_search(row.iloc[0], vendor_df.iloc[:, 2])
                most_similar_row = vendor_df.iloc[most_similar_index, :]
                client_score = row['score_client']  # pre-parsed client score
                vendor_score = parse_score(most_similar_row.iloc[4])
                client_row_selected = row.iloc[[0, 2, 3]]
                vendor_row_selected = most_similar_row.iloc[[0, 2, 4, 5]]
                combined_row = pd.concat([client_row_selected, vendor_row_selected])
                # Trellis Score = client importance x vendor capability.
                trellis_score = client_score * vendor_score
                combined_row['Trellis Score'] = trellis_score
                common_list.append(combined_row)
            if common_list:
                common_df = pd.DataFrame(common_list)
                # Compute the total Trellis Score for this vendor.
                total_trellis_score = common_df['Trellis Score'].sum()
                total_trellis_scores[vendor_file] = total_trellis_score
                # Append a blank row carrying only the total Trellis Score.
                common_df.loc[len(common_df.index)] = [np.nan] * len(common_df.columns)
                common_df.at[len(common_df.index) - 1, 'Trellis Score'] = total_trellis_score
                # Excel sheet names are limited to 31 characters.
                common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
                common_list = []  # Reset for the next vendor file
                matches_found = True

    # NOTE(review): total_trellis_scores is module-level, so rankings include
    # results from earlier uploads in the same process — confirm intended.
    top_three_vendors = sorted(total_trellis_scores, key=total_trellis_scores.get, reverse=True)[:3]
    top_three_vendors = [vendor.split(".")[0] for vendor in top_three_vendors]

    if not matches_found:  # In case there were no matches
        return "No matching data found.", None, None

    def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=3000, temperature=0.3):
        """Send one chat prompt to OpenAI; return the reply text or None on error."""
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are a helpful AI."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=temperature
            )
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort: the match results are still returned without a summary.
            print(f"Error in gpt3_query: {str(e)}")
            return None

    # Get GPT-3.5-turbo to create a summary text
    summary = gpt3_query(f"""Based on the Trellis Score, the top three vendors are {', '.join(top_three_vendors)}.
We have analyzed the performance of several vendors and identified the top three based on their Trellis Scores. Here are the key data points for each:
1. [name vendor 1]
- Trellis Score: {total_trellis_scores}
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
2. [name vendor 2]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
3. [name vendor 3]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
Based on this format, please provide a detailed breakdown of the strengths and weaknesses of each vendor, along with an overall comparison.
""")
    return f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}", os.path.abspath('matches.xlsx'), top_three_vendors
# Gradio UI: one file upload in, a status/summary text, the generated
# matches.xlsx for download, and the top-three vendor list out.
iface = gr.Interface(
    fn=process_file,
    inputs=[gr.components.File(label="Client File")],
    outputs=[
        gr.components.Textbox(label="Status"),
        gr.components.File(label="Download Match Results"),
        gr.components.Textbox(label="Top Three Vendors")
    ],
)
iface.launch()