# homework / app.py — Hugging Face Space file (author: cogcorp, commit 0bbc874 "Update app.py")
import gradio as gr
import pandas as pd
import os
import spacy
import spacy.cli
import numpy as np
import zipfile
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.text import text_to_word_sequence
import openai
import re
# --- Module-level setup -----------------------------------------------------

# OpenAI API key is read from the `API` environment variable.
openai.api_key = os.getenv('API')

# Load the medium English spaCy model, downloading it on first run.
try:
    nlp = spacy.load('en_core_web_md')
except IOError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

# Per-vendor DataFrames, keyed by spreadsheet file name (filled by load_vendor_files).
vendor_df_dict = {}
# A dictionary to store the total Trellis Score of each vendor.
# NOTE(review): module-level, so scores accumulate across successive uploads in
# the same process — confirm this is intended rather than per-request state.
total_trellis_scores = {}
def preprocess_text(text):
    """Normalize a free-text cell for TF-IDF matching.

    Lowercases, strips punctuation/whitespace control characters, and removes a
    small set of question-phrasing stop words ("does the vendor offer ...").
    Non-string inputs (NaN, numbers) are returned unchanged so pandas `.apply`
    can run over mixed columns safely.

    Args:
        text: Cell value; only `str` values are processed.

    Returns:
        The cleaned string, or `text` untouched when it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = text.lower()
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans('', '', filters))
    stop_words = ['does', 'the', 'offer', 'do', 'you', 'require']
    # The text is already lowercased and punctuation-free, so a plain
    # whitespace split is equivalent to keras's text_to_word_sequence here
    # (which would only re-apply the same filters and lowercasing).
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)
def semantic_search(query, data):
    """Find the entry in *data* most similar to *query*.

    Both the query and every document are coerced to strings, cleaned with
    `preprocess_text`, and embedded with a TF-IDF vectorizer fit on all of
    them together so they share one vocabulary.

    Args:
        query: The client requirement text.
        data: Iterable of candidate vendor texts.

    Returns:
        Integer position (into *data*) of the best cosine-similarity match.
    """
    cleaned_query = preprocess_text(str(query))
    cleaned_docs = [preprocess_text(str(doc)) for doc in data]
    matrix = TfidfVectorizer().fit_transform([cleaned_query] + cleaned_docs)
    # Row 0 is the query; compare it against every row (itself included).
    similarities = cosine_similarity(matrix[0:1], matrix).flatten()
    # Drop index 0 (the query's self-similarity) so the argmax indexes *data*.
    return np.argmax(similarities[1:])
def parse_score(score):
    """Convert a spreadsheet rating cell into a numeric score.

    Accepts the known "Level N - ..." / "N - ..." label formats, bare numbers
    (as str, int, or float), and strings containing a number (decimal commas
    are normalized to dots).

    Args:
        score: Raw cell value.

    Returns:
        int or float score; 3 (midpoint) for missing/blank cells; 0 when no
        number can be extracted (including NaN).
    """
    level_scores = {
        'Level 1 - Basic': 1,
        'Level 2 - Developing': 2,
        'Level 3 - Intermediate': 3,
        'Level 4 - Advanced': 4,
        'Level 5 - Leading': 5,
        '1 - Low': 1,
        '2 - Below average': 2,
        '3 - Average': 3,
        '4 - Above average': 4,
        '5 - High': 5,
        '1 - Very Low': 1,
        '2 - Low': 2,
        '3 - Medium': 3,
        '4 - High-Medium': 4,
        '5 - Very High': 5
    }
    # Missing/blank cells default to the midpoint score.
    if score is None or str(score).strip() == '':
        return 3
    # Bug fix: numeric cells (e.g. a plain 4) previously fell through every
    # branch and scored 0. NaN (NaN != NaN) still scores 0 as before.
    if isinstance(score, (int, float)):
        return 0 if score != score else float(score)
    if isinstance(score, str):
        score = score.replace(',', '.')  # normalize decimal comma
        if score in level_scores:
            return level_scores[score]
        # Fall back to the first number embedded in the string.
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
    return 0
def load_vendor_files(zip_file_path):
    """Extract vendor .xlsx files from a zip and populate `vendor_df_dict`.

    For each vendor sheet: rows answering 'No' in the 3rd column are dropped,
    the 3rd column is text-normalized for matching, missing scores in the 5th
    column default to '3 - Medium', and a numeric `score_vendor` column is
    derived via `parse_score`.

    Args:
        zip_file_path: Path to a zip archive of vendor .xlsx spreadsheets.

    Side effects:
        Mutates the module-level `vendor_df_dict` (existing keys are updated).
    """
    global vendor_df_dict
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Context manager guarantees cleanup even if a read below raises
        # (the original leaked the temp dir on error).
        with tempfile.TemporaryDirectory() as temp_dir:
            zip_ref.extractall(temp_dir)
            for file_name in os.listdir(temp_dir):
                if not file_name.endswith(".xlsx"):
                    continue
                df = pd.read_excel(os.path.join(temp_dir, file_name))
                # Exclude capabilities the vendor explicitly does not provide.
                df = df[df.iloc[:, 2] != 'No']
                df.iloc[:, 2] = df.iloc[:, 2].apply(preprocess_text)
                # fillna(..., inplace=True) on an iloc slice mutates a copy in
                # modern pandas (silent no-op) — assign back explicitly.
                df.iloc[:, 4] = df.iloc[:, 4].fillna('3 - Medium')
                df['score_vendor'] = (
                    df.iloc[:, 4].apply(parse_score).astype(float).fillna(0)
                )
                vendor_df_dict[file_name] = df
def process_file(client_file):
    """Match a client requirements spreadsheet against every vendor file.

    For each client requirement (rows marked 'Yes'), finds the most similar
    vendor capability via TF-IDF search, multiplies the client and vendor
    scores into a per-row "Trellis Score", writes one sheet per vendor to
    'matches.xlsx' (with a total-score row), and asks GPT-3.5-turbo for a
    summary of the top three vendors.

    Args:
        client_file: Uploaded file object (Gradio) with a `.name` path to an
            .xlsx spreadsheet.

    Returns:
        (status_message, path_to_matches_xlsx_or_None, top_three_vendors_or_None)
    """
    zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
    load_vendor_files(zip_file_path)  # Load vendor files from zip file

    client_df = pd.read_excel(client_file.name)
    # Assign back instead of inplace fillna on an iloc slice, which mutates a
    # copy (silent no-op) in modern pandas.
    client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)
    # Keep only requirements the client actually wants (2nd column == 'Yes').
    client_df = client_df[client_df.iloc[:, 1] == 'Yes']
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)

    matches_found = False  # Flag to check if any matches were found
    with pd.ExcelWriter('matches.xlsx') as writer:
        common_list = []
        for vendor_file, vendor_df in vendor_df_dict.items():
            for index, row in client_df.iterrows():
                # .iloc for positional access — integer labels on a
                # string-indexed Series (row[0]) are deprecated/ambiguous.
                most_similar_index = semantic_search(row.iloc[0], vendor_df.iloc[:, 2])
                most_similar_row = vendor_df.iloc[most_similar_index, :]
                client_score = row['score_client']  # pre-parsed client score
                vendor_score = parse_score(most_similar_row.iloc[4])
                client_row_selected = row.iloc[[0, 2, 3]]
                vendor_row_selected = most_similar_row.iloc[[0, 2, 4, 5]]
                combined_row = pd.concat([client_row_selected, vendor_row_selected])
                # Trellis Score = client importance x vendor capability.
                trellis_score = client_score * vendor_score
                combined_row['Trellis Score'] = trellis_score
                common_list.append(combined_row)
            if common_list:
                common_df = pd.DataFrame(common_list)
                # Compute the total Trellis Score for this vendor.
                total_trellis_score = common_df['Trellis Score'].sum()
                total_trellis_scores[vendor_file] = total_trellis_score
                # Append a blank row carrying only the total Trellis Score.
                common_df.loc[len(common_df.index)] = [np.nan] * len(common_df.columns)
                common_df.at[len(common_df.index) - 1, 'Trellis Score'] = total_trellis_score
                # Excel sheet names are limited to 31 characters.
                common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
                common_list = []  # Reset for the next vendor file
                matches_found = True

    # NOTE(review): total_trellis_scores is module-level, so rankings include
    # results from earlier uploads in the same process — confirm intended.
    top_three_vendors = sorted(total_trellis_scores, key=total_trellis_scores.get, reverse=True)[:3]
    top_three_vendors = [vendor.split(".")[0] for vendor in top_three_vendors]

    if not matches_found:  # In case there were no matches
        return "No matching data found.", None, None

    def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=3000, temperature=0.3):
        """Send one chat prompt to OpenAI; return the reply text or None on error."""
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=[
                    {"role": "system", "content": "You are a helpful AI."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=temperature
            )
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            # Best-effort: the match results are still returned without a summary.
            print(f"Error in gpt3_query: {str(e)}")
            return None

    # Get GPT-3.5-turbo to create a summary text
    summary = gpt3_query(f"""Based on the Trellis Score, the top three vendors are {', '.join(top_three_vendors)}.
We have analyzed the performance of several vendors and identified the top three based on their Trellis Scores. Here are the key data points for each:
1. [name vendor 1]
- Trellis Score: {total_trellis_scores}
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
2. [name vendor 2]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
3. [name vendor 3]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]
Based on this format, please provide a detailed breakdown of the strengths and weaknesses of each vendor, along with an overall comparison.
""")
    return f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}", os.path.abspath('matches.xlsx'), top_three_vendors
# Gradio UI: one file upload in, a status/summary text, the generated
# matches.xlsx for download, and the top-three vendor list out.
iface = gr.Interface(
    fn=process_file,
    inputs=[gr.components.File(label="Client File")],
    outputs=[
        gr.components.Textbox(label="Status"),
        gr.components.File(label="Download Match Results"),
        gr.components.Textbox(label="Top Three Vendors")
    ],
)
iface.launch()