|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import os |
|
|
import spacy |
|
|
import spacy.cli |
|
|
import numpy as np |
|
|
import zipfile |
|
|
import tempfile |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
from keras.preprocessing.text import text_to_word_sequence |
|
|
import openai |
|
|
import re |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# OpenAI credentials come from the 'API' environment variable; if it is unset,
# api_key is None and chat requests below will fail with an auth error.
openai.api_key = os.getenv('API')

# Load the medium English spaCy model, downloading it on first run.
# NOTE(review): `nlp` is not referenced anywhere else in this file — the
# similarity matching below uses TF-IDF, not spaCy. Confirm it is still needed.
try:
    nlp = spacy.load('en_core_web_md')
except IOError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')

# Vendor workbook filename -> preprocessed DataFrame; populated by
# load_vendor_files().
vendor_df_dict = {}

# Vendor workbook filename -> summed 'Trellis Score'; populated by
# process_file(). NOTE(review): module-level and never cleared, so totals
# persist/accumulate across successive process_file calls — confirm intended.
total_trellis_scores = {}
|
|
|
|
|
|
|
|
def preprocess_text(text):
    """Lowercase *text*, strip punctuation, and drop question-style stop words.

    Non-string input (e.g. NaN or numbers from a pandas column) is returned
    unchanged so the function can be applied safely over mixed-type columns.
    """
    if not isinstance(text, str):
        return text
    # Same character set that keras' text_to_word_sequence strips by default.
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    # Set for O(1) membership tests (the original list was scanned per word).
    stop_words = {'does', 'the', 'offer', 'do', 'you', 'require'}
    text = text.lower().translate(str.maketrans('', '', filters))
    # After the translate() above, the text contains none of the filter
    # characters (including \t and \n), so a plain whitespace split is
    # equivalent to the original keras text_to_word_sequence call, which
    # would only lowercase and strip the same characters a second time.
    return ' '.join(word for word in text.split() if word not in stop_words)
|
|
|
|
|
def semantic_search(query, data):
    """Return the index into *data* of the entry most similar to *query*.

    Both the query and every candidate are coerced to strings, normalised
    with preprocess_text, and compared via TF-IDF cosine similarity.
    """
    cleaned_query = preprocess_text(str(query))
    cleaned_data = [preprocess_text(str(item)) for item in data]
    tfidf_matrix = TfidfVectorizer().fit_transform([cleaned_query] + cleaned_data)
    # Row 0 is the query; score it against every row (itself included).
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).flatten()
    # Drop the query's self-similarity at position 0; the argmax of the
    # remaining entries is already a valid index into *data*.
    return np.argmax(similarities[1:])
|
|
|
|
|
def parse_score(score):
    """Convert a free-form score cell into a numeric value.

    Handles the shapes seen in the workbooks:
      * known level labels ('Level 3 - Intermediate', '4 - High', ...) via a
        lookup table;
      * strings containing a number (decimal comma tolerated) -> first number
        as a float;
      * numeric cells (read_excel often yields int/float directly) -> float.
        The original implementation fell through to 0 for these, silently
        zeroing valid numeric scores — that is the bug fixed here.

    None/blank (and NaN) fall back to 3, matching the '3 - Medium' default
    used when filling missing cells. Unparseable values return 0.
    """
    level_scores = {
        'Level 1 - Basic': 1,
        'Level 2 - Developing': 2,
        'Level 3 - Intermediate': 3,
        'Level 4 - Advanced': 4,
        'Level 5 - Leading': 5,
        '1 - Low': 1,
        '2 - Below average': 2,
        '3 - Average': 3,
        '4 - Above average': 4,
        '5 - High': 5,
        '1 - Very Low': 1,
        '2 - Low': 2,
        '3 - Medium': 3,
        '4 - High-Medium': 4,
        '5 - Very High': 5
    }

    # Missing value -> neutral midpoint.
    if score is None or str(score).strip() == '':
        return 3

    if isinstance(score, str):
        score = score.replace(',', '.')  # tolerate decimal-comma locales
        if score in level_scores:
            return level_scores[score]
        # Fall back to the first number embedded in the string, if any.
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
        return 0

    # Numeric cell. NaN compares unequal to itself; treat it as missing
    # rather than propagating NaN into the Trellis scores.
    if score != score:
        return 3
    try:
        return float(score)
    except (TypeError, ValueError):
        return 0  # preserve the original fallback for exotic types
|
|
|
|
|
def load_vendor_files(zip_file_path):
    """Populate the module-level vendor_df_dict from a zip of vendor .xlsx files.

    For each workbook: rows whose third column equals 'No' are dropped, the
    answer text (third column) is normalised with preprocess_text, missing
    scores (fifth column) default to '3 - Medium', and a numeric
    'score_vendor' column is appended.

    Parameters
    ----------
    zip_file_path : path to the zip archive of vendor .xlsx workbooks.
    """
    global vendor_df_dict
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Context manager guarantees cleanup even if a workbook fails to
        # parse; the original manual temp_dir.cleanup() would be skipped on
        # any exception, leaking the extracted files.
        with tempfile.TemporaryDirectory() as temp_dir:
            zip_ref.extractall(temp_dir)
            for file_name in os.listdir(temp_dir):
                if not file_name.endswith(".xlsx"):
                    continue
                df = pd.read_excel(os.path.join(temp_dir, file_name))
                df = df[df.iloc[:, 2] != 'No']
                df.iloc[:, 2] = df.iloc[:, 2].apply(preprocess_text)
                # Explicit assignment instead of inplace fillna on an .iloc
                # slice: the chained form may operate on a temporary copy and
                # never reach the frame.
                df.iloc[:, 4] = df.iloc[:, 4].fillna('3 - Medium')
                df['score_vendor'] = (
                    df.iloc[:, 4].apply(parse_score).astype(float).fillna(0)
                )
                vendor_df_dict[file_name] = df
|
|
|
|
|
|
|
|
|
|
|
def process_file(client_file):
    """Match a client requirements workbook against every loaded vendor workbook.

    Pipeline: load/normalise vendor files from 'vendor_files.zip' (located
    next to this script), normalise the client sheet, pair each client
    requirement with its most similar vendor answer via TF-IDF, multiply the
    two scores into a 'Trellis Score', write one sheet per vendor to
    'matches.xlsx', then ask GPT-3.5 for a summary of the top three vendors.

    Parameters
    ----------
    client_file : uploaded file object with a ``.name`` attribute (gradio).

    Returns
    -------
    tuple of (status/summary text, path to 'matches.xlsx' or None,
    top-three vendor name list or None).
    """
    # The vendor archive is expected to sit alongside this script.
    zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
    load_vendor_files(zip_file_path)
    client_df = pd.read_excel(client_file.name)

    # Default missing client scores (third column) to the midpoint label.
    # NOTE(review): inplace fillna on an .iloc slice is chained assignment —
    # pandas may apply it to a copy; confirm it actually mutates client_df.
    client_df.iloc[:, 2].fillna('3 - Medium', inplace=True)

    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)

    # Keep only requirements the client answered 'Yes' (second column), then
    # normalise the requirement text (first column) for TF-IDF matching.
    client_df = client_df[client_df.iloc[:, 1] == 'Yes']
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
    # Already float from the astype above; this cast is redundant but harmless.
    client_df['score_client'] = client_df['score_client'].astype(float)

    matches_found = False
    highest_score_vendor = None  # NOTE(review): tracked below but never returned
    highest_score = 0
    total_scores = {}  # NOTE(review): unused; totals go to the module-level dict

    with pd.ExcelWriter('matches.xlsx') as writer:
        common_list = []
        for vendor_file, vendor_df in vendor_df_dict.items():
            for index, row in client_df.iterrows():

                # Index (within vendor_df) of the vendor answer most similar
                # to this client requirement.
                # NOTE(review): row[0] indexes the Series by key 0, which for
                # string column labels falls back to positional access —
                # deprecated in recent pandas; row.iloc[0] would be explicit.
                # Confirm against the pinned pandas version.
                most_similar_index = semantic_search(row[0], vendor_df.iloc[:, 2])

                most_similar_row = vendor_df.iloc[most_similar_index, :]
                client_score = row['score_client']
                # Re-parse the matched vendor score cell (fifth column);
                # same positional-access caveat as row[0] above.
                vendor_score = parse_score(most_similar_row[4])

                # Side-by-side output row: selected client columns plus the
                # matched vendor's columns.
                client_row_selected = row[[0, 2, 3]]
                vendor_row_selected = most_similar_row[[0, 2, 4, 5]]
                combined_row = pd.concat([client_row_selected, vendor_row_selected])

                # Trellis score: client importance x vendor capability.
                trellis_score = client_score * vendor_score
                combined_row['Trellis Score'] = trellis_score
                common_list.append(combined_row)

                if trellis_score > highest_score:
                    highest_score = trellis_score
                    highest_score_vendor = vendor_file

            if common_list:
                common_df = pd.DataFrame(common_list)

                # Per-vendor total, accumulated in the module-level dict and
                # used for the ranking below.
                total_trellis_score = common_df['Trellis Score'].sum()
                total_trellis_scores[vendor_file] = total_trellis_score

                # Append a blank row carrying only the vendor's total score.
                common_df.loc[len(common_df.index)] = [np.nan]*len(common_df.columns)
                common_df.at[len(common_df.index)-1, 'Trellis Score'] = total_trellis_score

                # Excel caps sheet names at 31 characters.
                common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
                common_list = []  # reset for the next vendor
                matches_found = True

    # Rank vendors by total Trellis score; keep the three best, stripping the
    # file extension from each name.
    top_three_vendors = sorted(total_trellis_scores, key=total_trellis_scores.get, reverse=True)[:3]
    top_three_vendors = [vendor.split(".")[0] for vendor in top_three_vendors]

    if not matches_found:
        return "No matching data found.", None, None
    else:
        def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=3000, temperature=0.3):
            # One-shot chat completion; returns the reply text, or None on
            # any failure so the match results are still returned.
            # NOTE(review): this is the legacy pre-1.0 openai API surface
            # (openai.ChatCompletion) — confirm the pinned openai version.
            try:
                response = openai.ChatCompletion.create(
                    model=engine,
                    messages=[
                        {"role": "system", "content": "You are a helpful AI."},
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=max_tokens,
                    temperature=temperature
                )
                return response['choices'][0]['message']['content'].strip()

            except Exception as e:
                # Best-effort: log and degrade to a None summary rather than
                # aborting the whole run.
                print(f"Error in gpt3_query: {str(e)}")
                return None

        # NOTE(review): the prompt interpolates the entire raw
        # total_trellis_scores dict (filename keys, all vendors) into vendor
        # 1's 'Trellis Score' line — confirm this is intended rather than the
        # per-vendor value.
        summary = gpt3_query(f"""Based on the Trellis Score, the top three vendors are {', '.join(top_three_vendors)}.

We have analyzed the performance of several vendors and identified the top three based on their Trellis Scores. Here are the key data points for each:

1. [name vendor 1]
- Trellis Score: {total_trellis_scores}
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

2. [name vendor 2]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

3. [name vendor 3]
- Strengths: [brief summary of strengths]
- Weaknesses: [brief summary of weaknesses]
- Key Features: [brief summary of key features]

Based on this format, please provide a detailed breakdown of the strengths and weaknesses of each vendor, along with an overall comparison.

""")

        return f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}", os.path.abspath('matches.xlsx'), top_three_vendors
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: upload a client requirements workbook; receive the status/summary
# text, the generated 'matches.xlsx' for download, and the top-three vendors.
iface = gr.Interface(
    fn=process_file,
    inputs=[gr.components.File(label="Client File")],
    outputs=[
        gr.components.Textbox(label="Status"),
        gr.components.File(label="Download Match Results"),
        gr.components.Textbox(label="Top Three Vendors")
    ],
)

# Launch with defaults (local server); blocks until the app is stopped.
iface.launch()
|
|
|