File size: 8,748 Bytes
8d19c24
 
 
 
e62ff42
8d19c24
30e35ef
 
8d19c24
 
 
30e35ef
 
20c55ff
 
30e35ef
920e566
d5df629
30e35ef
e62ff42
 
 
 
 
 
 
8d19c24
30e35ef
 
 
 
 
8d19c24
 
 
 
 
 
 
 
 
 
 
 
 
30e35ef
 
 
8d19c24
30e35ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00215d0
30e35ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95cbc00
30e35ef
95cbc00
30e35ef
 
 
 
95cbc00
 
30e35ef
 
 
 
b59aa9a
 
 
30e35ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bbc874
 
 
30e35ef
 
 
 
0bbc874
30e35ef
 
 
 
 
 
 
 
 
 
 
0bbc874
30e35ef
 
 
 
 
0bbc874
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30e35ef
 
 
 
 
 
 
 
 
0bbc874
30e35ef
 
00215d0
8d19c24
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import gradio as gr
import pandas as pd
import os
import spacy
import spacy.cli
import numpy as np
import zipfile
import tempfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from keras.preprocessing.text import text_to_word_sequence
import openai
import re


# The OpenAI API key is read from the 'API' environment variable (may be
# None if unset; calls will then fail at request time).
openai.api_key = os.getenv('API')

# Load the medium English spaCy model; download it on first use.
# NOTE(review): `nlp` is never referenced in this file's visible code —
# confirm whether the spaCy load is still needed.
try:
    nlp = spacy.load('en_core_web_md')
except IOError:
    print("Model not found. Downloading...")
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load('en_core_web_md')


# Maps vendor workbook filename -> preprocessed DataFrame
# (populated by load_vendor_files).
vendor_df_dict = {}

# A dictionary to store the total Trellis Score of each vendor
# (filled in by process_file; keyed by vendor filename).
total_trellis_scores = {}


def preprocess_text(text):
    """Normalise a text cell for TF-IDF matching.

    Lower-cases the string, strips punctuation/whitespace control chars,
    tokenises it, and drops a small set of question stop words. Non-string
    values (e.g. NaN or numeric cells) are returned unchanged.
    """
    if not isinstance(text, str):
        # Pass non-string cells through untouched.
        return text

    punct = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    stop_words = ('does', 'the', 'offer', 'do', 'you', 'require')
    cleaned = text.lower().translate(str.maketrans('', '', punct))
    tokens = text_to_word_sequence(cleaned)
    return ' '.join(tok for tok in tokens if tok not in stop_words)

def semantic_search(query, data):
    """Return the index into *data* of the entry most similar to *query*.

    Both the query and every data entry are stringified and preprocessed,
    then vectorised with TF-IDF; similarity is cosine similarity.
    """
    cleaned_query = preprocess_text(str(query))
    cleaned_data = [preprocess_text(str(item)) for item in data]
    tfidf = TfidfVectorizer().fit_transform([cleaned_query] + cleaned_data)
    similarities = cosine_similarity(tfidf[0:1], tfidf).flatten()
    # similarities[0] is the query against itself; skipping it makes the
    # argmax position line up with indices into *data*.
    return np.argmax(similarities[1:])

def parse_score(score):
    """Convert a score cell value into a numeric score.

    Handles, in order:
      * None / blank strings        -> neutral default 3
      * numeric cells (int/float)   -> returned as float (NaN counts as blank)
      * known level labels          -> mapped 1..5
      * other strings               -> first number found (decimal commas OK)
      * anything else               -> 0
    """
    level_scores = {
        'Level 1 - Basic': 1,
        'Level 2 - Developing': 2,
        'Level 3 - Intermediate': 3,
        'Level 4 - Advanced': 4,
        'Level 5 - Leading': 5,
        '1 - Low': 1,
        '2 - Below average': 2,
        '3 - Average': 3,
        '4 - Above average': 4,
        '5 - High': 5,
        '1 - Very Low': 1,
        '2 - Low': 2,
        '3 - Medium': 3,
        '4 - High-Medium': 4,
        '5 - Very High': 5
    }

    # Missing / blank cells default to a neutral "medium" score.
    if score is None or str(score).strip() == '':
        return 3

    # Numeric cells pass through directly. (The original implementation fell
    # through to `return 0` here, zeroing out perfectly valid numeric scores
    # read from Excel.) Booleans are excluded so True/False keep scoring 0.
    if isinstance(score, (int, float)) and not isinstance(score, bool):
        if score != score:  # NaN is the only value unequal to itself
            return 3  # treat NaN like a blank cell
        return float(score)

    if isinstance(score, str):
        score = score.replace(',', '.')  # normalise decimal commas
        if score in level_scores:
            return level_scores[score]
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
    return 0

def load_vendor_files(zip_file_path):
    """Populate the global vendor_df_dict from .xlsx workbooks in a zip file.

    Each workbook is normalised for matching:
      * rows answering 'No' in the 3rd column are dropped,
      * the 3rd column (question text) is preprocessed,
      * the 5th column (score) has blanks defaulted to '3 - Medium' and is
        parsed into a numeric 'score_vendor' column (NaN -> 0).
    """
    global vendor_df_dict
    # Context managers guarantee the archive handle is closed and the temp
    # directory removed even if reading a workbook raises; the original only
    # cleaned up on the success path.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref, \
            tempfile.TemporaryDirectory() as temp_dir:
        zip_ref.extractall(temp_dir)
        for file_name in os.listdir(temp_dir):
            if not file_name.endswith(".xlsx"):
                continue
            df = pd.read_excel(os.path.join(temp_dir, file_name))
            # Exclude rows with 'No' in the 3rd column.
            df = df[df.iloc[:, 2] != 'No']
            df.iloc[:, 2] = df.iloc[:, 2].apply(preprocess_text)
            # Assign instead of fillna(inplace=True) on an iloc slice: the
            # inplace form operates on a temporary under modern pandas and
            # can silently be a no-op.
            df.iloc[:, 4] = df.iloc[:, 4].fillna('3 - Medium')
            df['score_vendor'] = (
                df.iloc[:, 4].apply(parse_score).astype(float).fillna(0)
            )
            vendor_df_dict[file_name] = df



def process_file(client_file):
    """Gradio entry point: match a client requirements workbook against every
    vendor workbook bundled in 'vendor_files.zip'.

    For each vendor, every client requirement is paired with its most similar
    vendor row (TF-IDF cosine similarity); a per-row Trellis Score
    (client score * vendor score) is written to 'matches.xlsx' (one sheet per
    vendor), and GPT-3.5 is asked to summarise the top three vendors.

    Returns (status_text, path_to_matches_xlsx, top_three_vendors), or
    ("No matching data found.", None, None) when nothing matched.
    """
    # Vendor workbooks ship next to this script in a fixed zip archive.
    zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
    load_vendor_files(zip_file_path)  # Load vendor files from zip file
    client_df = pd.read_excel(client_file.name)

    # Blank client scores default to the neutral '3 - Medium' level.
    client_df.iloc[:, 2].fillna('3 - Medium', inplace=True)

    # Parse the textual client score into a number.
    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)

    # Keep only requirements the client answered 'Yes' to, and normalise
    # the requirement text for TF-IDF matching.
    client_df = client_df[client_df.iloc[:, 1] == 'Yes']
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
    client_df['score_client'] = client_df['score_client'].astype(float)

    matches_found = False  # Flag to check if any matches were found
    # NOTE(review): highest_score / highest_score_vendor are computed below
    # but never used in the returned value — confirm whether they are needed.
    highest_score_vendor = None
    highest_score = 0
    # NOTE(review): total_scores is never written to or read; totals actually
    # go into the module-level total_trellis_scores, which is never reset and
    # therefore accumulates across repeated invocations — verify intent.
    total_scores = {} # dictionary to store the total Trellis Score for each vendor

    with pd.ExcelWriter('matches.xlsx') as writer:
        common_list = []
        for vendor_file, vendor_df in vendor_df_dict.items():
            for index, row in client_df.iterrows():

                # NOTE(review): row[0] / most_similar_row[4] are positional
                # lookups on a labelled Series — presumably columns 0 and 4;
                # confirm against the workbook layout.
                most_similar_index = semantic_search(row[0], vendor_df.iloc[:, 2])

                most_similar_row = vendor_df.iloc[most_similar_index, :]
                client_score = row['score_client'] # directly access the pre-parsed client score
                vendor_score = parse_score(most_similar_row[4])

                # Combine selected client columns (0,2,3) with selected
                # vendor columns (0,2,4,5) into one output row.
                client_row_selected = row[[0, 2, 3]]
                vendor_row_selected = most_similar_row[[0, 2, 4, 5]]
                combined_row = pd.concat([client_row_selected, vendor_row_selected])

                trellis_score = client_score * vendor_score
                combined_row['Trellis Score'] = trellis_score
                common_list.append(combined_row)

                if trellis_score > highest_score:
                    highest_score = trellis_score
                    highest_score_vendor = vendor_file

            if common_list:
                common_df = pd.DataFrame(common_list)
                # Compute the total Trellis Score
                total_trellis_score = common_df['Trellis Score'].sum()
                total_trellis_scores[vendor_file] = total_trellis_score # store the total score for each vendor
                # Add a row with the total Trellis Score to the DataFrame
                common_df.loc[len(common_df.index)] = [np.nan]*len(common_df.columns)
                common_df.at[len(common_df.index)-1, 'Trellis Score'] = total_trellis_score

                # Sheet name is the vendor filename truncated to 31 chars
                # (Excel's sheet-name length limit).
                common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
                common_list = []  # Reset the common_list for next vendor_file
                matches_found = True  # Set the flag to True as matches were found

        # Rank vendors by total Trellis Score and keep the best three
        # (extension stripped for display).
        top_three_vendors = sorted(total_trellis_scores, key=total_trellis_scores.get, reverse=True)[:3]
        top_three_vendors = [vendor.split(".")[0] for vendor in top_three_vendors]


        if not matches_found:  # In case there were no matches
            return "No matching data found.", None, None
        else:
            def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=3000, temperature=0.3):
                """Send a single chat prompt to OpenAI; returns the reply text,
                or None if the request fails (error is printed, not raised)."""
                try: 
                    response = openai.ChatCompletion.create(
                        model=engine,
                        messages=[
                            {"role": "system", "content": "You are a helpful AI."},
                            {"role": "user", "content": prompt}
                        ],
                        max_tokens=max_tokens,
                        temperature=temperature
                    )
                    return response['choices'][0]['message']['content'].strip()
                
                except Exception as e:
                    print(f"Error in gpt3_query: {str(e)}")
                    return None

            # Get GPT-3.5-turbo to create a summary text.
            # NOTE(review): {total_trellis_scores} interpolates the entire
            # dict (all vendors), not just the top vendor's score — confirm.
            summary = gpt3_query(f"""Based on the Trellis Score, the top three vendors are {', '.join(top_three_vendors)}. 

We have analyzed the performance of several vendors and identified the top three based on their Trellis Scores. Here are the key data points for each:

1. [name vendor 1] 
    - Trellis Score: {total_trellis_scores}
    - Strengths: [brief summary of strengths]
    - Weaknesses: [brief summary of weaknesses]
    - Key Features: [brief summary of key features]

2. [name vendor 2]
    - Strengths: [brief summary of strengths]
    - Weaknesses: [brief summary of weaknesses]
    - Key Features: [brief summary of key features]

3. [name vendor 3]
    - Strengths: [brief summary of strengths]
    - Weaknesses: [brief summary of weaknesses]
    - Key Features: [brief summary of key features]

Based on this format, please provide a detailed breakdown of the strengths and weaknesses of each vendor, along with an overall comparison.
""")


            return f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}", os.path.abspath('matches.xlsx'), top_three_vendors
         



# Wire the matching pipeline into a simple Gradio UI: one uploaded client
# workbook in; a status summary, a downloadable results workbook, and the
# top-three vendor list out.
client_input = gr.components.File(label="Client File")
status_output = gr.components.Textbox(label="Status")
results_output = gr.components.File(label="Download Match Results")
top_vendors_output = gr.components.Textbox(label="Top Three Vendors")

iface = gr.Interface(
    fn=process_file,
    inputs=[client_input],
    outputs=[status_output, results_output, top_vendors_output],
)

iface.launch()