# Hugging Face Space: Perceptechai/code-tagging (status: Sleeping; file size: 6,595 bytes)

import gradio as gr
import os
import pandas as pd
from openai import OpenAI
import openpyxl
import json

# Shared OpenAI client used for every tagging request. The API key,
# organization and project identifiers come from the OPENAI_KEY, ORG_KEY and
# PROJ_KEY environment variables respectively.
_client_settings = {
    "api_key": os.getenv("OPENAI_KEY"),
    "organization": os.getenv("ORG_KEY"),
    "project": os.getenv("PROJ_KEY"),
}
client = OpenAI(**_client_settings)

# Login credentials for the authentication screen (in this file they are only
# referenced by the commented-out `authenticate` helper).
# SECURITY: credentials must never be committed to source control, so they are
# read from the VALID_USERNAME / VALID_PASSWORD environment variables instead.
# When a variable is unset the value is None, which can never equal a submitted
# string, so a login check against it fails closed.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in the repository history and should be rotated.
VALID_USERNAME = os.getenv("VALID_USERNAME")
VALID_PASSWORD = os.getenv("VALID_PASSWORD")

# User-facing text for the Gradio page: `title` and `description` are passed to
# gr.Interface further down. The explicit "\n" escapes sit next to real line
# breaks in the literal, so each marked line ends with two newline characters.
# TODO: add instructions on how to convert the input to CSV.
title = "Automated Research Code Tagger"
description = """
ABOUT:\n
This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags, 
and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.\n

HOW TO USE:\n
1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)\n
2)Type in the name of the column where the quotes are located\n
3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc.
This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.\n
4)All the responses from ChatGPT will be collated and displayed in the table on the right, together with the original quotes.
You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!\n

Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them. 
I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
-Kenneth
"""

# Prompt template sent as the user message for each quote. tag_quote fills it in
# with str.format(quote=..., tags_list=...), which is why the literal braces of
# the JSON example at the end are doubled ({{ and }}): str.format turns them
# back into single braces instead of treating them as placeholders.
prompt = """
Given the quote below and the regular tag list below, evaluate each tag in the tag list and determine if the meaning of the quote can be described by that tag topic.
If so, return the relevant tag in your response. Use only the tags provided in the list. Under no circumstances should you create new tag names.

For the tags starting with a *, these tags should be treated as proper nouns(usually product names or slogans) and should not be used unless the quote explicitly contains the entire tag.
For quotes with meanings that are more ambiguous and can relate to multiple tags, make no assumptions about their meanings and only add tags if the topic of the tag is actually mentioned in the quote.
If there are no relevant tags to the quote, return an empty list.

Quote:
{quote}

Tag list:
{tags_list}

Respond in the following format:
{{
  "tags":[<tagName1>, <tagName2>]
}}
"""
def tag_quote(quote, tags_list):
    """Ask the OpenAI chat model which of ``tags_list`` apply to ``quote``.

    Args:
        quote: Text of a single quote. Missing values (``None``/NaN, which is
            how empty spreadsheet cells arrive) and blank strings are not sent
            to the API. Other non-string values are converted with ``str``.
        tags_list: Candidate tags. They are interpolated into the module-level
            ``prompt`` template together with the quote.

    Returns:
        list: The value of the ``"tags"`` field in the model's JSON reply, or
        an empty list when the quote is blank/missing or the reply carries no
        usable ``"tags"`` list.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    # Empty Excel cells reach this function as NaN/None. There is nothing to
    # tag, so skip the API round-trip instead of sending the text "nan".
    if not isinstance(quote, str):
        if pd.isna(quote):
            return []
        quote = str(quote)
    if not quote.strip():
        return []

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": prompt.format(tags_list=tags_list, quote=quote)},
        ],
    )
    content = response.choices[0].message.content
    print(content)

    # The reply is JSON, but its shape is decided by the model: tolerate empty
    # content, a missing "tags" key, or a bare string instead of a list rather
    # than failing the whole batch with KeyError/TypeError.
    parsed = json.loads(content) if content else {}
    tags = parsed.get("tags") if isinstance(parsed, dict) else None
    if isinstance(tags, str):
        return [tags]
    return tags if isinstance(tags, list) else []
    
def process_quotes(quotes_file_path, quotes_col_name, tags_string):
    """Tag every quote found in one column of an Excel sheet.

    Args:
        quotes_file_path: Excel file to read; passed straight to
            ``pd.read_excel``.
        quotes_col_name: Header text of the column that holds the quotes. An
            exact match is tried first, then a match that ignores leading and
            trailing whitespace.
        tags_string: Comma-separated tags. Surrounding whitespace is stripped
            and empty entries (e.g. from a trailing comma) are dropped.

    Returns:
        pandas.DataFrame: Two columns, the quotes (named after the sheet's
        header cell) and ``'Tags'`` holding the list returned by ``tag_quote``
        for each quote. Empty when the sheet has no rows below the header.

    Raises:
        gr.Error: If no tags are given, the sheet is empty, or the column name
            matches no column or more than one column.
    """
    print(quotes_file_path)
    print(quotes_col_name)
    print(tags_string)

    # Drop the empty entries produced by stray or trailing commas so the model
    # is never offered a blank tag.
    tags_list = [tag.strip() for tag in (tags_string or "").split(',') if tag.strip()]
    if not tags_list:
        raise gr.Error("Please provide at least one tag")

    # pd.read_excel renames duplicate header cells (foo -> foo.1) when it parses
    # a header row, so the first row is read as plain data and the remaining
    # rows are read separately without a header.
    header_df = pd.read_excel(quotes_file_path, header=None, nrows=1)
    if header_df.empty:
        raise gr.Error("The uploaded file is empty")
    header = header_df.iloc[0].tolist()

    wanted = quotes_col_name or ""
    positions = [i for i, col in enumerate(header) if col == wanted]
    if not positions:
        # Forgive whitespace typed around the name or present in the header cell.
        stripped = wanted.strip()
        positions = [
            i for i, col in enumerate(header)
            if isinstance(col, str) and col.strip() == stripped
        ]
    if not positions:
        raise gr.Error("No columns with this name found")
    if len(positions) > 1:
        raise gr.Error("Multiple columns with this name found, please rename to something unique")
    position = positions[0]
    column_label = header[position]

    quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
    if quotes_df.empty:
        # Header-only sheet: nothing to tag.
        return pd.DataFrame(columns=[column_label, 'Tags'])

    # Select by position and build a fresh frame, so duplicate header names or
    # an existing 'Tags' column in the sheet cannot be clobbered.
    quotes_data = quotes_df.iloc[:, position].rename(column_label)
    tags = quotes_data.apply(tag_quote, args=(tags_list,)).rename('Tags')
    return pd.concat([quotes_data, tags], axis=1)
    
# def authenticate(username, password):
#     """Authenticate the user using static username and password"""
#     if username == VALID_USERNAME and password == VALID_PASSWORD:
#         return True
#     else:
#         return False
        
# def auth_interface(username, password):
#     """Handle the authentication and proceed with the main function if valid"""
#     if authenticate(username, password):
#         return gr.Interface(
#             fn=process_quotes, 
#             inputs=[
#                 gr.File(label="Quotes Excel File"),  # File as generated by TFT software
#                 gr.Textbox(label="Name of quotes column"),  # use this to identify the col with the quotes 
#                 gr.Textbox(label="List of tags separated by commas")
#             ],
#             outputs=gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
#             title=title,
#             description=description
#         ).launch()
#     else:
#         return "Invalid username or password!"

# # Create the authentication fields before launching the main app
# auth_app = gr.Interface(
#     fn=auth_interface,
#     inputs=[
#         gr.Textbox(label="Username", type="text"),
#         gr.Textbox(label="Password", type="password")
#     ],
#     outputs="text",
#     title="Login to Automated Research Code Tagger",
#     description="Please enter the correct username and password to access the tool."
# )

# auth_app.launch()
# Gradio UI: an Excel upload, the quotes column name and the comma-separated
# tag list are passed to process_quotes; the DataFrame it returns is shown in
# a two-column table.
demo = gr.Interface(
    fn=process_quotes,
    inputs=[
        gr.File(label="Quotes Excel File"),  # File as generated by TFT software
        gr.Textbox(label="Name of quotes column"),  # identifies the column holding the quotes
        gr.Textbox(label="List of tags separated by commas"),
    ],
    outputs=gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
    title=title,
    description=description,
)

# Start the web server only when this file is executed as a script, so that
# importing the module (to reuse `demo` or the helper functions) does not
# launch a server as a side effect.
if __name__ == "__main__":
    demo.launch()

# For later when I enable usage of own API key
# api_key = gr.Textbox(
#         type="password", label="Enter your OpenAI API key here (Optional for Perceptech users)"
#     )