Spaces:
Sleeping
Sleeping
| # Import necessary modules | |
| import gradio as gr # For creating the interactive user interface | |
| import os # For accessing environment variables | |
| import pandas as pd # For easier data handling | |
| from openai import OpenAI # For sending the quotes to OpenAI for tagging | |
| import openpyxl # Requirement for reading Excel files into pandas Dataframes | |
| import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted | |
| # Import prompt for requesting the tags from OpenAI | |
| with open("prompt.txt", "r") as prompt_file: | |
| PROMPT = prompt_file.read() | |
| # Import user instructions for display on screen | |
| with open("user_instructions.txt", "r") as user_instruction_file: | |
| INSTRUCTIONS = user_instruction_file.read() | |
| #Initialising the OpenAI client | |
| client = OpenAI( | |
| api_key=os.getenv('OPENAI_KEY'), | |
| organization=os.getenv('ORG_KEY'), | |
| project=os.getenv('PROJ_KEY') | |
| ) | |
| # Function to send the prompt with quote and tag list to OpenAI and get the tags for that quote back | |
| def tag_quote(quote: str, tags_list: list) -> list: | |
| """ | |
| Generates a list of tags for a given quote based on a predefined list of potential tags. | |
| This function uses a GPT-based language model to analyze the input quote and determine | |
| the most relevant tags from the provided list. The response is parsed from the JSON | |
| output of the model and returned as a list of tags. | |
| Args: | |
| quote (str): The quote or text to be analyzed. | |
| tags_list (list): A list of potential tags to match against the quote. | |
| Returns: | |
| list: A list of tags that are relevant to the quote, as determined by the model. | |
| """ | |
| response = client.chat.completions.create( | |
| model = "gpt-4o-mini", | |
| response_format={"type": "json_object"}, | |
| messages=[ | |
| {"role": "system", "content": "You are a helpful assistant designed to output JSON."}, | |
| {"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)} | |
| ] | |
| ) | |
| return json.loads(response.choices[0].message.content)['tags'] | |
| # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame | |
| def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame: | |
| """ | |
| Processes quotes from an Excel file and assigns relevant tags to each quote. | |
| This function reads an Excel file containing quotes, validates the column containing | |
| the quotes, and applies the `tag_quote` function to assign tags to each quote. | |
| The tags are derived from a user-provided comma-separated string. | |
| Args: | |
| quotes_file_path (str): Path to the Excel file containing the quotes. | |
| quotes_col_name (str): The name of the column containing the quotes. | |
| tags_string (str): A comma-separated string of potential tags. | |
| Returns: | |
| pd.DataFrame: A DataFrame with two columns: | |
| - The original column containing the quotes. | |
| - A new column 'Tags' with the tags assigned to each quote. | |
| Raises: | |
| gr.Error: If the specified column name does not exist or is not unique. | |
| Example: | |
| >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational") | |
| Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes. | |
| """ | |
| tags_list = tags_string.split(',') | |
| tags_list = [tag.strip() for tag in tags_list] | |
| # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file | |
| # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name | |
| # Extract the first row(the actual header for the DataFrame) as a DataFrame without header. | |
| quotes_df_cols= pd.read_excel(quotes_file_path, header=None, nrows=1).values[0] | |
| # Extract all the other rows of the Excel file as a DataFrame without header | |
| quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1) | |
| # Set the extracted first row as the header for the DataFrame resultant from the other rows | |
| quotes_df.columns = quotes_df_cols | |
| # Check that the column name given by the user exists and is unique | |
| count = quotes_df.columns.tolist().count(quotes_col_name) | |
| if count == 0: | |
| raise gr.Error("No columns with this name found") | |
| elif count > 1: | |
| raise gr.Error("Multiple columns with this name found, please rename to something unique") | |
| quotes_data = quotes_df[quotes_col_name] | |
| # Tag all the quotes one by one using tag_quote function | |
| quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,)) | |
| # Return only the quotes column and the new tags column | |
| return quotes_df[[quotes_col_name, 'Tags']] | |
| demo = gr.Interface( | |
| fn=process_quotes, | |
| inputs=[ | |
| gr.File(label="Quotes Excel File"), | |
| gr.Textbox(label="Name of quotes column"), | |
| gr.Textbox(label = "List of tags separated by commas") | |
| ], | |
| outputs=gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2), | |
| title="Automated Research Code Tagger", | |
| description=INSTRUCTIONS | |
| ) | |
| demo.launch() | |