Spaces:

Perceptechai
/

code-tagging

Sleeping

App Files Files Community

code-tagging / app.py

kenleeyx

feat: add per-quote logging

95e3994 11 months ago

raw

history blame

8.55 kB

	# Import necessary modules
	import gradio as gr # For creating the interactive user interface
	import os # For accessing environment variables
	import pandas as pd # For easier data handling
	from openai import OpenAI # For sending the quotes to OpenAI for tagging
	import openpyxl # Requirement for reading Excel files into pandas Dataframes
	import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
	from dotenv import load_dotenv # For loading environment variables in local environment
	from collections import Counter # For tabulating tag occurrences
	import logging

	logger = logging.getLogger()
	logger.setLevel(logging.INFO)
	logging.basicConfig(level=logging.INFO, force=True)

	# Load environment variables from local .env file if it exists; otherwise this does nothing
	load_dotenv()

	# Import prompt for requesting the tags from OpenAI
	with open("prompt.txt", "r") as prompt_file:
	PROMPT = prompt_file.read()
	logger.info(f"Loaded prompt: {PROMPT}")

	# Import user instructions for display on screen
	with open("user_instructions.txt", "r") as user_instruction_file:
	INSTRUCTIONS = user_instruction_file.read()

	#Initialising the OpenAI client
	client = OpenAI(
	api_key=os.getenv('OPENAI_KEY'),
	organization=os.getenv('ORG_KEY'),
	project=os.getenv('PROJ_KEY')
	)
	logger.info("Initialised OpenAI client")

	# Function to send the prompt with quote and tag list to OpenAI and get the tags for that quote back
	def tag_quote(quote: str, tags_list: list) -> list:
	"""
	Generates a list of tags for a given quote based on a predefined list of potential tags.

	This function uses a GPT-based language model to analyze the input quote and determine
	the most relevant tags from the provided list. The response is parsed from the JSON
	output of the model and returned as a list of tags.

	Args:
	quote (str): The quote or text to be analyzed.
	tags_list (list): A list of potential tags to match against the quote.

	Returns:
	list: A list of tags that are relevant to the quote, as determined by the model.
	"""
	logger.info(f"Tagging quote {quote}")
	response = client.chat.completions.create(
	model = "gpt-4o-mini",
	response_format={"type": "json_object"},
	messages=[
	{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
	{"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)}
	]
	)
	return json.loads(response.choices[0].message.content)['tags']

	def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
	"""
	Creates a DataFrame indicating number of occurences of each tag from a DataFrame column containing lists of tags.

	This function also takes in a tags_list; all tags in the tags_list will be in the output
	DataFrame even if they do not occur in the input tags_col. There may be some tags appearing
	in the output which were not in the original tag_list; these will be marked with a ! prefix.

	Args:
	tags_list (list): The list of tags given by the user
	tags_col (pd.Series): A column of lists where each list contains tags which are
	(ideally but not always; depending on OpenAI) selected from the tags_list.

	Returns:
	pd.DataFrame: A DataFrame with two columns. The first contains individual tags(str) which have
	appeared either in the tags_list, the lists within the tags_col, or both. The second
	contains the number of occurrences(int) of that tag within the lists in the tags_col.
	"""
	# Initialise Counter hash table
	tags_counter = Counter({tag: 0 for tag in tags_list})
	# Iterate over the lists in tags_col
	for sublist in tags_col:
	# Iterate over the tags in each list
	for tag in sublist:
	# Update the tags_counter for each tag
	if tag in tags_list:
	tags_counter.update([tag])
	# If the tag was not in the tags_list given by the user, prefix it with a ! before updating
	else:
	tags_counter.update([f"!{tag}"])
	# Convert the tags_counter to a DataFrame and return it
	tags_counter_df = pd.DataFrame(tags_counter.items(), columns=['Tag', 'Count'])
	return tags_counter_df


	# Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
	def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
	"""
	Processes quotes from an Excel file and assigns relevant tags to each quote.

	This function reads an Excel file containing quotes, validates the column containing
	the quotes, and applies the `tag_quote` function to assign tags to each quote.
	The tags are derived from a user-provided comma-separated string.

	Args:
	quotes_file_path (str): Path to the Excel file containing the quotes.
	quotes_col_name (str): The name of the column containing the quotes.
	tags_string (str): A comma-separated string of potential tags.

	Returns:
	pd.DataFrame: A DataFrame with two columns:
	- The original column containing the quotes.
	- A new column 'Tags' with the tags assigned to each quote.

	Raises:
	gr.Error: If the specified column name does not exist or is not unique.

	Example:
	>>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
	Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
	"""
	tags_list = tags_string.split(',')
	tags_list = [tag.strip() for tag in tags_list]

	# Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
	# pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
	# Extract the first row(the actual header for the DataFrame) as a DataFrame without header.
	quotes_df_cols= pd.read_excel(quotes_file_path, header=None, nrows=1).values[0]
	# Extract all the other rows of the Excel file as a DataFrame without header
	quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
	# Set the extracted first row as the header for the DataFrame resultant from the other rows
	quotes_df.columns = quotes_df_cols

	# Check that the column name given by the user exists and is unique
	count = quotes_df.columns.tolist().count(quotes_col_name)
	if count == 0:
	raise gr.Error("No columns with this name found")
	elif count > 1:
	raise gr.Error("Multiple columns with this name found, please rename to something unique")
	quotes_data = quotes_df[quotes_col_name]

	# Tag all the quotes one by one using tag_quote function
	tags_column = []
	for i, quote in enumerate(quotes_data):
	logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
	tags = tag_quote(quote, tags_list)
	tags_column.append(tags)

	quotes_df['Tags'] = tags_column
	logger.info("Quotes tagged")

	# Create hash table of tag occurrences using count_tags function
	tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
	logger.info("Tags counted")

	# Return only the quotes column and the new tags column
	output_df = quotes_df[[quotes_col_name, 'Tags']]
	output_file_path = "output.xlsx"
	with pd.ExcelWriter(output_file_path) as writer:
	output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
	tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
	logger.info('Results written to Excel')
	return output_df, tags_counter_df, output_file_path

	# Define user interface structure
	demo = gr.Interface(
	fn=process_quotes,
	inputs=[
	gr.File(label="Quotes Excel File"),
	gr.Textbox(label="Name of quotes column"),
	gr.Textbox(label = "List of tags separated by commas")
	],
	outputs=[
	gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
	gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
	gr.File(label="Output data in file format")

	],
	title="Automated Research Code Tagger",
	description=INSTRUCTIONS
	)

	demo.launch(share=True, auth=(os.getenv('APP_USERNAME'), os.getenv('APP_PASSWORD')), ssr_mode=False)