Spaces:
Sleeping
Sleeping
feat: include hash table of tagged tags as output
Browse files-Added function count_tags which counts occurrences of each tag tagged
and returns the data in a DataFrame
-Updated Interface to also display above DataFrame
-Output file link now includes data from both DataFrames in separate
sheets
-Updated user instructions to reflect above changes
-Amend: update documentation comments for Counter import
- app.py +46 -3
- user_instructions.txt +15 -9
app.py
CHANGED
|
@@ -6,6 +6,7 @@ from openai import OpenAI # For sending the quotes to OpenAI for tagging
|
|
| 6 |
import openpyxl # Requirement for reading Excel files into pandas Dataframes
|
| 7 |
import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
|
| 8 |
from dotenv import load_dotenv # For loading environment variables in local environment
|
|
|
|
| 9 |
|
| 10 |
# Load environment variables from local .env file if it exists; otherwise this does nothing
|
| 11 |
load_dotenv()
|
|
@@ -51,6 +52,41 @@ def tag_quote(quote: str, tags_list: list) -> list:
|
|
| 51 |
)
|
| 52 |
return json.loads(response.choices[0].message.content)['tags']
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
# Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
|
| 55 |
def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
|
| 56 |
"""
|
|
@@ -100,11 +136,16 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
|
|
| 100 |
# Tag all the quotes one by one using tag_quote function
|
| 101 |
quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
|
| 102 |
|
|
|
|
|
|
|
|
|
|
| 103 |
# Return only the quotes column and the new tags column
|
| 104 |
output_df = quotes_df[[quotes_col_name, 'Tags']]
|
| 105 |
output_file_path = "output.xlsx"
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
| 108 |
|
| 109 |
# Define user interface structure
|
| 110 |
demo = gr.Interface(
|
|
@@ -115,8 +156,10 @@ demo = gr.Interface(
|
|
| 115 |
gr.Textbox(label = "List of tags separated by commas")
|
| 116 |
],
|
| 117 |
outputs=[
|
| 118 |
-
gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
|
|
|
|
| 119 |
gr.File(label="Output data in file format")
|
|
|
|
| 120 |
],
|
| 121 |
title="Automated Research Code Tagger",
|
| 122 |
description=INSTRUCTIONS
|
|
|
|
| 6 |
import openpyxl # Requirement for reading Excel files into pandas Dataframes
|
| 7 |
import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
|
| 8 |
from dotenv import load_dotenv # For loading environment variables in local environment
|
| 9 |
+
from collections import Counter # For tabulating tag occurrences
|
| 10 |
|
| 11 |
# Load environment variables from local .env file if it exists; otherwise this does nothing
|
| 12 |
load_dotenv()
|
|
|
|
| 52 |
)
|
| 53 |
return json.loads(response.choices[0].message.content)['tags']
|
| 54 |
|
| 55 |
+
def count_tags(tags_list: list, tags_col: pd.Series) -> pd.DataFrame:
    """
    Creates a DataFrame indicating number of occurrences of each tag from a DataFrame column containing lists of tags.

    This function also takes in a tags_list; all tags in the tags_list will be in the output
    DataFrame (with a count of 0 if they never occur in the input tags_col). There may be some
    tags appearing in the output which were not in the original tags_list; these will be marked
    with a ! prefix.

    Args:
        tags_list (list): The list of tags given by the user
        tags_col (pd.Series): A column of lists where each list contains tags which are
            (ideally but not always; depending on OpenAI) selected from the tags_list.

    Returns:
        pd.DataFrame: A DataFrame with two columns, 'Tag' and 'Count'. The first contains individual
            tags which have appeared either in the tags_list, the lists within the tags_col, or both.
            The second contains the number of occurrences(int) of that tag within the lists in the
            tags_col. Rows for the tags_list come first, in the order given, followed by any
            !-prefixed tags in order of first appearance.
    """
    # Initialise Counter hash table with every user-supplied tag at zero so unused tags still appear
    tags_counter = Counter({tag: 0 for tag in tags_list})
    # Build the membership lookup once; testing against the list would rescan it for every tag
    known_tags = set(tags_list)
    # Iterate over the lists in tags_col
    for sublist in tags_col:
        # Iterate over the tags in each list
        for tag in sublist:
            try:
                is_known = tag in known_tags
            except TypeError:
                # An unhashable item cannot be one of the user's tags; treat it as unexpected
                is_known = False
            # If the tag was not in the tags_list given by the user, prefix it with a ! before counting
            key = tag if is_known else f"!{tag}"
            tags_counter[key] += 1
    # Convert the tags_counter to a DataFrame and return it
    return pd.DataFrame(list(tags_counter.items()), columns=['Tag', 'Count'])
|
| 88 |
+
|
| 89 |
+
|
| 90 |
# Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
|
| 91 |
def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
|
| 92 |
"""
|
|
|
|
| 136 |
# Tag all the quotes one by one using tag_quote function
|
| 137 |
quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
|
| 138 |
|
| 139 |
+
# Create hash table of tag occurrences using count_tags function
|
| 140 |
+
tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
|
| 141 |
+
|
| 142 |
# Return only the quotes column and the new tags column
|
| 143 |
output_df = quotes_df[[quotes_col_name, 'Tags']]
|
| 144 |
output_file_path = "output.xlsx"
|
| 145 |
+
with pd.ExcelWriter(output_file_path) as writer:
|
| 146 |
+
output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
|
| 147 |
+
tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
|
| 148 |
+
return output_df, tags_counter_df, output_file_path
|
| 149 |
|
| 150 |
# Define user interface structure
|
| 151 |
demo = gr.Interface(
|
|
|
|
| 156 |
gr.Textbox(label = "List of tags separated by commas")
|
| 157 |
],
|
| 158 |
outputs=[
|
| 159 |
+
gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
|
| 160 |
+
gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
|
| 161 |
gr.File(label="Output data in file format")
|
| 162 |
+
|
| 163 |
],
|
| 164 |
title="Automated Research Code Tagger",
|
| 165 |
description=INSTRUCTIONS
|
user_instructions.txt
CHANGED
|
@@ -2,14 +2,20 @@ ABOUT:
|
|
| 2 |
This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
|
| 3 |
and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
|
| 4 |
|
| 5 |
-
HOW TO USE:
|
| 6 |
-
1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)
|
| 7 |
-
2)Type in the name of the column where the quotes are located
|
| 8 |
-
3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc.
|
| 9 |
-
This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.
|
| 10 |
-
|
| 11 |
-
You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
|
| 14 |
-
I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
|
| 15 |
-
-Kenneth
|
|
|
|
| 2 |
This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
|
| 3 |
and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
|
| 4 |
|
| 5 |
+
HOW TO USE:<br>
|
| 6 |
+
1) Upload a single-sheet Excel file containing quotes in a column. (It is OK for the file to contain other data as well.)<br>
|
| 7 |
+
2) Type in the name of the column where the quotes are located.<br>
|
| 8 |
+
3) Type in a list of tags separated by commas. For proper names, slogans, or other tags that should be treated as an inseparable unit (e.g. Nike's "Just Do It"), add a * in front of the tag, e.g. tag1, *Just Do It, tag3, etc.<br>
|
| 9 |
+
This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
|
| 10 |
+
Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
|
|
|
|
| 11 |
|
| 12 |
+
READING THE OUTPUT (in the right-side column):<br>
|
| 13 |
+
First table: All the responses are collated and displayed here, together with the original quotes.<br>
|
| 14 |
+
Second table: Displays all the tags used and the number of occurrences of each tag. <br>
|
| 15 |
+
You may also retrieve both tables in a single Excel file via the link below them.<br>
|
| 16 |
+
Sometimes a quote might be tagged with a tag that you didn't give; in such cases the tag will be prefixed with ! in the second table.
|
| 17 |
+
|
| 18 |
+
DISCLAIMER:<br>
|
| 19 |
Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
|
| 20 |
+
I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.<br>
|
| 21 |
+
-Kenneth Lee, Perceptech.AI
|