Spaces:

Perceptechai
/

code-tagging

Sleeping

kenleeyx committed on Jan 28, 2025

Commit

243f5cb

1 Parent(s): 0c5d2b6

feat: include hash table of tagged tags as output

-Added function count_tags which counts occurrences of each tag tagged
and returns the data in a DataFrame
-Updated Interface to also display above DataFrame
-Output file link now includes data from both DataFrames in separate
sheets
-Updated user instructions to reflect above changes
-Amend: update documentation comments for Counter import

Files changed (2) hide show

app.py +46 -3
user_instructions.txt +15 -9

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from openai import OpenAI # For sending the quotes to OpenAI for tagging
 import openpyxl # Requirement for reading Excel files into pandas Dataframes
 import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
 from dotenv import load_dotenv # For loading environment variables in local environment
 # Load environment variables from local .env file if it exists; otherwise this does nothing
 load_dotenv()
@@ -51,6 +52,41 @@ def tag_quote(quote: str, tags_list: list) -> list:
     )
     return json.loads(response.choices[0].message.content)['tags']
 # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
 def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
     """
@@ -100,11 +136,16 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
     # Tag all the quotes one by one using tag_quote function
     quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
     # Return only the quotes column and the new tags column
     output_df = quotes_df[[quotes_col_name, 'Tags']]
     output_file_path = "output.xlsx"
-    output_df.to_excel(output_file_path, index=False)
-    return output_df, output_file_path
 # Define user interface structure
 demo = gr.Interface(
@@ -115,8 +156,10 @@ demo = gr.Interface(
         gr.Textbox(label = "List of tags separated by commas")
     ],
     outputs=[
-        gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2),
         gr.File(label="Output data in file format")
     ],
     title="Automated Research Code Tagger",
     description=INSTRUCTIONS

 import openpyxl # Requirement for reading Excel files into pandas Dataframes
 import json # For conversion of OpenAI responses into json/dictionary objects so the contents can be extracted
 from dotenv import load_dotenv # For loading environment variables in local environment
+from collections import Counter # For tabulating tag occurrences
 # Load environment variables from local .env file if it exists; otherwise this does nothing
 load_dotenv()
     )
     return json.loads(response.choices[0].message.content)['tags']
def count_tags(tags_list: list, tags_col: pd.Series) -> pd.DataFrame:
    """
    Creates a DataFrame indicating the number of occurrences of each tag from a DataFrame column containing lists of tags.
    This function also takes in a tags_list; all tags in the tags_list will be in the output
    DataFrame even if they do not occur in the input tags_col. There may be some tags appearing
    in the output which were not in the original tags_list; these will be marked with a ! prefix.
    Args:
        tags_list (list): The list of tags given by the user
        tags_col (pd.Series): A column of lists where each list contains tags which are
        (ideally but not always; depending on OpenAI) selected from the tags_list.
    Returns:
        pd.DataFrame: A DataFrame with two columns, 'Tag' and 'Count'. The first contains individual
        tags(str) which have appeared either in the tags_list, the lists within the tags_col, or both.
        The second contains the number of occurrences(int) of that tag within the lists in the tags_col.
        Rows for the tags_list come first (in the order given), followed by any !-prefixed tags in
        order of first appearance.
    """
    # Initialise Counter hash table with every user tag at 0 so unused tags still appear in the output
    tags_counter = Counter({tag: 0 for tag in tags_list})
    # Build the lookup set once so each membership test is O(1) instead of a scan of tags_list
    known_tags = set(tags_list)
    # Iterate over the lists in tags_col
    for sublist in tags_col:
        # Iterate over the tags in each list
        for tag in sublist:
            try:
                is_known = tag in known_tags
            except TypeError:
                # Unhashable values (e.g. a nested list in the response) can never equal a user tag
                is_known = False
            # If the tag was not in the tags_list given by the user, prefix it with a ! before counting
            key = tag if is_known else f"!{tag}"
            tags_counter[key] += 1
    # Convert the tags_counter to a DataFrame and return it
    tags_counter_df = pd.DataFrame(tags_counter.items(), columns=['Tag', 'Count'])
    return tags_counter_df
 # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
 def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
     """
     # Tag all the quotes one by one using tag_quote function
     quotes_df['Tags'] = quotes_data.apply(tag_quote, args=(tags_list,))
+    # Create hash table of tag occurrences using count_tags function
+    tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
     # Return only the quotes column and the new tags column
     output_df = quotes_df[[quotes_col_name, 'Tags']]
     output_file_path = "output.xlsx"
+    with pd.ExcelWriter(output_file_path) as writer:
+        output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
+        tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
+    return output_df, tags_counter_df, output_file_path
 # Define user interface structure
 demo = gr.Interface(
         gr.Textbox(label = "List of tags separated by commas")
     ],
     outputs=[
+        gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
+        gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
         gr.File(label="Output data in file format")
     ],
     title="Automated Research Code Tagger",
     description=INSTRUCTIONS

user_instructions.txt CHANGED Viewed

@@ -2,14 +2,20 @@ ABOUT:
 This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
 and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
-HOW TO USE:
-1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)
-2)Type in the name of the column where the quotes are located
-3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc.
-This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.
-4)All the responses from ChatGPT will be collated and displayed in the table on the right, together with the original quotes.
-You may then copy them into an Excel file for further processing. Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!
 Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
-I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.
--Kenneth

 This automated tagger takes in a list of tags and a list of input quotes. Each input quote is individually fed to OpenAI's ChatGPT together with the list of tags,
 and ChatGPT will respond with the subset of the input tags which are related to the content of the quote.
+HOW TO USE:<br>
+1) Upload a single-sheet Excel file containing quotes in a column. (It is OK for the file to contain other data as well.)<br>
+2) Type in the name of the column where the quotes are located.<br>
+3) Type in a list of tags separated by commas. For proper names, slogans, or other tags that should be treated as an inseparable unit, e.g. Nike's "Just Do It", add a * in front of the tag, e.g. tag1, *Just Do It, tag3, etc.<br>
+This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
+Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
+READING THE OUTPUT (in the right-side column):<br>
+First table: All the responses are collated and displayed here, together with the original quotes.<br>
+Second table: Displays all the tags used and the number of occurrences of each tag. <br>
+You may also retrieve both tables in a single Excel file via the link below them.<br>
+Sometimes a quote might be tagged with a tag that you didn't give; in such cases the tag will be prefixed with ! in the second table.<br>
+DISCLAIMER:<br>
 Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
+I will not be responsible for mistakes made by the AI, but I can try to fix them if you alert me.<br>
+-Kenneth Lee, Perceptech.AI