Spaces:

Perceptechai
/

code-tagging

Sleeping

kenleeyx committed on May 28, 2025

Commit

76b64ac

1 Parent(s): f792567

feat: \n-separated taglist, progress bar, etc

Added new features:
1) Split the user's tag list input on newlines instead of commas to
facilitate copy-paste input from Excel

2) Add a progress bar to display what the app is doing and how many quotes
have been tagged. Accordingly, the main function now yields instead of
returning.

3) Add an option to let the user specify columns to be retained in the output

4) Convert tags in the output from list to string to remove brackets
from the Excel output file

Files changed (2) hide show

app.py +23 -12
user_instructions.txt +3 -2

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ from collections import Counter # For tabulating tag occurrences
 import logging
 import time
 from datetime import datetime
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@@ -98,7 +99,7 @@ def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
 # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
-def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
     """
     Processes quotes from an Excel file and assigns relevant tags to each quote.
@@ -123,9 +124,12 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
         >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
         Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
     """
-    tags_list = tags_string.split(',')
     tags_list = [tag.strip() for tag in tags_list]
     # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
     # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
     # Extract the first row(the actual header for the DataFrame) as a DataFrame without header.
@@ -134,19 +138,21 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
     quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
     # Set the extracted first row as the header for the DataFrame resultant from the other rows
     quotes_df.columns = quotes_df_cols
-    # Check that the column name given by the user exists and is unique
-    count = quotes_df.columns.tolist().count(quotes_col_name)
-    if count == 0:
-        raise gr.Error("No columns with this name found")
-    elif count > 1:
-        raise gr.Error("Multiple columns with this name found, please rename to something unique")
     quotes_data = quotes_df[quotes_col_name]
     # Tag all the quotes one by one using tag_quote function
     tags_column = []
     for i, quote in enumerate(quotes_data):
         logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
         tags = tag_quote(quote, tags_list)
         valid_tags = []
         for tag in tags: # filter out any hallucinated tags
@@ -162,15 +168,18 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
     # Create hash table of tag occurrences using count_tags function
     tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
     logger.info("Tags counted")
-    # Return only the quotes column and the new tags column
-    output_df = quotes_df[[quotes_col_name, 'Tags']]
     output_file_path = "output.xlsx"
     with pd.ExcelWriter(output_file_path) as writer:
         output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
         tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
     logger.info('Results written to Excel')
-    return output_df, tags_counter_df, output_file_path
 def check_auth(username:str, password:str):
     """
@@ -203,9 +212,11 @@ demo = gr.Interface(
     inputs=[
         gr.File(label="Quotes Excel File"),
         gr.Textbox(label="Name of quotes column"),
-        gr.Textbox(label = "List of tags separated by commas")
     ],
     outputs=[
         gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
         gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
         gr.File(label="Output data in file format")

 import logging
 import time
 from datetime import datetime
+from typing import Generator
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 # Generator that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, yields a progress message per quote, and finally yields the quotes and tags as DataFrames together with the output file path
+def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns: str, tags_string: str) -> Generator[tuple[str, pd.DataFrame, pd.DataFrame, str]]:
     """
     Processes quotes from an Excel file and assigns relevant tags to each quote.
         >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
         Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
     """
+    tags_list = tags_string.split('\n')
     tags_list = [tag.strip() for tag in tags_list]
+    retained_cols_list = retained_columns.split(',')
+    retained_cols_list = [colname.strip() for colname in retained_cols_list]
     # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
     # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
     # Extract the first row(the actual header for the DataFrame) as a DataFrame without header.
     quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
     # Set the extracted first row as the header for the DataFrame resultant from the other rows
     quotes_df.columns = quotes_df_cols
+    # Verify that all column names given are found in the quotes DF exactly once each
+    for colname in retained_cols_list + [quotes_col_name]:
+        count = quotes_df.columns.tolist().count(colname)
+        if count == 0:
+            raise gr.Error(f"No columns with name {colname} found, check your inputs")
+        elif count > 1:
+            raise gr.Error(f"Multiple columns with name {colname} found, please rename these columns to something unique")
     quotes_data = quotes_df[quotes_col_name]
     # Tag all the quotes one by one using tag_quote function
     tags_column = []
     for i, quote in enumerate(quotes_data):
         logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
+        yield (f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}", None, None, None)
         tags = tag_quote(quote, tags_list)
         valid_tags = []
         for tag in tags: # filter out any hallucinated tags
     # Create hash table of tag occurrences using count_tags function
     tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
     logger.info("Tags counted")
+    #Convert values in tags column from list to str
+    quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
+    # Return only the quotes column, the new tags column, and any other specified cols to retain
+    output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']]
     output_file_path = "output.xlsx"
     with pd.ExcelWriter(output_file_path) as writer:
         output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
         tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
     logger.info('Results written to Excel')
+    yield ("Not running", output_df[[quotes_col_name, 'Tags']], tags_counter_df, output_file_path)
 def check_auth(username:str, password:str):
     """
     inputs=[
         gr.File(label="Quotes Excel File"),
         gr.Textbox(label="Name of quotes column"),
+        gr.Textbox(label = "Names of columns(eg respondentID) to retain in output, separated by commas"),
+        gr.Textbox(label = "List of tags, each tag on a new line"),
     ],
     outputs=[
+        gr.Textbox(label="Progress", value = "Not running"),
         gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
         gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
         gr.File(label="Output data in file format")

user_instructions.txt CHANGED Viewed

@@ -5,15 +5,16 @@ and ChatGPT will respond with the subset of the input tags which are related to
 HOW TO USE:<br>
 1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)<br>
 2)Type in the name of the column where the quotes are located<br>
-3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc. <br>
 This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
 Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
 READING THE OUTPUT(in the right-side column): <br>
 First table: All the responses are collated and displayed here, together with the original quotes.<br>
 Second table: Displays all the tags used and the number of occurrences of each tag. <br>
 You may also retrieve both tables in a single Excel file via the link below them.<br>
-Sometimes a quote might be tagged with a tag that you didn't give; in such cases the tag will be prefixed with ! in the second table.
 DISCLAIMER:<br>
 Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.

 HOW TO USE:<br>
 1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)<br>
 2)Type in the name of the column where the quotes are located<br>
+3)Type in the names of any other columns which you wish to retain in the output, separated by commas <br>
+4)Type in a list of tags, each tag on a new line. For proper names/slogans/other tags that should be treated as an inseparable unit, e.g. Nike's "Just Do It", add a * in front of the tag, e.g. enter tag1, *Just Do It and tag3 each on its own line. <br>
 This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
 Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
 READING THE OUTPUT(in the right-side column): <br>
+Progress bar: Indicates which quote is currently being processed <br>
 First table: All the responses are collated and displayed here, together with the original quotes.<br>
 Second table: Displays all the tags used and the number of occurrences of each tag. <br>
 You may also retrieve both tables in a single Excel file via the link below them.<br>
 DISCLAIMER:<br>
 Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.