kenleeyx committed on
Commit
76b64ac
·
1 Parent(s): f792567

feat: \n-separated taglist, progress bar, etc

Browse files

Added new features:
1) Split the user's tag list input on newlines instead of commas to
facilitate copy-paste input from Excel

2) Add progress bar to display what the app is doing and how many quotes
have been tagged. Accordingly, the main function now yields instead of
returning.

3)Add option to allow user to specify columns to be retained in output

4)Tags in output are converted from list to string to remove brackets
from Excel output file

Files changed (2) hide show
  1. app.py +23 -12
  2. user_instructions.txt +3 -2
app.py CHANGED
@@ -10,6 +10,7 @@ from collections import Counter # For tabulating tag occurrences
10
  import logging
11
  import time
12
  from datetime import datetime
 
13
 
14
  logger = logging.getLogger()
15
  logger.setLevel(logging.INFO)
@@ -98,7 +99,7 @@ def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
98
 
99
 
100
  # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
101
- def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str) -> pd.DataFrame:
102
  """
103
  Processes quotes from an Excel file and assigns relevant tags to each quote.
104
 
@@ -123,9 +124,12 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
123
  >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
124
  Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
125
  """
126
- tags_list = tags_string.split(',')
127
  tags_list = [tag.strip() for tag in tags_list]
128
 
 
 
 
129
  # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
130
  # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
131
  # Extract the first row(the actual header for the DataFrame) as a DataFrame without header.
@@ -134,19 +138,21 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
134
  quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
135
  # Set the extracted first row as the header for the DataFrame resultant from the other rows
136
  quotes_df.columns = quotes_df_cols
 
 
 
 
 
 
 
137
 
138
- # Check that the column name given by the user exists and is unique
139
- count = quotes_df.columns.tolist().count(quotes_col_name)
140
- if count == 0:
141
- raise gr.Error("No columns with this name found")
142
- elif count > 1:
143
- raise gr.Error("Multiple columns with this name found, please rename to something unique")
144
  quotes_data = quotes_df[quotes_col_name]
145
 
146
  # Tag all the quotes one by one using tag_quote function
147
  tags_column = []
148
  for i, quote in enumerate(quotes_data):
149
  logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
 
150
  tags = tag_quote(quote, tags_list)
151
  valid_tags = []
152
  for tag in tags: # filter out any hallucinated tags
@@ -162,15 +168,18 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, tags_string: str
162
  # Create hash table of tag occurrences using count_tags function
163
  tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
164
  logger.info("Tags counted")
 
 
 
165
 
166
- # Return only the quotes column and the new tags column
167
- output_df = quotes_df[[quotes_col_name, 'Tags']]
168
  output_file_path = "output.xlsx"
169
  with pd.ExcelWriter(output_file_path) as writer:
170
  output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
171
  tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
172
  logger.info('Results written to Excel')
173
- return output_df, tags_counter_df, output_file_path
174
 
175
  def check_auth(username:str, password:str):
176
  """
@@ -203,9 +212,11 @@ demo = gr.Interface(
203
  inputs=[
204
  gr.File(label="Quotes Excel File"),
205
  gr.Textbox(label="Name of quotes column"),
206
- gr.Textbox(label = "List of tags separated by commas")
 
207
  ],
208
  outputs=[
 
209
  gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
210
  gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
211
  gr.File(label="Output data in file format")
 
10
  import logging
11
  import time
12
  from datetime import datetime
13
+ from typing import Generator
14
 
15
  logger = logging.getLogger()
16
  logger.setLevel(logging.INFO)
 
99
 
100
 
101
  # Function that takes in a list of tags and an Excel file of quotes, calls tag_quote() on each quote, and returns all the quotes and tags in a DataFrame
102
+ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns: str, tags_string: str) -> Generator[tuple[str, pd.DataFrame, pd.DataFrame, str]]:
103
  """
104
  Processes quotes from an Excel file and assigns relevant tags to each quote.
105
 
 
124
  >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
125
  Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
126
  """
127
+ tags_list = tags_string.split('\n')
128
  tags_list = [tag.strip() for tag in tags_list]
129
 
130
+ retained_cols_list = retained_columns.split(',')
131
+ retained_cols_list = [colname.strip() for colname in retained_cols_list]
132
+
133
  # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
134
  # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
135
  # Extract the first row(the actual header for the DataFrame) as a DataFrame without header.
 
138
  quotes_df = pd.read_excel(quotes_file_path, header=None, skiprows=1)
139
  # Set the extracted first row as the header for the DataFrame resultant from the other rows
140
  quotes_df.columns = quotes_df_cols
141
+ # Verify that all column names given are found in the quotes DF exactly once each
142
+ for colname in retained_cols_list + [quotes_col_name]:
143
+ count = quotes_df.columns.tolist().count(colname)
144
+ if count == 0:
145
+ raise gr.Error(f"No columns with name {colname} found, check your inputs")
146
+ elif count > 1:
147
+ raise gr.Error(f"Multiple columns with name {colname} found, please rename these columns to something unique")
148
 
 
 
 
 
 
 
149
  quotes_data = quotes_df[quotes_col_name]
150
 
151
  # Tag all the quotes one by one using tag_quote function
152
  tags_column = []
153
  for i, quote in enumerate(quotes_data):
154
  logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
155
+ yield (f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}", None, None, None)
156
  tags = tag_quote(quote, tags_list)
157
  valid_tags = []
158
  for tag in tags: # filter out any hallucinated tags
 
168
  # Create hash table of tag occurrences using count_tags function
169
  tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
170
  logger.info("Tags counted")
171
+
172
+ #Convert values in tags column from list to str
173
+ quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
174
 
175
+ # Return only the quotes column, the new tags column, and any other specified cols to retain
176
+ output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']]
177
  output_file_path = "output.xlsx"
178
  with pd.ExcelWriter(output_file_path) as writer:
179
  output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
180
  tags_counter_df.to_excel(writer, sheet_name='Tag Count', index=False)
181
  logger.info('Results written to Excel')
182
+ yield ("Not running", output_df[[quotes_col_name, 'Tags']], tags_counter_df, output_file_path)
183
 
184
  def check_auth(username:str, password:str):
185
  """
 
212
  inputs=[
213
  gr.File(label="Quotes Excel File"),
214
  gr.Textbox(label="Name of quotes column"),
215
+ gr.Textbox(label = "Names of columns(eg respondentID) to retain in output, separated by commas"),
216
+ gr.Textbox(label = "List of tags, each tag on a new line"),
217
  ],
218
  outputs=[
219
+ gr.Textbox(label="Progress", value = "Not running"),
220
  gr.Dataframe(headers=["Quote", "Tags"], column_widths=["70%", "30%"], scale=2, label='Coded Quotes'),
221
  gr.Dataframe(headers=["Tag", "Count"], label='Tag Count'),
222
  gr.File(label="Output data in file format")
user_instructions.txt CHANGED
@@ -5,15 +5,16 @@ and ChatGPT will respond with the subset of the input tags which are related to
5
  HOW TO USE:<br>
6
  1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)<br>
7
  2)Type in the name of the column where the quotes are located<br>
8
- 3)Type in a list of tags separated by commas. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag eg. tag1, *Just Do It, tag3, etc. <br>
 
9
  This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
10
  Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
11
 
12
  READING THE OUTPUT(in the right-side column): <br>
 
13
  First table: All the responses are collated and displayed here, together with the original quotes.<br>
14
  Second table: Displays all the tags used and the number of occurrences of each tag. <br>
15
  You may also retrieve both tables in a single Excel file via the link below them.<br>
16
- Sometimes a quote might be tagged with a tag that you didn't give; in such cases the tag will be prefixed with ! in the second table.
17
 
18
  DISCLAIMER:<br>
19
  Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.
 
5
  HOW TO USE:<br>
6
  1)Upload a single sheet Excel file containing quotes in a column.(It is ok for the file to contain other data also)<br>
7
  2)Type in the name of the column where the quotes are located<br>
8
+ 3)Type in the names of any other columns which you wish to retain in the output, separated by commas <br>
9
+ 4)Type in a list of tags, each tag on a new line. For proper names/slogans/other tags that should be treated as an inseparable unit eg. Nike's "Just Do It", add a * in front of the tag, eg. enter *Just Do It on its own line. <br>
10
  This will ensure only quotes containing "Just Do It" exactly are tagged and not other quotes about doing other things.<br>
11
  Please allow 5-10 min for processing, especially if you are giving upwards of 100 quotes!<br>
12
 
13
  READING THE OUTPUT(in the right-side column): <br>
14
+ Progress bar: Indicates which quote is currently being processed <br>
15
  First table: All the responses are collated and displayed here, together with the original quotes.<br>
16
  Second table: Displays all the tags used and the number of occurrences of each tag. <br>
17
  You may also retrieve both tables in a single Excel file via the link below them.<br>
 
18
 
19
  DISCLAIMER:<br>
20
  Please bear in mind that the tags are AI generated so check your results to ensure they make sense before using them.