kenleeyx committed on
Commit
ea8f271
·
1 Parent(s): 424ebdd

feat: add random quote retrieval for each tag

Browse files

-Also add one-hot encoding for tags
-Fix bug where app throws error when retained_columns is empty
-Remove max_workers cap from ThreadPoolExecutor after determining that its
impact on the rate limit is insignificant

Files changed (1) hide show
  1. app.py +21 -6
app.py CHANGED
@@ -9,6 +9,7 @@ from dotenv import load_dotenv # For loading environment variables in local envi
9
  from collections import Counter # For tabulating tag occurrences
10
  import logging
11
  import time
 
12
  from datetime import datetime
13
  from typing import Generator
14
  from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -140,8 +141,11 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
140
  tags_list = tags_string.split('\n')
141
  tags_list = [tag.strip() for tag in tags_list]
142
 
143
- retained_cols_list = retained_columns.split(',')
144
- retained_cols_list = [colname.strip() for colname in retained_cols_list]
 
 
 
145
 
146
  # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
147
  # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
@@ -165,7 +169,7 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
165
  tags_results = [None]*len(quotes_data)
166
 
167
  # Threading execution of tag_quotes with {max_workers} threads: we send {max_workers} requests to the LLM concurrently.
168
- with ThreadPoolExecutor(max_workers=5) as executor:
169
  # Generate futures for each of the quotes and map them to the quote indices
170
  future_to_index = {
171
  executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
@@ -184,17 +188,28 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
184
  yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
185
 
186
  quotes_df['Tags'] = tags_results
 
 
 
 
187
  logger.info("Quotes tagged")
188
 
189
  # Create hash table of tag occurrences using count_tags function
190
  tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
191
- logger.info("Tags counted")
192
-
 
 
 
 
 
 
 
193
  #Convert values in tags column from list to str
194
  quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
195
 
196
  # Return only the quotes column, the new tags column, and any other specified cols to retain
197
- output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']]
198
  output_file_path = "output.xlsx"
199
  with pd.ExcelWriter(output_file_path) as writer:
200
  output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
 
9
  from collections import Counter # For tabulating tag occurrences
10
  import logging
11
  import time
12
+ import random
13
  from datetime import datetime
14
  from typing import Generator
15
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
141
  tags_list = tags_string.split('\n')
142
  tags_list = [tag.strip() for tag in tags_list]
143
 
144
+ if retained_columns:
145
+ retained_cols_list = retained_columns.split(',')
146
+ retained_cols_list = [colname.strip() for colname in retained_cols_list]
147
+ else:
148
+ retained_cols_list = []
149
 
150
  # Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
151
  # pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
 
169
  tags_results = [None]*len(quotes_data)
170
 
171
  # Threading execution of tag_quotes with {max_workers} threads: we send {max_workers} requests to the LLM concurrently.
172
+ with ThreadPoolExecutor() as executor:
173
  # Generate futures for each of the quotes and map them to the quote indices
174
  future_to_index = {
175
  executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
 
188
  yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
189
 
190
  quotes_df['Tags'] = tags_results
191
+
192
+ # One hot encoding of tagged tags
193
+ for tag in tags_list:
194
+ quotes_df[tag]=quotes_df['Tags'].apply(lambda quote_tags: int(tag in quote_tags))
195
  logger.info("Quotes tagged")
196
 
197
  # Create hash table of tag occurrences using count_tags function
198
  tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
199
+
200
+ # Retrieve 2 quotes at random for each tag and put them in the tags counter DF
201
+ for tag in tags_counter_df['Tag']:
202
+ tagged_quotes_list = quotes_df.loc[quotes_df[tag]==1, quotes_col_name].tolist()
203
+ sample_quotes = random.sample(tagged_quotes_list, min(2, len(tagged_quotes_list)))
204
+ while len(sample_quotes) < 2:
205
+ sample_quotes.append(None)
206
+ [tags_counter_df.loc[tags_counter_df['Tag'] == tag, 'Quote 1'], tags_counter_df.loc[tags_counter_df['Tag'] == tag, 'Quote 2']] = sample_quotes
207
+
208
  #Convert values in tags column from list to str
209
  quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
210
 
211
  # Return only the quotes column, the new tags column, the one-hot tag columns, and any other specified cols to retain
212
+ output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']+tags_list]
213
  output_file_path = "output.xlsx"
214
  with pd.ExcelWriter(output_file_path) as writer:
215
  output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)