Spaces:
Sleeping
Sleeping
feat: add random quote retrieval for each tag
Browse files
- Also add one-hot encoding for tags
- Fix bug where the app throws an error when retained_columns is empty
- Remove max_workers limit from ThreadPoolExecutor after determining that its impact on the rate limit is insignificant
app.py
CHANGED
|
@@ -9,6 +9,7 @@ from dotenv import load_dotenv # For loading environment variables in local envi
|
|
| 9 |
from collections import Counter # For tabulating tag occurrences
|
| 10 |
import logging
|
| 11 |
import time
|
|
|
|
| 12 |
from datetime import datetime
|
| 13 |
from typing import Generator
|
| 14 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
@@ -140,8 +141,11 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
|
|
| 140 |
tags_list = tags_string.split('\n')
|
| 141 |
tags_list = [tag.strip() for tag in tags_list]
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
# Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
|
| 147 |
# pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
|
|
@@ -165,7 +169,7 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
|
|
| 165 |
tags_results = [None]*len(quotes_data)
|
| 166 |
|
| 167 |
# Threading execution of tag_quotes with {max_workers} threads: we send {max_workers} requests to the LLM concurrently.
|
| 168 |
-
with ThreadPoolExecutor(
|
| 169 |
# Generate futures for each of the quotes and map them to the quote indices
|
| 170 |
future_to_index = {
|
| 171 |
executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
|
|
@@ -184,17 +188,28 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
|
|
| 184 |
yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
|
| 185 |
|
| 186 |
quotes_df['Tags'] = tags_results
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
logger.info("Quotes tagged")
|
| 188 |
|
| 189 |
# Create hash table of tag occurrences using count_tags function
|
| 190 |
tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
#Convert values in tags column from list to str
|
| 194 |
quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
|
| 195 |
|
| 196 |
# Return only the quotes column, the new tags column, and any other specified cols to retain
|
| 197 |
-
output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']]
|
| 198 |
output_file_path = "output.xlsx"
|
| 199 |
with pd.ExcelWriter(output_file_path) as writer:
|
| 200 |
output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
|
|
|
|
| 9 |
from collections import Counter # For tabulating tag occurrences
|
| 10 |
import logging
|
| 11 |
import time
|
| 12 |
+
import random
|
| 13 |
from datetime import datetime
|
| 14 |
from typing import Generator
|
| 15 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
| 141 |
tags_list = tags_string.split('\n')
|
| 142 |
tags_list = [tag.strip() for tag in tags_list]
|
| 143 |
|
| 144 |
+
if retained_columns:
|
| 145 |
+
retained_cols_list = retained_columns.split(',')
|
| 146 |
+
retained_cols_list = [colname.strip() for colname in retained_cols_list]
|
| 147 |
+
else:
|
| 148 |
+
retained_cols_list = []
|
| 149 |
|
| 150 |
# Transfer quotes data from Excel file into pandas DataFrame, handling potential duplicate column names in the Excel file
|
| 151 |
# pd.read_excel will rename duplicates eg foo -> foo.1, causing a mismatch between quotes_col_name and the actual column name
|
|
|
|
| 169 |
tags_results = [None]*len(quotes_data)
|
| 170 |
|
| 171 |
# Threading execution of tag_quotes with {max_workers} threads: we send {max_workers} requests to the LLM concurrently.
|
| 172 |
+
with ThreadPoolExecutor() as executor:
|
| 173 |
# Generate futures for each of the quotes and map them to the quote indices
|
| 174 |
future_to_index = {
|
| 175 |
executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
|
|
|
|
| 188 |
yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
|
| 189 |
|
| 190 |
quotes_df['Tags'] = tags_results
|
| 191 |
+
|
| 192 |
+
# One hot encoding of tagged tags
|
| 193 |
+
for tag in tags_list:
|
| 194 |
+
quotes_df[tag]=quotes_df['Tags'].apply(lambda quote_tags: int(tag in quote_tags))
|
| 195 |
logger.info("Quotes tagged")
|
| 196 |
|
| 197 |
# Create hash table of tag occurrences using count_tags function
|
| 198 |
tags_counter_df = count_tags(tags_list, quotes_df['Tags'])
|
| 199 |
+
|
| 200 |
+
# Retrieve 2 quotes at random for each tag and put them in the tags counter DF
|
| 201 |
+
for tag in tags_counter_df['Tag']:
|
| 202 |
+
tagged_quotes_list = quotes_df.loc[quotes_df[tag]==1, quotes_col_name].tolist()
|
| 203 |
+
sample_quotes = random.sample(tagged_quotes_list, min(2, len(tagged_quotes_list)))
|
| 204 |
+
while len(sample_quotes) < 2:
|
| 205 |
+
sample_quotes.append(None)
|
| 206 |
+
[tags_counter_df.loc[tags_counter_df['Tag'] == tag, 'Quote 1'], tags_counter_df.loc[tags_counter_df['Tag'] == tag, 'Quote 2']] = sample_quotes
|
| 207 |
+
|
| 208 |
#Convert values in tags column from list to str
|
| 209 |
quotes_df['Tags'] = quotes_df["Tags"].apply(lambda x: ", ".join(x))
|
| 210 |
|
| 211 |
# Return only the quotes column, the new tags column, and any other specified cols to retain
|
| 212 |
+
output_df = quotes_df[retained_cols_list+[quotes_col_name, 'Tags']+tags_list]
|
| 213 |
output_file_path = "output.xlsx"
|
| 214 |
with pd.ExcelWriter(output_file_path) as writer:
|
| 215 |
output_df.to_excel(writer, sheet_name='Coded Quotes', index=False)
|