Spaces:
Sleeping
Sleeping
feat: Introduce threading for OpenAI API calls
Browse files
Implement concurrent processing of quote tagging to reduce
processing time.
app.py
CHANGED
|
@@ -11,6 +11,7 @@ import logging
|
|
| 11 |
import time
|
| 12 |
from datetime import datetime
|
| 13 |
from typing import Generator
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger()
|
| 16 |
logger.setLevel(logging.INFO)
|
|
@@ -43,14 +44,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
|
|
| 43 |
|
| 44 |
This function uses a GPT-based language model to analyze the input quote and determine
|
| 45 |
the most relevant tags from the provided list. The response is parsed from the JSON
|
| 46 |
-
output of the model and returned as a list of tags.
|
|
|
|
| 47 |
|
| 48 |
Args:
|
| 49 |
quote (str): The quote or text to be analyzed.
|
| 50 |
tags_list (list): A list of potential tags to match against the quote.
|
| 51 |
|
| 52 |
Returns:
|
| 53 |
-
|
| 54 |
"""
|
| 55 |
logger.info(f"Tagging quote {quote}")
|
| 56 |
response = client.chat.completions.create(
|
|
@@ -61,7 +63,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
|
|
| 61 |
{"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)}
|
| 62 |
]
|
| 63 |
)
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
|
| 67 |
"""
|
|
@@ -105,24 +115,27 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
|
|
| 105 |
|
| 106 |
This function reads an Excel file containing quotes, validates the column containing
|
| 107 |
the quotes, and applies the `tag_quote` function to assign tags to each quote.
|
| 108 |
-
The tags are derived from a user-provided
|
| 109 |
|
| 110 |
Args:
|
| 111 |
quotes_file_path (str): Path to the Excel file containing the quotes.
|
| 112 |
quotes_col_name (str): The name of the column containing the quotes.
|
| 113 |
-
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
- The original column containing the quotes.
|
| 118 |
- A new column 'Tags' with the tags assigned to each quote.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
Raises:
|
| 121 |
gr.Error: If the specified column name does not exist or is not unique.
|
| 122 |
-
|
| 123 |
-
Example:
|
| 124 |
-
>>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
|
| 125 |
-
Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
|
| 126 |
"""
|
| 127 |
tags_list = tags_string.split('\n')
|
| 128 |
tags_list = [tag.strip() for tag in tags_list]
|
|
@@ -149,20 +162,28 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
|
|
| 149 |
quotes_data = quotes_df[quotes_col_name]
|
| 150 |
|
| 151 |
# Tag all the quotes one by one using tag_quote function
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
logger.info("Quotes tagged")
|
| 167 |
|
| 168 |
# Create hash table of tag occurrences using count_tags function
|
|
|
|
| 11 |
import time
|
| 12 |
from datetime import datetime
|
| 13 |
from typing import Generator
|
| 14 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
|
| 16 |
logger = logging.getLogger()
|
| 17 |
logger.setLevel(logging.INFO)
|
|
|
|
| 44 |
|
| 45 |
This function uses a GPT-based language model to analyze the input quote and determine
|
| 46 |
the most relevant tags from the provided list. The response is parsed from the JSON
|
| 47 |
+
output of the model and returned as a list of tags. This list is checked to ensure
|
| 48 |
+
all returned tags are taken from the input tags_list.
|
| 49 |
|
| 50 |
Args:
|
| 51 |
quote (str): The quote or text to be analyzed.
|
| 52 |
tags_list (list): A list of potential tags to match against the quote.
|
| 53 |
|
| 54 |
Returns:
|
| 55 |
+
valid_tags: A list of tags that are relevant to the quote, as determined by the model.
|
| 56 |
"""
|
| 57 |
logger.info(f"Tagging quote {quote}")
|
| 58 |
response = client.chat.completions.create(
|
|
|
|
| 63 |
{"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)}
|
| 64 |
]
|
| 65 |
)
|
| 66 |
+
|
| 67 |
+
tags = json.loads(response.choices[0].message.content)['tags']
|
| 68 |
+
valid_tags = []
|
| 69 |
+
for tag in tags: # filter out any hallucinated tags
|
| 70 |
+
if tag in tags_list:
|
| 71 |
+
valid_tags.append(tag)
|
| 72 |
+
else:
|
| 73 |
+
logger.warning(f"Invalid tag {tag} found and has been filtered out.")
|
| 74 |
+
return valid_tags
|
| 75 |
|
| 76 |
def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
|
| 77 |
"""
|
|
|
|
| 115 |
|
| 116 |
This function reads an Excel file containing quotes, validates the column containing
|
| 117 |
the quotes, and applies the `tag_quote` function to assign tags to each quote.
|
| 118 |
+
The tags are derived from a user-provided newline-separated string.
|
| 119 |
|
| 120 |
Args:
|
| 121 |
quotes_file_path (str): Path to the Excel file containing the quotes.
|
| 122 |
quotes_col_name (str): The name of the column containing the quotes.
|
| 123 |
+
retained_columns (str): The names of the columns in the Excel file which are to be added to the output file.
|
| 124 |
+
tags_string (str): A newline-separated string of potential tags.
|
| 125 |
|
| 126 |
+
Yields:
|
| 127 |
+
tuple: A 4-element tuple containing:
|
| 128 |
+
- str: A progress indicator (or "Not running" if tagging is complete)
|
| 129 |
+
- pd.DataFrame (or None if tagging is incomplete): A DataFrame with two columns:
|
| 130 |
- The original column containing the quotes.
|
| 131 |
- A new column 'Tags' with the tags assigned to each quote.
|
| 132 |
+
- pd.DataFrame (or None if tagging is incomplete): A DataFrame with two columns:
|
| 133 |
+
-"Tag" - The list of tags that was passed in.
|
| 134 |
+
-"Count" - The total number of times each tag was used in tagging all the quotes.
|
| 135 |
+
- str (or None if tagging is incomplete): A path to an Excel file containing sheets derived from the previous 2 DataFrames.
|
| 136 |
|
| 137 |
Raises:
|
| 138 |
gr.Error: If the specified column name does not exist or is not unique.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
"""
|
| 140 |
tags_list = tags_string.split('\n')
|
| 141 |
tags_list = [tag.strip() for tag in tags_list]
|
|
|
|
| 162 |
quotes_data = quotes_df[quotes_col_name]
|
| 163 |
|
| 164 |
# Tag all the quotes one by one using tag_quote function
|
| 165 |
+
tags_results = [None]*len(quotes_data)
|
| 166 |
+
|
| 167 |
+
# Threaded execution of tag_quote with {max_workers} threads: we send up to {max_workers} requests to the LLM concurrently.
|
| 168 |
+
with ThreadPoolExecutor(max_workers=5) as executor:
|
| 169 |
+
# Generate futures for each of the quotes and map them to the quote indices
|
| 170 |
+
future_to_index = {
|
| 171 |
+
executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
|
| 172 |
+
}
|
| 173 |
+
# Enumerate the completed futures (in completion order, which may differ from submission order)
|
| 174 |
+
# This step waits for the tag_quote functions to complete
|
| 175 |
+
for completed, future in enumerate(as_completed(future_to_index), 1):
|
| 176 |
+
# Retrieve index of the completed future from above map
|
| 177 |
+
i = future_to_index[future]
|
| 178 |
+
# Insert the result of the completed future into the results list at its quote's original position
|
| 179 |
+
try:
|
| 180 |
+
tags_results[i] = future.result()
|
| 181 |
+
except Exception as e:
|
| 182 |
+
tags_results[i] = f"Error:{e}"
|
| 183 |
+
# Update UI by yielding a status update
|
| 184 |
+
yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
|
| 185 |
+
|
| 186 |
+
quotes_df['Tags'] = tags_results
|
| 187 |
logger.info("Quotes tagged")
|
| 188 |
|
| 189 |
# Create hash table of tag occurrences using count_tags function
|