kenleeyx committed on
Commit
424ebdd
·
1 Parent(s): e0618e8

feat: Introduce threading for OpenAI API calls

Browse files

Implement concurrent processing for quote tagging to reduce
processing time.

Files changed (1) hide show
  1. app.py +46 -25
app.py CHANGED
@@ -11,6 +11,7 @@ import logging
11
  import time
12
  from datetime import datetime
13
  from typing import Generator
 
14
 
15
  logger = logging.getLogger()
16
  logger.setLevel(logging.INFO)
@@ -43,14 +44,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
43
 
44
  This function uses a GPT-based language model to analyze the input quote and determine
45
  the most relevant tags from the provided list. The response is parsed from the JSON
46
- output of the model and returned as a list of tags.
 
47
 
48
  Args:
49
  quote (str): The quote or text to be analyzed.
50
  tags_list (list): A list of potential tags to match against the quote.
51
 
52
  Returns:
53
- list: A list of tags that are relevant to the quote, as determined by the model.
54
  """
55
  logger.info(f"Tagging quote {quote}")
56
  response = client.chat.completions.create(
@@ -61,7 +63,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
61
  {"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)}
62
  ]
63
  )
64
- return json.loads(response.choices[0].message.content)['tags']
 
 
 
 
 
 
 
 
65
 
66
  def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
67
  """
@@ -105,24 +115,27 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
105
 
106
  This function reads an Excel file containing quotes, validates the column containing
107
  the quotes, and applies the `tag_quote` function to assign tags to each quote.
108
- The tags are derived from a user-provided comma-separated string.
109
 
110
  Args:
111
  quotes_file_path (str): Path to the Excel file containing the quotes.
112
  quotes_col_name (str): The name of the column containing the quotes.
113
- tags_string (str): A comma-separated string of potential tags.
 
114
 
115
- Returns:
116
- pd.DataFrame: A DataFrame with two columns:
 
 
117
  - The original column containing the quotes.
118
  - A new column 'Tags' with the tags assigned to each quote.
 
 
 
 
119
 
120
  Raises:
121
  gr.Error: If the specified column name does not exist or is not unique.
122
-
123
- Example:
124
- >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
125
- Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
126
  """
127
  tags_list = tags_string.split('\n')
128
  tags_list = [tag.strip() for tag in tags_list]
@@ -149,20 +162,28 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
149
  quotes_data = quotes_df[quotes_col_name]
150
 
151
  # Tag all the quotes one by one using tag_quote function
152
- tags_column = []
153
- for i, quote in enumerate(quotes_data):
154
- logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
155
- yield (f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}", None, None, None)
156
- tags = tag_quote(quote, tags_list)
157
- valid_tags = []
158
- for tag in tags: # filter out any hallucinated tags
159
- if tag in tags_list:
160
- valid_tags.append(tag)
161
- else:
162
- logger.warning(f"Invalid tag {tag} found and has been filtered out.")
163
- tags_column.append(valid_tags)
164
-
165
- quotes_df['Tags'] = tags_column
 
 
 
 
 
 
 
 
166
  logger.info("Quotes tagged")
167
 
168
  # Create hash table of tag occurrences using count_tags function
 
11
  import time
12
  from datetime import datetime
13
  from typing import Generator
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
 
16
  logger = logging.getLogger()
17
  logger.setLevel(logging.INFO)
 
44
 
45
  This function uses a GPT-based language model to analyze the input quote and determine
46
  the most relevant tags from the provided list. The response is parsed from the JSON
47
+ output of the model and returned as a list of tags. This list is filtered so that
48
+ only tags present in the input tags_list are kept.
49
 
50
  Args:
51
  quote (str): The quote or text to be analyzed.
52
  tags_list (list): A list of potential tags to match against the quote.
53
 
54
  Returns:
55
+ list: The tags the model assigned to the quote, restricted to those present in tags_list.
56
  """
57
  logger.info(f"Tagging quote {quote}")
58
  response = client.chat.completions.create(
 
63
  {"role": "user", "content": PROMPT.format(tags_list=tags_list, quote=quote)}
64
  ]
65
  )
66
+
67
+ tags = json.loads(response.choices[0].message.content)['tags']
68
+ valid_tags = []
69
+ for tag in tags: # filter out any hallucinated tags
70
+ if tag in tags_list:
71
+ valid_tags.append(tag)
72
+ else:
73
+ logger.warning(f"Invalid tag {tag} found and has been filtered out.")
74
+ return valid_tags
75
 
76
  def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
77
  """
 
115
 
116
  This function reads an Excel file containing quotes, validates the column containing
117
  the quotes, and applies the `tag_quote` function to assign tags to each quote.
118
+ The tags are derived from a user-provided newline-separated string.
119
 
120
  Args:
121
  quotes_file_path (str): Path to the Excel file containing the quotes.
122
  quotes_col_name (str): The name of the column containing the quotes.
123
+ retained_columns (str): The names of the columns in the Excel file which are to be added to the output file.
124
+ tags_string (str): A newline-separated string of potential tags.
125
 
126
+ Yields:
127
+ tuple: A 4-element tuple containing:
128
+ - str: A progress indicator (or "Not running" if tagging is complete)
129
+ - pd.DataFrame: A DataFrame with two columns (or None if tagging is incomplete):
130
  - The original column containing the quotes.
131
  - A new column 'Tags' with the tags assigned to each quote.
132
+ - pd.DataFrame: A DataFrame with two columns (or None if tagging is incomplete):
133
+ - "Tag" - The list of tags that was passed in.
134
+ - "Count" - The total number of times each tag was used in tagging all the quotes.
135
+ - str: A path to an Excel file containing sheets derived from the previous 2 DataFrames. (or None if tagging is incomplete)
136
 
137
  Raises:
138
  gr.Error: If the specified column name does not exist or is not unique.
 
 
 
 
139
  """
140
  tags_list = tags_string.split('\n')
141
  tags_list = [tag.strip() for tag in tags_list]
 
162
  quotes_data = quotes_df[quotes_col_name]
163
 
164
  # Tag all the quotes concurrently using the tag_quote function
165
+ tags_results = [None]*len(quotes_data)
166
+
167
+ # Run tag_quote on a pool of max_workers threads, so up to max_workers requests are sent to the LLM concurrently.
168
+ with ThreadPoolExecutor(max_workers=5) as executor:
169
+ # Generate futures for each of the quotes and map them to the quote indices
170
+ future_to_index = {
171
+ executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
172
+ }
173
+ # Enumerate the futures in completion order, which may differ from submission order
174
+ # This step waits for the tag_quote functions to complete
175
+ for completed, future in enumerate(as_completed(future_to_index), 1):
176
+ # Retrieve index of the completed future from above map
177
+ i = future_to_index[future]
178
+ # Insert the result of the completed future into the results list at its quote's original position
179
+ try:
180
+ tags_results[i] = future.result()
181
+ except Exception as e:
182
+ tags_results[i] = f"Error:{e}"
183
+ # Update UI by yielding a status update
184
+ yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
185
+
186
+ quotes_df['Tags'] = tags_results
187
  logger.info("Quotes tagged")
188
 
189
  # Create hash table of tag occurrences using count_tags function