Spaces:

Perceptechai
/

code-tagging

Sleeping

App Files Community

kenleeyx committed on May 30, 2025

Commit

424ebdd

1 Parent(s): e0618e8

feat: Introduce threading for OpenAI API calls

Browse files

Implement concurrent processing of quote tagging to reduce overall
processing time.

Files changed (1) hide show

app.py +46 -25

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import logging
 import time
 from datetime import datetime
 from typing import Generator
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
@@ -43,14 +44,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
     This function uses a GPT-based language model to analyze the input quote and determine
     the most relevant tags from the provided list. The response is parsed from the JSON
-    output of the model and returned as a list of tags.
     Args:
         quote (str): The quote or text to be analyzed.
         tags_list (list): A list of potential tags to match against the quote.
     Returns:
-        list: A list of tags that are relevant to the quote, as determined by the model.
     """
     logger.info(f"Tagging quote {quote}")
     response = client.chat.completions.create(
@@ -61,7 +63,15 @@ def tag_quote(quote: str, tags_list: list) -> list:
             {"role": "user", "content": PROMPT.format(tags_list=tags_list,  quote=quote)}
         ]
     )
-    return json.loads(response.choices[0].message.content)['tags']
 def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
     """
@@ -105,24 +115,27 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
     This function reads an Excel file containing quotes, validates the column containing
     the quotes, and applies the `tag_quote` function to assign tags to each quote.
-    The tags are derived from a user-provided comma-separated string.
     Args:
         quotes_file_path (str): Path to the Excel file containing the quotes.
         quotes_col_name (str): The name of the column containing the quotes.
-        tags_string (str): A comma-separated string of potential tags.
-    Returns:
-        pd.DataFrame: A DataFrame with two columns:
             - The original column containing the quotes.
             - A new column 'Tags' with the tags assigned to each quote.
     Raises:
         gr.Error: If the specified column name does not exist or is not unique.
-    Example:
-        >>> process_quotes("quotes.xlsx", "Quote", "inspirational, funny, motivational")
-        Outputs a DataFrame with 'Quote' and 'Tags' columns indicating which tags are assigned to which quotes.
     """
     tags_list = tags_string.split('\n')
     tags_list = [tag.strip() for tag in tags_list]
@@ -149,20 +162,28 @@ def process_quotes(quotes_file_path: str, quotes_col_name: str, retained_columns
     quotes_data = quotes_df[quotes_col_name]
     # Tag all the quotes one by one using tag_quote function
-    tags_column = []
-    for i, quote in enumerate(quotes_data):
-        logger.info(f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}")
-        yield (f"Tagging quote {i + 1}/{len(quotes_data)}: {quote}", None, None, None)
-        tags = tag_quote(quote, tags_list)
-        valid_tags = []
-        for tag in tags: # filter out any hallucinated tags
-            if tag in tags_list:
-                valid_tags.append(tag)
-            else:
-                logger.warning(f"Invalid tag {tag} found and has been filtered out.")
-        tags_column.append(valid_tags)
-    quotes_df['Tags'] = tags_column
     logger.info("Quotes tagged")
     # Create hash table of tag occurrences using count_tags function

 import time
 from datetime import datetime
 from typing import Generator
+from concurrent.futures import ThreadPoolExecutor, as_completed
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
     This function uses a GPT-based language model to analyze the input quote and determine
     the most relevant tags from the provided list. The response is parsed from the JSON
+    output of the model and returned as a list of tags. This list is checked to ensure
+    all tags tagged are taken from the input tags_list.
     Args:
         quote (str): The quote or text to be analyzed.
         tags_list (list): A list of potential tags to match against the quote.
     Returns:
+        valid_tags: A list of tags that are relevant to the quote, as determined by the model.
     """
     logger.info(f"Tagging quote {quote}")
     response = client.chat.completions.create(
             {"role": "user", "content": PROMPT.format(tags_list=tags_list,  quote=quote)}
         ]
     )
+    tags = json.loads(response.choices[0].message.content)['tags']
+    valid_tags = []
+    for tag in tags: # filter out any hallucinated tags
+        if tag in tags_list:
+            valid_tags.append(tag)
+        else:
+            logger.warning(f"Invalid tag {tag} found and has been filtered out.")
+    return valid_tags
 def count_tags(tags_list: list, tags_col: pd.Series )->pd.DataFrame:
     """
     This function reads an Excel file containing quotes, validates the column containing
     the quotes, and applies the `tag_quote` function to assign tags to each quote.
+    The tags are derived from a user-provided newline-separated string.
     Args:
         quotes_file_path (str): Path to the Excel file containing the quotes.
         quotes_col_name (str): The name of the column containing the quotes.
+        retained_columns (str): The names of the columns in the Excel file which are to be added to the output file.
+        tags_string (str): A newline-separated string of potential tags.
+    Yields:
+        tuple: A 4-element tuple containing:
+        - str: A progress indicator (or "Not running" if tagging is complete)
+        - pd.DataFrame: A DataFrame with two columns: (or None if tagging is incomplete)
             - The original column containing the quotes.
             - A new column 'Tags' with the tags assigned to each quote.
+        - pd.DataFrame: A DataFrame with two columns: (or None if tagging is incomplete)
+            -"Tag" - The list of tags that was passed in.
+            -"Count" - The total number of times each tag was used in tagging all the quotes.
+        - str: A path to an Excel file containing sheets derived from the previous 2 DataFrames. (or None if tagging is incomplete)
     Raises:
         gr.Error: If the specified column name does not exist or is not unique.
     """
     tags_list = tags_string.split('\n')
     tags_list = [tag.strip() for tag in tags_list]
     quotes_data = quotes_df[quotes_col_name]
     # Tag all the quotes one by one using tag_quote function
+    tags_results = [None]*len(quotes_data)
+    # Threading execution of tag_quotes with {max_workers} threads: we send {max_workers} requests to the LLM concurrently.
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        # Generate futures for each of the quotes and map them to the quote indices
+        future_to_index = {
+            executor.submit(tag_quote, quote, tags_list): i for i, quote in enumerate(quotes_data)
+        }
+        # Enumerate the completed futures(ordered as completed which may be different from submitted order)
+        # This step waits for the tag_quote functions to complete
+        for completed, future in enumerate(as_completed(future_to_index), 1):
+            # Retrieve index of the completed future from above map
+            i = future_to_index[future]
+            # Insert the result of the completed future into the results list at its quote's original position
+            try:
+                tags_results[i] = future.result()
+            except Exception as e:
+                tags_results[i] = f"Error:{e}"
+            # Update UI by yielding a status update
+            yield (f"Tagged {completed}/{len(quotes_data)} quotes: {quotes_data[i]}", None, None, None)
+    quotes_df['Tags'] = tags_results
     logger.info("Quotes tagged")
     # Create hash table of tag occurrences using count_tags function