Update worklog_categorizer.py
worklog_categorizer.py  CHANGED  +368 -368
The diff touches all 368 lines (likely a whitespace or line-ending rewrite), but the only content change is on line 18, where the Gemini API key lookup gains a hardcoded fallback value:

@@ -16,4 +16,4 @@
 # Initialize Gemini API
 try:
-    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+    genai.configure(api_key=os.getenv("GEMINI_API_KEY","AIzaSyCunB1oTkxl7IINRMgQTVqIXKcFYw0Jqow"))
     model = genai.GenerativeModel("gemini-1.5-flash")
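If the fallback key is only meant for local development, a stricter alternative (a sketch, not part of this commit) is to fail fast when the variable is missing instead of shipping the key in source:

import os
import google.generativeai as genai

# Sketch of a fail-fast configuration; the error message is illustrative.
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("GEMINI_API_KEY environment variable is not set")
genai.configure(api_key=api_key)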
The full file after the change:

import os
import logging
import google.generativeai as genai
from functools import lru_cache
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd
from pathlib import Path
import time
from tqdm import tqdm
import re

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize Gemini API
try:
    genai.configure(api_key=os.getenv("GEMINI_API_KEY","AIzaSyCunB1oTkxl7IINRMgQTVqIXKcFYw0Jqow"))
    model = genai.GenerativeModel("gemini-1.5-flash")
    logger.info("Gemini API initialized successfully")
except Exception as e:
    logger.error(f"Error initializing Gemini API: {e}")
    model = None

# Prompt for worklog categorization - modified for batch processing
BATCH_CATEGORIZATION_PROMPT = """
You are a technology skill categorizer. Analyze each worklog entry and assign a single technology category word that best represents the technical skill or technology involved.

Guidelines:
1. Respond with ONLY a single word (or hyphenated term if necessary) for each worklog
2. Focus on the core technology, framework, or skill
3. Be specific when the technology is clear (e.g., "React", "Python", "AWS")
4. Use broader categories when specific technology isn't clear (e.g., "Frontend", "Backend", "DevOps")
5. Prefer standard technology names over abbreviations
6. Don't include unnecessary adjectives or descriptions
7. Respond in a numbered list format matching the input worklogs

Examples:
Worklog 1: "fixing issue in next js application" → "NextJS"
Worklog 2: "Task issue fixing - next js application" → "NextJS"
Worklog 3: "Debugging Python script for data analysis" → "Python"
Worklog 4: "Creating responsive CSS layout" → "CSS"
Worklog 5: "Implementing REST API endpoints" → "Backend"

Here are the worklogs to categorize:
{worklogs}

For each worklog, respond with a numbered list containing only the category word for each entry:
1. [category for worklog 1]
2. [category for worklog 2]
...and so on
"""

def is_upskilling_issue(issue_text):
    """
    Check if an issue is related to upskilling using regex to match various formats.

    Args:
        issue_text: The issue text to check

    Returns:
        Boolean indicating if this is an upskilling issue
    """
    if not issue_text or not isinstance(issue_text, str):
        return False

    # Case insensitive search for "upskill" with potential variations
    # This will match: Upskilling, upskill, UPSKILLING, Up-skilling, Up skilling, etc.
    pattern = re.compile(r'up[-\s]?skill', re.IGNORECASE)
    return bool(pattern.search(issue_text))

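# Illustrative checks of the pattern above (not part of the original commit):
#   is_upskilling_issue("Upskilling: React basics")  -> True
#   is_upskilling_issue("Up-skilling session")       -> True
#   is_upskilling_issue("up skilling on AWS")        -> True
#   is_upskilling_issue("Bug fix in billing module") -> False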
def estimate_token_count(text: str) -> int:
    """
    Estimate token count for a given text string.

    This is an approximation based on GPT tokenization patterns:
    - Average of ~4 characters per token for English text
    - Spaces count as tokens
    - Special characters typically count as their own tokens

    Args:
        text: The text to estimate token count for

    Returns:
        Estimated token count
    """
    if not text:
        return 0

    # Count words (splitting by whitespace)
    words = len(text.split())

    # Count characters
    chars = len(text)

    # Count special characters (punctuation, etc.), which tend to tokenize separately
    special_chars = len(re.findall(r'[^\w\s]', text))

    # Estimate based on a combination of factors
    # This formula is approximate and can be adjusted based on testing
    estimated_tokens = max(words, int(chars / 4) + special_chars)

    return estimated_tokens

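# Worked example for the estimate above (illustrative, not part of the commit):
# text = "Hello, world!" gives words = 2, chars = 13, special_chars = 2
# (the comma and the exclamation mark), so
# estimated_tokens = max(2, int(13 / 4) + 2) = max(2, 5) = 5.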
def categorize_worklog_batch(worklogs: List[str]) -> List[str]:
    """
    Categorize multiple worklog entries with a single API call.

    Args:
        worklogs: List of worklog texts to categorize

    Returns:
        List of categories corresponding to each worklog
    """
    if not worklogs or model is None:
        return ["Unknown"] * len(worklogs)

    # Format worklogs as a numbered list for the prompt
    formatted_worklogs = "\n".join([f"{i+1}. {worklog}" for i, worklog in enumerate(worklogs)])
    prompt = BATCH_CATEGORIZATION_PROMPT.format(worklogs=formatted_worklogs)

    # Estimate token usage
    worklogs_token_count = sum(estimate_token_count(w) for w in worklogs)
    prompt_token_count = estimate_token_count(prompt)
    total_tokens = prompt_token_count

    logger.info(f"Sending batch with {len(worklogs)} worklogs (~{worklogs_token_count} worklog tokens, ~{total_tokens} total tokens)")

    try:
        response = model.generate_content(prompt)
        response_text = response.text.strip()

        logger.info(f"Response received: {response_text}")

        # Parse numbered response - looking for patterns like "1. Python", "2. JavaScript", etc.
        categories = []

        # First, try to match numbered lines (1. Category)
        number_pattern = re.compile(r'^\s*(\d+)\.\s*(.+?)$', re.MULTILINE)
        matches = number_pattern.findall(response_text)

        if matches:
            # Sort by the number to maintain order
            sorted_matches = sorted(matches, key=lambda x: int(x[0]))
            categories = [match[1].strip() for match in sorted_matches]
        else:
            # Fallback: try to split by lines
            lines = [line.strip() for line in response_text.split('\n') if line.strip()]
            categories = [line.split('.')[-1].strip() if '.' in line else line for line in lines]

        # Ensure we have the right number of categories
        if len(categories) != len(worklogs):
            logger.warning(f"Mismatch between number of worklogs ({len(worklogs)}) and categories ({len(categories)})")

            # Pad with "Unknown" if we have too few categories
            if len(categories) < len(worklogs):
                categories.extend(["Unknown"] * (len(worklogs) - len(categories)))
            # Truncate if we have too many categories
            else:
                categories = categories[:len(worklogs)]

        # Ensure each category is a single word
        for i, category in enumerate(categories):
            if len(category.split()) > 1 and "-" not in category:
                logger.warning(f"Response '{category}' contains multiple words, taking first word")
                categories[i] = category.split()[0]

        # Log the results for verification
        for i, (worklog, category) in enumerate(zip(worklogs, categories)):
            logger.info(f"Worklog {i+1}: '{worklog[:50]}{'...' if len(worklog) > 50 else ''}' → '{category}'")

        return categories
    except Exception as e:
        logger.error(f"Error categorizing worklog batch: {e}")
        return ["Unknown"] * len(worklogs)

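# Parsing example (illustrative, not part of the original commit): a well-formed
# reply such as "1. Python\n2. NextJS\n3. Backend" is matched by number_pattern
# as [('1', 'Python'), ('2', 'NextJS'), ('3', 'Backend')] and becomes
# ['Python', 'NextJS', 'Backend'] after sorting by number and stripping.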
def batch_process_worklogs(worklogs: List[str], batch_size: int = 10,
                           pause_seconds: int = 5, show_progress: bool = True) -> List[str]:
    """
    Process multiple worklog entries in batches with pauses to avoid rate limits.
    By default, sends 10 worklogs per API call with a 5-second rest between batches.

    Args:
        worklogs: List of worklog texts to categorize
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)
        show_progress: Whether to show a progress bar

    Returns:
        List of categories corresponding to each worklog
    """
    results = []
    total_worklogs = len(worklogs)

    # Create batches
    batches = [worklogs[i:i + batch_size] for i in range(0, total_worklogs, batch_size)]

    # Process each batch with progress indication
    progress_bar = tqdm(total=total_worklogs, desc="Categorizing worklogs") if show_progress else None

    for i, batch in enumerate(batches):
        # Process current batch
        logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} worklogs")
        batch_results = categorize_worklog_batch(batch)
        results.extend(batch_results)

        # Update progress
        if progress_bar:
            progress_bar.update(len(batch))

        # Pause between batches (except after the last batch)
        if i < len(batches) - 1 and pause_seconds > 0:
            logger.info(f"Pausing for {pause_seconds}s before next batch. Processed {len(results)}/{total_worklogs} worklogs")
            if show_progress:
                for s in range(pause_seconds):
                    progress_bar.set_description(f"Waiting {pause_seconds-s}s before next batch")
                    time.sleep(1)
                progress_bar.set_description("Categorizing worklogs")
            else:
                time.sleep(pause_seconds)

    if progress_bar:
        progress_bar.close()

    logger.info(f"Completed processing {total_worklogs} worklogs")
    return results

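# Batching example (illustrative, not part of the original commit): with 23
# worklogs and batch_size=10, the slicing above yields batches of 10, 10 and 3
# entries, with a pause after the first two batches but not after the last.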
def process_dataframe(df: pd.DataFrame, worklog_column: str = "Worklog",
                      issue_column: str = "Issue", default_category: str = "N/A",
                      batch_size: int = 10, pause_seconds: int = 5,
                      show_progress: bool = True) -> pd.DataFrame:
    """
    Add a new column with technology categories to a dataframe.
    Only categorizes worklogs associated with upskilling issues.
    Processes 10 worklogs at a time with 5-second pauses between batches.

    Args:
        df: Pandas DataFrame containing worklog data
        worklog_column: Name of the column containing worklog text
        issue_column: Name of the column containing issue text
        default_category: Default value for non-upskilling worklogs
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)
        show_progress: Whether to show a progress bar

    Returns:
        DataFrame with an additional 'TechCategory' column
    """
    # Initialize TechCategory column with default value
    df["TechCategory"] = default_category

    # Check if required columns exist
    if worklog_column not in df.columns:
        logger.error(f"Column '{worklog_column}' not found in DataFrame")
        return df

    if issue_column not in df.columns:
        logger.error(f"Column '{issue_column}' not found in DataFrame")
        return df

    # Filter for upskilling issues
    upskilling_mask = df[issue_column].apply(is_upskilling_issue)
    upskilling_rows = df[upskilling_mask].copy()

    logger.info(f"Found {len(upskilling_rows)} rows with upskilling issues out of {len(df)} total rows")

    if upskilling_rows.empty:
        logger.info("No upskilling issues found, returning dataframe with default category values")
        return df

    # Extract unique non-null worklog entries from upskilling issues
    unique_worklogs = upskilling_rows[worklog_column].dropna().unique().tolist()

    # Calculate total estimated tokens
    total_estimated_tokens = sum(estimate_token_count(worklog) for worklog in unique_worklogs)

    logger.info(f"Processing {len(unique_worklogs)} unique upskilling worklog entries with approximately {total_estimated_tokens} tokens")

    # Create a mapping of worklog text to category
    if unique_worklogs:
        categories = batch_process_worklogs(
            unique_worklogs,
            batch_size=batch_size,
            pause_seconds=pause_seconds,
            show_progress=show_progress
        )
        worklog_to_category = dict(zip(unique_worklogs, categories))
    else:
        worklog_to_category = {}

    # Apply categorization only to upskilling worklog entries
    df.loc[upskilling_mask, "TechCategory"] = df.loc[upskilling_mask, worklog_column].apply(
        lambda x: worklog_to_category.get(x, default_category) if pd.notna(x) else default_category
    )

    # Count the number of actually categorized entries
    categorized_count = len(df[df["TechCategory"] != default_category])
    logger.info(f"Successfully categorized {categorized_count} worklog entries")

    return df

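# Illustrative (not part of the original commit): if worklog_to_category maps
# "learning react hooks" -> "React", every upskilling row whose Worklog equals
# that text gets TechCategory "React"; all remaining rows keep default_category.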
def process_csv_file(
    csv_path: str,
    worklog_column: str = "Worklog",
    issue_column: str = "Issue",
    default_category: str = "N/A",
    output_path: Optional[str] = None,
    overwrite: bool = False,
    batch_size: int = 10,
    pause_seconds: int = 5
) -> str:
    """
    Process a CSV file to add technology categories based on worklog entries.
    Only categorizes worklogs associated with upskilling issues.
    Processes 10 worklogs at a time with 5-second pauses between batches.

    Args:
        csv_path: Path to the CSV file to process
        worklog_column: Name of the column containing worklog text
        issue_column: Name of the column containing issue text
        default_category: Default value for non-upskilling worklogs
        output_path: Path to save the processed file (if None, creates a new file with '_categorized' suffix)
        overwrite: If True, overwrite the original file
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)

    Returns:
        Path to the saved CSV file
    """
    try:
        # Check if file exists
        if not Path(csv_path).exists():
            logger.error(f"CSV file not found: {csv_path}")
            return ""

        # Read CSV
        logger.info(f"Reading CSV file: {csv_path}")
        df = pd.read_csv(csv_path)

        # Process dataframe
        processed_df = process_dataframe(
            df,
            worklog_column=worklog_column,
            issue_column=issue_column,
            default_category=default_category,
            batch_size=batch_size,
            pause_seconds=pause_seconds
        )

        # Determine output path
        if overwrite:
            save_path = csv_path
        elif output_path:
            save_path = output_path
        else:
            # Create new filename with _categorized suffix
            path_obj = Path(csv_path)
            save_path = str(path_obj.with_stem(f"{path_obj.stem}_categorized"))

        # Save processed dataframe
        processed_df.to_csv(save_path, index=False)
        logger.info(f"Saved categorized CSV to: {save_path}")

        return save_path

    except Exception as e:
        logger.error(f"Error processing CSV file: {e}")
        return ""
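A minimal way to exercise the module end to end might look like the following; the input filename is an assumption for illustration, and a valid Gemini API key is required for real categorization:

if __name__ == "__main__":
    # Hypothetical usage sketch, not part of the original commit.
    saved = process_csv_file("worklogs.csv", batch_size=10, pause_seconds=5)
    if saved:
        print(f"Categorized CSV written to: {saved}")
    else:
        print("Processing failed; see log output for details")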