Pratik333 committed
Commit 3fb828a · verified · 1 Parent(s): f215f95

Update worklog_categorizer.py

Files changed (1):
  1. worklog_categorizer.py (+368 −368)
worklog_categorizer.py CHANGED
@@ -1,368 +1,368 @@
- import os
- import logging
- import google.generativeai as genai
- from functools import lru_cache
- from typing import List, Dict, Any, Optional, Tuple
- import pandas as pd
- from pathlib import Path
- import time
- from tqdm import tqdm
- import re
-
- # Configure logging
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)
-
- # Initialize Gemini API
- try:
-     genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
-     model = genai.GenerativeModel("gemini-1.5-flash")
-     logger.info("Gemini API initialized successfully")
- except Exception as e:
-     logger.error(f"Error initializing Gemini API: {e}")
-     model = None
-
- # Prompt for worklog categorization - modified for batch processing
- BATCH_CATEGORIZATION_PROMPT = """
- You are a technology skill categorizer. Analyze each worklog entry and assign a single technology category word that best represents the technical skill or technology involved.
-
- Guidelines:
- 1. Respond with ONLY a single word (or hyphenated term if necessary) for each worklog
- 2. Focus on the core technology, framework, or skill
- 3. Be specific when the technology is clear (e.g., "React", "Python", "AWS")
- 4. Use broader categories when specific technology isn't clear (e.g., "Frontend", "Backend", "DevOps")
- 5. Prefer standard technology names over abbreviations
- 6. Don't include unnecessary adjectives or descriptions
- 7. Respond in a numbered list format matching the input worklogs
-
- Examples:
- Worklog 1: "fixing issue in next js application" → "NextJS"
- Worklog 2: "Task issue fixing - next js application" → "NextJS"
- Worklog 3: "Debugging Python script for data analysis" → "Python"
- Worklog 4: "Creating responsive CSS layout" → "CSS"
- Worklog 5: "Implementing REST API endpoints" → "Backend"
-
- Here are the worklogs to categorize:
- {worklogs}
-
- For each worklog, respond with a numbered list containing only the category word for each entry:
- 1. [category for worklog 1]
- 2. [category for worklog 2]
- ...and so on
- """
-
- def is_upskilling_issue(issue_text):
-     """
-     Check if an issue is related to upskilling using regex to match various formats.
-
-     Args:
-         issue_text: The issue text to check
-
-     Returns:
-         Boolean indicating if this is an upskilling issue
-     """
-     if not issue_text or not isinstance(issue_text, str):
-         return False
-
-     # Case insensitive search for "upskill" with potential variations
-     # This will match: Upskilling, upskill, UPSKILLING, Up-skilling, Up skilling, etc.
-     pattern = re.compile(r'up[-\s]?skill', re.IGNORECASE)
-     return bool(pattern.search(issue_text))
-
- def estimate_token_count(text: str) -> int:
-     """
-     Estimate token count for a given text string.
-
-     This is an approximation based on GPT tokenization patterns:
-     - Average of ~4 characters per token for English text
-     - Spaces count as tokens
-     - Special characters typically count as their own tokens
-
-     Args:
-         text: The text to estimate token count for
-
-     Returns:
-         Estimated token count
-     """
-     if not text:
-         return 0
-
-     # Count words (splitting by whitespace)
-     words = len(text.split())
-
-     # Count characters
-     chars = len(text)
-
-     # Count special tokens (punctuation, etc.)
-     special_chars = len(re.findall(r'[^\w\s]', text))
-
-     # Estimate based on a combination of factors
-     # This formula is approximate and can be adjusted based on testing
-     estimated_tokens = max(words, int(chars / 4) + special_chars)
-
-     return estimated_tokens
-
- def categorize_worklog_batch(worklogs: List[str]) -> List[str]:
-     """
-     Categorize multiple worklog entries with a single API call.
-
-     Args:
-         worklogs: List of worklog texts to categorize
-
-     Returns:
-         List of categories corresponding to each worklog
-     """
-     if not worklogs or model is None:
-         return ["Unknown"] * len(worklogs)
-
-     # Format worklogs as a numbered list for the prompt
-     formatted_worklogs = "\n".join([f"{i+1}. {worklog}" for i, worklog in enumerate(worklogs)])
-     prompt = BATCH_CATEGORIZATION_PROMPT.format(worklogs=formatted_worklogs)
-
-     # Estimate token usage
-     worklogs_token_count = sum(estimate_token_count(w) for w in worklogs)
-     prompt_token_count = estimate_token_count(prompt)
-     total_tokens = prompt_token_count
-
-     logger.info(f"Sending batch with {len(worklogs)} worklogs (~{worklogs_token_count} worklog tokens, ~{total_tokens} total tokens)")
-
-     try:
-         response = model.generate_content(prompt)
-         response_text = response.text.strip()
-
-         logger.info(f"Response received: {response_text}")
-
-         # Parse numbered response - looking for patterns like "1. Python", "2. JavaScript", etc.
-         categories = []
-
-         # First, try to match numbered lines (1. Category)
-         number_pattern = re.compile(r'^\s*(\d+)\.\s*(.+?)$', re.MULTILINE)
-         matches = number_pattern.findall(response_text)
-
-         if matches:
-             # Sort by the number to maintain order
-             sorted_matches = sorted(matches, key=lambda x: int(x[0]))
-             categories = [match[1].strip() for match in sorted_matches]
-         else:
-             # Fallback: try to split by lines
-             lines = [line.strip() for line in response_text.split('\n') if line.strip()]
-             categories = [line.split('.')[-1].strip() if '.' in line else line for line in lines]
-
-         # Ensure we have the right number of categories
-         if len(categories) != len(worklogs):
-             logger.warning(f"Mismatch between number of worklogs ({len(worklogs)}) and categories ({len(categories)})")
-
-             # Pad with "Unknown" if we have too few categories
-             if len(categories) < len(worklogs):
-                 categories.extend(["Unknown"] * (len(worklogs) - len(categories)))
-             # Truncate if we have too many categories
-             else:
-                 categories = categories[:len(worklogs)]
-
-         # Ensure each category is a single word
-         for i, category in enumerate(categories):
-             if len(category.split()) > 1 and "-" not in category:
-                 logger.warning(f"Response '{category}' contains multiple words, taking first word")
-                 categories[i] = category.split()[0]
-
-         # Log the results for verification
-         for i, (worklog, category) in enumerate(zip(worklogs, categories)):
-             logger.info(f"Worklog {i+1}: '{worklog[:50]}{'...' if len(worklog) > 50 else ''}' → '{category}'")
-
-         return categories
-     except Exception as e:
-         logger.error(f"Error categorizing worklog batch: {e}")
-         return ["Unknown"] * len(worklogs)
-
- def batch_process_worklogs(worklogs: List[str], batch_size: int = 10,
-                            pause_seconds: int = 5, show_progress: bool = True) -> List[str]:
-     """
-     Process multiple worklog entries in batches with pauses to avoid rate limits.
-     Using 10 queries at a time with 5 seconds rest between batches.
-
-     Args:
-         worklogs: List of worklog texts to categorize
-         batch_size: Number of worklogs to process in each batch (default: 10)
-         pause_seconds: Seconds to pause between batches (default: 5)
-         show_progress: Whether to show a progress bar
-
-     Returns:
-         List of categories corresponding to each worklog
-     """
-     results = []
-     total_worklogs = len(worklogs)
-
-     # Create batches
-     batches = [worklogs[i:i + batch_size] for i in range(0, total_worklogs, batch_size)]
-
-     # Process each batch with progress indication
-     progress_bar = tqdm(total=total_worklogs, desc="Categorizing worklogs") if show_progress else None
-
-     for i, batch in enumerate(batches):
-         # Process current batch
-         logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} worklogs")
-         batch_results = categorize_worklog_batch(batch)
-         results.extend(batch_results)
-
-         # Update progress
-         if progress_bar:
-             progress_bar.update(len(batch))
-
-         # Pause between batches (except after the last batch)
-         if i < len(batches) - 1 and pause_seconds > 0:
-             logger.info(f"Pausing for {pause_seconds}s before next batch. Processed {len(results)}/{total_worklogs} worklogs")
-             if show_progress:
-                 for s in range(pause_seconds):
-                     progress_bar.set_description(f"Waiting {pause_seconds-s}s before next batch")
-                     time.sleep(1)
-                 progress_bar.set_description("Categorizing worklogs")
-             else:
-                 time.sleep(pause_seconds)
-
-     if progress_bar:
-         progress_bar.close()
-
-     logger.info(f"Completed processing {total_worklogs} worklogs")
-     return results
-
- def process_dataframe(df: pd.DataFrame, worklog_column: str = "Worklog",
-                       issue_column: str = "Issue", default_category: str = "N/A",
-                       batch_size: int = 10, pause_seconds: int = 5,
-                       show_progress: bool = True) -> pd.DataFrame:
-     """
-     Add a new column with technology categories to a dataframe.
-     Only categorizes worklogs associated with upskilling issues.
-     Processes 10 worklogs at a time with 5-second pauses between batches.
-
-     Args:
-         df: Pandas DataFrame containing worklog data
-         worklog_column: Name of the column containing worklog text
-         issue_column: Name of the column containing issue text
-         default_category: Default value for non-upskilling worklogs
-         batch_size: Number of worklogs to process in each batch (default: 10)
-         pause_seconds: Seconds to pause between batches (default: 5)
-         show_progress: Whether to show a progress bar
-
-     Returns:
-         DataFrame with an additional 'TechCategory' column
-     """
-     # Initialize TechCategory column with default value
-     df["TechCategory"] = default_category
-
-     # Check if required columns exist
-     if worklog_column not in df.columns:
-         logger.error(f"Column '{worklog_column}' not found in DataFrame")
-         return df
-
-     if issue_column not in df.columns:
-         logger.error(f"Column '{issue_column}' not found in DataFrame")
-         return df
-
-     # Filter for upskilling issues
-     upskilling_mask = df[issue_column].apply(is_upskilling_issue)
-     upskilling_rows = df[upskilling_mask].copy()
-
-     logger.info(f"Found {len(upskilling_rows)} rows with upskilling issues out of {len(df)} total rows")
-
-     if upskilling_rows.empty:
-         logger.info("No upskilling issues found, returning dataframe with default category values")
-         return df
-
-     # Extract unique non-null worklog entries from upskilling issues
-     unique_worklogs = upskilling_rows[worklog_column].dropna().unique().tolist()
-
-     # Calculate total estimated tokens
-     total_estimated_tokens = sum(estimate_token_count(worklog) for worklog in unique_worklogs)
-
-     logger.info(f"Processing {len(unique_worklogs)} unique upskilling worklog entries with approximately {total_estimated_tokens} tokens")
-
-     # Create a mapping of worklog text to category
-     if unique_worklogs:
-         categories = batch_process_worklogs(
-             unique_worklogs,
-             batch_size=batch_size,
-             pause_seconds=pause_seconds,
-             show_progress=show_progress
-         )
-         worklog_to_category = dict(zip(unique_worklogs, categories))
-     else:
-         worklog_to_category = {}
-
-     # Apply categorization only to upskilling worklog entries
-     df.loc[upskilling_mask, "TechCategory"] = df.loc[upskilling_mask, worklog_column].apply(
-         lambda x: worklog_to_category.get(x, default_category) if pd.notna(x) else default_category
-     )
-
-     # Count the number of actually categorized entries
-     categorized_count = len(df[df["TechCategory"] != default_category])
-     logger.info(f"Successfully categorized {categorized_count} worklog entries")
-
-     return df
-
- def process_csv_file(
-     csv_path: str,
-     worklog_column: str = "Worklog",
-     issue_column: str = "Issue",
-     default_category: str = "N/A",
-     output_path: Optional[str] = None,
-     overwrite: bool = False,
-     batch_size: int = 10,
-     pause_seconds: int = 5
- ) -> str:
-     """
-     Process a CSV file to add technology categories based on worklog entries.
-     Only categorizes worklogs associated with upskilling issues.
-     Processes 10 worklogs at a time with 5-second pauses between batches.
-
-     Args:
-         csv_path: Path to the CSV file to process
-         worklog_column: Name of the column containing worklog text
-         issue_column: Name of the column containing issue text
-         default_category: Default value for non-upskilling worklogs
-         output_path: Path to save the processed file (if None, creates a new file with '_categorized' suffix)
-         overwrite: If True, overwrite the original file
-         batch_size: Number of worklogs to process in each batch (default: 10)
-         pause_seconds: Seconds to pause between batches (default: 5)
-
-     Returns:
-         Path to the saved CSV file
-     """
-     try:
-         # Check if file exists
-         if not Path(csv_path).exists():
-             logger.error(f"CSV file not found: {csv_path}")
-             return ""
-
-         # Read CSV
-         logger.info(f"Reading CSV file: {csv_path}")
-         df = pd.read_csv(csv_path)
-
-         # Process dataframe
-         processed_df = process_dataframe(
-             df,
-             worklog_column=worklog_column,
-             issue_column=issue_column,
-             default_category=default_category,
-             batch_size=batch_size,
-             pause_seconds=pause_seconds
-         )
-
-         # Determine output path
-         if overwrite:
-             save_path = csv_path
-         elif output_path:
-             save_path = output_path
-         else:
-             # Create new filename with _categorized suffix
-             path_obj = Path(csv_path)
-             save_path = str(path_obj.with_stem(f"{path_obj.stem}_categorized"))
-
-         # Save processed dataframe
-         processed_df.to_csv(save_path, index=False)
-         logger.info(f"Saved categorized CSV to: {save_path}")
-
-         return save_path
-
-     except Exception as e:
-         logger.error(f"Error processing CSV file: {e}")
-         return ""
 
+ import os
+ import logging
+ import google.generativeai as genai
+ from functools import lru_cache
+ from typing import List, Dict, Any, Optional, Tuple
+ import pandas as pd
+ from pathlib import Path
+ import time
+ from tqdm import tqdm
+ import re
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ # Initialize Gemini API
+ try:
+     genai.configure(api_key=os.getenv("GEMINI_API_KEY","AIzaSyCunB1oTkxl7IINRMgQTVqIXKcFYw0Jqow"))
+     model = genai.GenerativeModel("gemini-1.5-flash")
+     logger.info("Gemini API initialized successfully")
+ except Exception as e:
+     logger.error(f"Error initializing Gemini API: {e}")
+     model = None
+
+ # Prompt for worklog categorization - modified for batch processing
+ BATCH_CATEGORIZATION_PROMPT = """
+ You are a technology skill categorizer. Analyze each worklog entry and assign a single technology category word that best represents the technical skill or technology involved.
+
+ Guidelines:
+ 1. Respond with ONLY a single word (or hyphenated term if necessary) for each worklog
+ 2. Focus on the core technology, framework, or skill
+ 3. Be specific when the technology is clear (e.g., "React", "Python", "AWS")
+ 4. Use broader categories when specific technology isn't clear (e.g., "Frontend", "Backend", "DevOps")
+ 5. Prefer standard technology names over abbreviations
+ 6. Don't include unnecessary adjectives or descriptions
+ 7. Respond in a numbered list format matching the input worklogs
+
+ Examples:
+ Worklog 1: "fixing issue in next js application" → "NextJS"
+ Worklog 2: "Task issue fixing - next js application" → "NextJS"
+ Worklog 3: "Debugging Python script for data analysis" → "Python"
+ Worklog 4: "Creating responsive CSS layout" → "CSS"
+ Worklog 5: "Implementing REST API endpoints" → "Backend"
+
+ Here are the worklogs to categorize:
+ {worklogs}
+
+ For each worklog, respond with a numbered list containing only the category word for each entry:
+ 1. [category for worklog 1]
+ 2. [category for worklog 2]
+ ...and so on
+ """
+
+ def is_upskilling_issue(issue_text):
+     """
+     Check if an issue is related to upskilling using regex to match various formats.
+
+     Args:
+         issue_text: The issue text to check
+
+     Returns:
+         Boolean indicating if this is an upskilling issue
+     """
+     if not issue_text or not isinstance(issue_text, str):
+         return False
+
+     # Case insensitive search for "upskill" with potential variations
+     # This will match: Upskilling, upskill, UPSKILLING, Up-skilling, Up skilling, etc.
+     pattern = re.compile(r'up[-\s]?skill', re.IGNORECASE)
+     return bool(pattern.search(issue_text))
+
+ def estimate_token_count(text: str) -> int:
+     """
+     Estimate token count for a given text string.
+
+     This is an approximation based on GPT tokenization patterns:
+     - Average of ~4 characters per token for English text
+     - Spaces count as tokens
+     - Special characters typically count as their own tokens
+
+     Args:
+         text: The text to estimate token count for
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+
+     # Count words (splitting by whitespace)
+     words = len(text.split())
+
+     # Count characters
+     chars = len(text)
+
+     # Count special tokens (punctuation, etc.)
+     special_chars = len(re.findall(r'[^\w\s]', text))
+
+     # Estimate based on a combination of factors
+     # This formula is approximate and can be adjusted based on testing
+     estimated_tokens = max(words, int(chars / 4) + special_chars)
+
+     return estimated_tokens
+
+ def categorize_worklog_batch(worklogs: List[str]) -> List[str]:
+     """
+     Categorize multiple worklog entries with a single API call.
+
+     Args:
+         worklogs: List of worklog texts to categorize
+
+     Returns:
+         List of categories corresponding to each worklog
+     """
+     if not worklogs or model is None:
+         return ["Unknown"] * len(worklogs)
+
+     # Format worklogs as a numbered list for the prompt
+     formatted_worklogs = "\n".join([f"{i+1}. {worklog}" for i, worklog in enumerate(worklogs)])
+     prompt = BATCH_CATEGORIZATION_PROMPT.format(worklogs=formatted_worklogs)
+
+     # Estimate token usage
+     worklogs_token_count = sum(estimate_token_count(w) for w in worklogs)
+     prompt_token_count = estimate_token_count(prompt)
+     total_tokens = prompt_token_count
+
+     logger.info(f"Sending batch with {len(worklogs)} worklogs (~{worklogs_token_count} worklog tokens, ~{total_tokens} total tokens)")
+
+     try:
+         response = model.generate_content(prompt)
+         response_text = response.text.strip()
+
+         logger.info(f"Response received: {response_text}")
+
+         # Parse numbered response - looking for patterns like "1. Python", "2. JavaScript", etc.
+         categories = []
+
+         # First, try to match numbered lines (1. Category)
+         number_pattern = re.compile(r'^\s*(\d+)\.\s*(.+?)$', re.MULTILINE)
+         matches = number_pattern.findall(response_text)
+
+         if matches:
+             # Sort by the number to maintain order
+             sorted_matches = sorted(matches, key=lambda x: int(x[0]))
+             categories = [match[1].strip() for match in sorted_matches]
+         else:
+             # Fallback: try to split by lines
+             lines = [line.strip() for line in response_text.split('\n') if line.strip()]
+             categories = [line.split('.')[-1].strip() if '.' in line else line for line in lines]
+
+         # Ensure we have the right number of categories
+         if len(categories) != len(worklogs):
+             logger.warning(f"Mismatch between number of worklogs ({len(worklogs)}) and categories ({len(categories)})")
+
+             # Pad with "Unknown" if we have too few categories
+             if len(categories) < len(worklogs):
+                 categories.extend(["Unknown"] * (len(worklogs) - len(categories)))
+             # Truncate if we have too many categories
+             else:
+                 categories = categories[:len(worklogs)]
+
+         # Ensure each category is a single word
+         for i, category in enumerate(categories):
+             if len(category.split()) > 1 and "-" not in category:
+                 logger.warning(f"Response '{category}' contains multiple words, taking first word")
+                 categories[i] = category.split()[0]
+
+         # Log the results for verification
+         for i, (worklog, category) in enumerate(zip(worklogs, categories)):
+             logger.info(f"Worklog {i+1}: '{worklog[:50]}{'...' if len(worklog) > 50 else ''}' → '{category}'")
+
+         return categories
+     except Exception as e:
+         logger.error(f"Error categorizing worklog batch: {e}")
+         return ["Unknown"] * len(worklogs)
+
+ def batch_process_worklogs(worklogs: List[str], batch_size: int = 10,
+                            pause_seconds: int = 5, show_progress: bool = True) -> List[str]:
+     """
+     Process multiple worklog entries in batches with pauses to avoid rate limits.
+     Using 10 queries at a time with 5 seconds rest between batches.
+
+     Args:
+         worklogs: List of worklog texts to categorize
+         batch_size: Number of worklogs to process in each batch (default: 10)
+         pause_seconds: Seconds to pause between batches (default: 5)
+         show_progress: Whether to show a progress bar
+
+     Returns:
+         List of categories corresponding to each worklog
+     """
+     results = []
+     total_worklogs = len(worklogs)
+
+     # Create batches
+     batches = [worklogs[i:i + batch_size] for i in range(0, total_worklogs, batch_size)]
+
+     # Process each batch with progress indication
+     progress_bar = tqdm(total=total_worklogs, desc="Categorizing worklogs") if show_progress else None
+
+     for i, batch in enumerate(batches):
+         # Process current batch
+         logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} worklogs")
+         batch_results = categorize_worklog_batch(batch)
+         results.extend(batch_results)
+
+         # Update progress
+         if progress_bar:
+             progress_bar.update(len(batch))
+
+         # Pause between batches (except after the last batch)
+         if i < len(batches) - 1 and pause_seconds > 0:
+             logger.info(f"Pausing for {pause_seconds}s before next batch. Processed {len(results)}/{total_worklogs} worklogs")
+             if show_progress:
+                 for s in range(pause_seconds):
+                     progress_bar.set_description(f"Waiting {pause_seconds-s}s before next batch")
+                     time.sleep(1)
+                 progress_bar.set_description("Categorizing worklogs")
+             else:
+                 time.sleep(pause_seconds)
+
+     if progress_bar:
+         progress_bar.close()
+
+     logger.info(f"Completed processing {total_worklogs} worklogs")
+     return results
+
+ def process_dataframe(df: pd.DataFrame, worklog_column: str = "Worklog",
+                       issue_column: str = "Issue", default_category: str = "N/A",
+                       batch_size: int = 10, pause_seconds: int = 5,
+                       show_progress: bool = True) -> pd.DataFrame:
+     """
+     Add a new column with technology categories to a dataframe.
+     Only categorizes worklogs associated with upskilling issues.
+     Processes 10 worklogs at a time with 5-second pauses between batches.
+
+     Args:
+         df: Pandas DataFrame containing worklog data
+         worklog_column: Name of the column containing worklog text
+         issue_column: Name of the column containing issue text
+         default_category: Default value for non-upskilling worklogs
+         batch_size: Number of worklogs to process in each batch (default: 10)
+         pause_seconds: Seconds to pause between batches (default: 5)
+         show_progress: Whether to show a progress bar
+
+     Returns:
+         DataFrame with an additional 'TechCategory' column
+     """
+     # Initialize TechCategory column with default value
+     df["TechCategory"] = default_category
+
+     # Check if required columns exist
+     if worklog_column not in df.columns:
+         logger.error(f"Column '{worklog_column}' not found in DataFrame")
+         return df
+
+     if issue_column not in df.columns:
+         logger.error(f"Column '{issue_column}' not found in DataFrame")
+         return df
+
+     # Filter for upskilling issues
+     upskilling_mask = df[issue_column].apply(is_upskilling_issue)
+     upskilling_rows = df[upskilling_mask].copy()
+
+     logger.info(f"Found {len(upskilling_rows)} rows with upskilling issues out of {len(df)} total rows")
+
+     if upskilling_rows.empty:
+         logger.info("No upskilling issues found, returning dataframe with default category values")
+         return df
+
+     # Extract unique non-null worklog entries from upskilling issues
+     unique_worklogs = upskilling_rows[worklog_column].dropna().unique().tolist()
+
+     # Calculate total estimated tokens
+     total_estimated_tokens = sum(estimate_token_count(worklog) for worklog in unique_worklogs)
+
+     logger.info(f"Processing {len(unique_worklogs)} unique upskilling worklog entries with approximately {total_estimated_tokens} tokens")
+
+     # Create a mapping of worklog text to category
+     if unique_worklogs:
+         categories = batch_process_worklogs(
+             unique_worklogs,
+             batch_size=batch_size,
+             pause_seconds=pause_seconds,
+             show_progress=show_progress
+         )
+         worklog_to_category = dict(zip(unique_worklogs, categories))
+     else:
+         worklog_to_category = {}
+
+     # Apply categorization only to upskilling worklog entries
+     df.loc[upskilling_mask, "TechCategory"] = df.loc[upskilling_mask, worklog_column].apply(
+         lambda x: worklog_to_category.get(x, default_category) if pd.notna(x) else default_category
+     )
+
+     # Count the number of actually categorized entries
+     categorized_count = len(df[df["TechCategory"] != default_category])
+     logger.info(f"Successfully categorized {categorized_count} worklog entries")
+
+     return df
+
+ def process_csv_file(
+     csv_path: str,
+     worklog_column: str = "Worklog",
+     issue_column: str = "Issue",
+     default_category: str = "N/A",
+     output_path: Optional[str] = None,
+     overwrite: bool = False,
+     batch_size: int = 10,
+     pause_seconds: int = 5
+ ) -> str:
+     """
+     Process a CSV file to add technology categories based on worklog entries.
+     Only categorizes worklogs associated with upskilling issues.
+     Processes 10 worklogs at a time with 5-second pauses between batches.
+
+     Args:
+         csv_path: Path to the CSV file to process
+         worklog_column: Name of the column containing worklog text
+         issue_column: Name of the column containing issue text
+         default_category: Default value for non-upskilling worklogs
+         output_path: Path to save the processed file (if None, creates a new file with '_categorized' suffix)
+         overwrite: If True, overwrite the original file
+         batch_size: Number of worklogs to process in each batch (default: 10)
+         pause_seconds: Seconds to pause between batches (default: 5)
+
+     Returns:
+         Path to the saved CSV file
+     """
+     try:
+         # Check if file exists
+         if not Path(csv_path).exists():
+             logger.error(f"CSV file not found: {csv_path}")
+             return ""
+
+         # Read CSV
+         logger.info(f"Reading CSV file: {csv_path}")
+         df = pd.read_csv(csv_path)
+
+         # Process dataframe
+         processed_df = process_dataframe(
+             df,
+             worklog_column=worklog_column,
+             issue_column=issue_column,
+             default_category=default_category,
+             batch_size=batch_size,
+             pause_seconds=pause_seconds
+         )
+
+         # Determine output path
+         if overwrite:
+             save_path = csv_path
+         elif output_path:
+             save_path = output_path
+         else:
+             # Create new filename with _categorized suffix
+             path_obj = Path(csv_path)
+             save_path = str(path_obj.with_stem(f"{path_obj.stem}_categorized"))
+
+         # Save processed dataframe
+         processed_df.to_csv(save_path, index=False)
+         logger.info(f"Saved categorized CSV to: {save_path}")
+
+         return save_path
+
+     except Exception as e:
+         logger.error(f"Error processing CSV file: {e}")
+         return ""