Pratik333 committed on
Commit
bb0a442
·
verified ·
1 Parent(s): 7b72e42

Upload 4 files

Browse files
Files changed (4) hide show
  1. .env +1 -0
  2. app1.py +1350 -0
  3. requirements.txt +6 -0
  4. worklog_categorizer.py +368 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GEMINI_API_KEY=<REDACTED - revoke the committed key and supply the value from an untracked .env file or a secrets manager>
app1.py ADDED
@@ -0,0 +1,1350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ import re
6
+ from datetime import datetime
7
+ import os
8
+ import shutil
9
+ from pathlib import Path
10
+ import worklog_categorizer as wc
11
+ import time
12
+ import base64
13
+ from io import BytesIO
14
+
15
# Set page configuration (wide layout, browser-tab title and icon)
st.set_page_config(layout="wide", page_title="Non-Billable Time Analysis", page_icon="📊")

# Define colors to match the React implementation.
# Shared palette passed to the Plotly charts via color_discrete_sequence.
COLORS = ['#0088FE', '#00C49F', '#FFBB28', '#FF8042', '#8884d8', '#82ca9d', '#ffc658',
          '#8dd1e1', '#a4de6c', '#d0ed57', '#bc80bd', '#ccebc5', '#ffed6f', '#bebada',
          '#fb8072', '#80b1d3', '#fdb462', '#b3de69']

# Initialize session state variables if they don't exist.
# The 'initialized' flag guards the defaults so they are only written when the
# flag is absent from the session state.
if 'initialized' not in st.session_state:
    st.session_state.initialized = True
    st.session_state.processed_data = None
    st.session_state.expanded_user = None
    st.session_state.sort_by = 'totalHours'
    st.session_state.sort_order = 'desc'
    st.session_state.selected_epics = []
    st.session_state.active_tab = 'team_analysis'
    st.session_state.tech_user_filter = ""
    st.session_state.categorized_df = None
    st.session_state.show_csv_data = False
    st.session_state.needs_rerun = False
36
+
37
def extract_month(date_range):
    """Return the month token of a date string, or None when it is not recognised.

    Two layouts are accepted, both starting with ``<digits>/<word>/<digits>``:
    a range such as ``"01/Nov/24 to 30/Nov/24"`` and a single timestamp such
    as ``"05/Nov/24 at 10:00"``. The middle ``<word>`` part is returned.
    Anything that is not a string yields None.
    """
    if not isinstance(date_range, str):
        return None

    # Range layout is tried first, then the single-date layout.
    layouts = (
        r'^(\d+)/(\w+)/(\d+) to',
        r'^(\d+)/(\w+)/(\d+) at',
    )
    for layout in layouts:
        hit = re.match(layout, date_range)
        if hit:
            return hit.group(2)

    return None
51
+
52
def get_row_level(row):
    """Classify a row by how deep it sits in the report hierarchy.

    The hierarchy columns are checked from the outermost inwards: a row is
    labelled after the deepest column that is filled while the next one is
    still missing. A row with a "Worklog" value is a "worklog"; a row matching
    none of the rules is "unknown".

    Args:
        row: Mapping-like object exposing ``.get(column)``.

    Returns:
        One of "category", "project", "user", "epic", "issue", "worklog",
        "unknown".
    """
    # (filled column, column that must still be empty, resulting level)
    ladder = (
        ("Project Category", "Project", "category"),
        ("Project", "User", "project"),
        ("User", "Epic", "user"),
        ("Epic", "Issue", "epic"),
        ("Issue", "Worklog", "issue"),
    )
    for filled, missing, level in ladder:
        if pd.notna(row.get(filled)) and pd.isna(row.get(missing)):
            return level

    if pd.notna(row.get("Worklog")):
        return "worklog"
    return "unknown"
67
+
68
def clear_session_and_cache():
    """Reset the application by clearing cache and session state.

    Clears ``st.cache_data``, deletes the two CSV files the app writes to the
    working directory (if present), restores the session-state defaults and
    sets ``needs_rerun`` so the caller can trigger a rerun.
    """
    # Drop every cached result
    st.cache_data.clear()

    # Delete leftover data files; any failure is reported in the UI and
    # stops further deletions.
    try:
        for leftover in ("categorized_data.csv", "uploaded_data.csv"):
            if os.path.exists(leftover):
                os.remove(leftover)
    except Exception as e:
        st.error(f"Error removing files: {e}")

    # Put the session state back to its defaults
    state = st.session_state
    state.processed_data = None
    state.expanded_user = None
    state.sort_by = 'totalHours'
    state.sort_order = 'desc'
    state.selected_epics = []
    state.active_tab = 'team_analysis'
    state.tech_user_filter = ""
    state.categorized_df = None
    state.show_csv_data = False

    # Signal that a rerun is required after the reset
    state.needs_rerun = True
95
+
96
def save_categorized_data(df, filename="categorized_data.csv"):
    """Write ``df`` to ``filename`` as CSV without the index column.

    Returns:
        True when the file was written, False when writing raised; in that
        case the error is shown through ``st.error``.
    """
    try:
        df.to_csv(filename, index=False)
    except Exception as e:
        st.error(f"Error saving categorized data: {e}")
        return False
    return True
104
+
105
@st.cache_data
def process_data(raw_data, force_categorize=False, focus_users=None):
    """Process the data and return various aggregations (cached to prevent reprocessing).

    Keeps the rows whose "Project Category" equals "Non-Billable", adds a
    "Level" column (via get_row_level) and a "Month" column (via
    extract_month on "Date"), runs the worklog categorizer when the
    "TechCategory" column is missing or ``force_categorize`` is set, and then
    builds the derived summaries.

    Args:
        raw_data: DataFrame; the code reads the "Project Category", "Date",
            "User", "Epic" and "Issue" columns directly.
        force_categorize: Run the categorizer even if "TechCategory" exists.
        focus_users: Optional collection of user names whose rows are
            categorized in a first pass before the whole frame is processed.

    Returns:
        dict with the keys 'non_billable_data', 'team_data', 'epic_data',
        'monthly_data', 'unique_epics', 'unique_users', 'tech_category_data'
        and 'upskilling_count'.
    """
    # NOTE(review): the indentation of this function was reconstructed from a
    # flattened listing; the nesting of the save/store step further down
    # should be confirmed against the repository.

    # Filter for non-billable data
    non_billable_data = raw_data[raw_data["Project Category"] == "Non-Billable"].copy()

    # Add level and month info
    non_billable_data["Level"] = non_billable_data.apply(get_row_level, axis=1)
    non_billable_data["Month"] = non_billable_data["Date"].apply(extract_month)

    # Check if we need to categorize data
    if "TechCategory" not in non_billable_data.columns or force_categorize:
        # Process tech categories for upskilling worklog entries
        with st.spinner("Categorizing upskilling worklog entries by technology..."):
            # If specific users are provided, prioritize their worklog categorization
            if focus_users and len(focus_users) > 0:
                st.info(f"Focusing on worklog categorization for {len(focus_users)} selected users")

                # First, process focus users
                focus_mask = non_billable_data["User"].isin(focus_users)
                focus_data = non_billable_data[focus_mask].copy()

                if not focus_data.empty:
                    # Process worklogs for focus users first
                    focus_data = wc.process_dataframe(
                        focus_data,
                        worklog_column="Worklog",
                        issue_column="Issue",
                        default_category="N/A",
                        batch_size=10,
                        pause_seconds=2,  # Shorter pause for focus users
                        show_progress=True
                    )

                    # Update the categorized data for focus users
                    non_billable_data.loc[focus_mask, "TechCategory"] = focus_data["TechCategory"]

            # Process all remaining data
            # NOTE(review): the full frame, focus users included, is passed
            # again; whether already-filled "TechCategory" values are reused
            # depends on wc.process_dataframe, which is not visible here.
            non_billable_data = wc.process_dataframe(
                non_billable_data,
                worklog_column="Worklog",
                issue_column="Issue",
                default_category="N/A",
                batch_size=10,
                pause_seconds=5,
                show_progress=True
            )

            # Save the categorized data
            save_path = "categorized_data.csv"
            if save_categorized_data(non_billable_data, save_path):
                st.success(f"Saved categorized data to {save_path}")
                # Store the categorized DataFrame for download
                # NOTE(review): this assignment only happens when the function
                # body actually runs; presumably a cache hit skips it - confirm
                # callers do not depend on it.
                st.session_state.categorized_df = non_billable_data

    # Process derived data
    team_data = process_team_members(non_billable_data)
    epic_data = process_top_epics(non_billable_data)
    monthly_data = process_monthly_data(non_billable_data)
    tech_category_data = process_tech_categories(non_billable_data)
    epics = sorted(non_billable_data["Epic"].dropna().unique())

    # Count upskilling entries
    upskilling_mask = non_billable_data["Issue"].apply(wc.is_upskilling_issue)
    upskilling_count = upskilling_mask.sum()

    # Get all unique users
    unique_users = sorted(non_billable_data["User"].dropna().unique())

    return {
        'non_billable_data': non_billable_data,
        'team_data': team_data,
        'epic_data': epic_data,
        'monthly_data': monthly_data,
        'unique_epics': epics,
        'unique_users': unique_users,
        'tech_category_data': tech_category_data,
        'upskilling_count': upskilling_count
    }
184
+
185
def process_tech_categories(data):
    """Summarise logged hours per technology category.

    Rows whose "TechCategory" is "N/A" are ignored. The remaining rows are
    totalled three ways: overall, per user and per month (months limited to
    Nov, Dec, Jan, Feb, Mar, in that order; months without rows are left out).

    Returns:
        dict with "overall" (list of {"Category", "Hours"} records, largest
        first), "by_user" (user -> such a list) and "by_month" (list of
        {"Month", "Categories"} entries).
    """
    categorized = data[data["TechCategory"] != "N/A"].copy()

    if categorized.empty:
        return {
            "overall": [],
            "by_user": {},
            "by_month": []
        }

    def hours_per_category(frame):
        # Total "Logged" per category, largest first, as plain records.
        totals = frame.groupby("TechCategory")["Logged"].sum().reset_index()
        totals = totals.sort_values("Logged", ascending=False)
        totals.columns = ["Category", "Hours"]
        return totals.to_dict('records')

    # Per-user breakdown
    by_user = {
        person: hours_per_category(categorized[categorized["User"] == person])
        for person in categorized["User"].dropna().unique()
    }

    # Per-month breakdown, in reporting order
    by_month = []
    for month in ['Nov', 'Dec', 'Jan', 'Feb', 'Mar']:
        in_month = categorized[categorized["Month"] == month]
        if in_month.empty:
            continue
        by_month.append({
            "Month": month,
            "Categories": hours_per_category(in_month)
        })

    return {
        "overall": hours_per_category(categorized),
        "by_user": by_user,
        "by_month": by_month
    }
231
+
232
def process_team_members(data):
    """Build one summary record per team member.

    A member is every distinct "User" found on rows whose "Level" is "user".
    The member's total is the "Logged" value of the first such row; members
    whose total equals 0 are left out.

    Returns:
        List of dicts sorted by "TotalHours" (largest first) with the keys
        "User", "TotalHours", "EpicBreakdown", "TechCategories",
        "UpskillIssues" and "MonthlyData".
    """
    reporting_months = ['Nov', 'Dec', 'Jan', 'Feb', 'Mar']

    def value_or(value, fallback):
        # Replace missing cells with a readable fallback.
        return value if pd.notna(value) else fallback

    members = []
    user_names = data[data["Level"] == "user"]["User"].dropna().unique()

    for user in user_names:
        rows = data[data["User"] == user]

        # Total hours come from the user's own summary row
        summary = rows[rows["Level"] == "user"]
        user_total = 0 if summary.empty else summary["Logged"].iloc[0]

        # Nothing to report for users without hours
        if user_total == 0:
            continue

        # One entry per epic-level row
        epic_breakdown = [
            {
                "Epic": value_or(row["Epic"], "No Epic"),
                "Hours": value_or(row["Logged"], 0),
                "Project": value_or(row["Project"], "No Project"),
                "Month": row["Month"]
            }
            for _, row in rows[rows["Level"] == "epic"].iterrows()
        ]

        # Rows carrying a technology category (anything other than "N/A")
        upskilling_rows = rows[rows["TechCategory"] != "N/A"]
        tech_categories = [
            {
                "TechCategory": row["TechCategory"],
                "Hours": value_or(row["Logged"], 0),
                "Issue": value_or(row["Issue"], "No Issue"),
                "Worklog": value_or(row["Worklog"], "No Worklog"),
                "Month": row["Month"]
            }
            for _, row in upskilling_rows.iterrows()
        ]

        # Hours and categories collected per issue
        upskilling_issues = []
        for issue in upskilling_rows["Issue"].unique():
            issue_rows = upskilling_rows[upskilling_rows["Issue"] == issue]
            category_names = [str(cat) for cat in issue_rows["TechCategory"].unique().tolist()]
            upskilling_issues.append({
                "Issue": issue,
                "Hours": issue_rows["Logged"].sum(),
                "TechCategories": category_names
            })

        # Per-month totals and per-epic hours; months without hours are omitted
        monthly_breakdown = {}
        for month in reporting_months:
            entries = [entry for entry in epic_breakdown if entry["Month"] == month]
            month_total = sum(entry["Hours"] for entry in entries)
            if not month_total > 0:
                continue

            per_epic = {}
            for entry in entries:
                per_epic[entry["Epic"]] = per_epic.get(entry["Epic"], 0) + entry["Hours"]

            monthly_breakdown[month] = {
                "total": month_total,
                "epics": per_epic
            }

        members.append({
            "User": user,
            "TotalHours": user_total,
            "EpicBreakdown": epic_breakdown,
            "TechCategories": tech_categories,
            "UpskillIssues": upskilling_issues,
            "MonthlyData": monthly_breakdown
        })

    # Largest totals first
    return sorted(members, key=lambda member: member["TotalHours"], reverse=True)
316
+
317
def process_top_epics(data):
    """Total the logged hours of epic-level rows per epic.

    Rows without an epic name are grouped under "No Epic".

    Returns:
        List of {"Epic", "Hours"} records, largest total first.
    """
    epic_rows = data[data["Level"] == "epic"]

    # Missing epic names share one bucket
    epic_labels = epic_rows["Epic"].fillna("No Epic")
    totals = epic_rows.groupby(epic_labels)["Logged"].sum().reset_index()
    totals.columns = ["Epic", "Hours"]

    ranked = totals.sort_values("Hours", ascending=False)
    return ranked.to_dict('records')
334
+
335
def process_monthly_data(data):
    """Total the logged hours of epic-level rows per month.

    Only rows with a month value are counted. The result follows the order
    Nov, Dec, Jan, Feb, Mar; any other month is placed after those.

    Returns:
        List of {"Month", "Hours"} records.
    """
    has_month = data["Month"].notna()
    is_epic = data["Level"] == "epic"
    monthly_rows = data[is_epic & has_month]

    totals = monthly_rows.groupby("Month")["Logged"].sum().reset_index()
    totals.columns = ["Month", "Hours"]

    # Temporary helper column carries the custom ordering; unknown months get 999
    position = {name: index for index, name in enumerate(['Nov', 'Dec', 'Jan', 'Feb', 'Mar'])}
    totals["MonthOrder"] = totals["Month"].apply(lambda name: position.get(name, 999))
    totals = totals.sort_values("MonthOrder").drop("MonthOrder", axis=1)

    return totals.to_dict('records')
353
+
354
def format_hours(hours):
    """Render an hour count with one decimal place; a value equal to 0 becomes "-"."""
    return "-" if hours == 0 else f"{hours:.1f}"
359
+
360
def get_filtered_data(team_data, search_term, selected_month, sort_by, sort_order, tech_category_filter=None):
    """Filter and sort team data based on current selections.

    The input records are never modified: every member dict is shallow-copied
    first, and the filters only replace top-level keys on those copies. This
    matters because callers pass in data that is reused between reruns; a
    month filter that rewrote "TotalHours" / "EpicBreakdown" in place would
    corrupt the unfiltered view afterwards.

    Args:
        team_data: List of member dicts with "User", "TotalHours",
            "EpicBreakdown", "TechCategories" and "MonthlyData".
        search_term: Case-insensitive substring matched against "User";
            empty means no name filter.
        selected_month: Month name, or "All" for no month filter.
        sort_by: "name", "totalHours", or an epic name to sort by that
            epic's hours.
        sort_order: "desc" for descending; anything else sorts ascending.
        tech_category_filter: Optional category; None or "All" disables it.

    Returns:
        New list of member dicts. With a tech filter each carries
        "FilteredTechHours"; with a month filter "TotalHours",
        "EpicBreakdown" and "TechCategories" are narrowed to that month.
    """
    # Per-member shallow copies keep the caller's records untouched
    filtered_data = [dict(item) for item in team_data]

    # Apply search filter
    if search_term:
        needle = search_term.lower()
        filtered_data = [item for item in filtered_data if needle in item["User"].lower()]

    # Apply tech category filter
    tech_filter_active = bool(tech_category_filter) and tech_category_filter != "All"
    if tech_filter_active:
        filtered_data = [
            item for item in filtered_data
            if any(tc["TechCategory"] == tech_category_filter for tc in item["TechCategories"])
        ]

        # Hours in the chosen category (computed before any month narrowing)
        for item in filtered_data:
            item["FilteredTechHours"] = sum(
                tc["Hours"] for tc in item["TechCategories"]
                if tc["TechCategory"] == tech_category_filter
            )

    # Apply month filter
    if selected_month != "All":
        filtered_data = [
            item for item in filtered_data
            if item.get("MonthlyData", {}).get(selected_month)
        ]

        # Narrow the copies to the selected month
        for item in filtered_data:
            monthly_data = item["MonthlyData"][selected_month]
            item["TotalHours"] = monthly_data["total"]
            item["EpicBreakdown"] = [
                epic for epic in item["EpicBreakdown"]
                if epic["Month"] == selected_month
            ]
            item["TechCategories"] = [
                tc for tc in item["TechCategories"]
                if tc["Month"] == selected_month
            ]

    # Determine if we want ascending or descending
    reverse_sort = (sort_order == "desc")

    # Sort the data
    if sort_by == "name":
        filtered_data.sort(key=lambda x: x["User"], reverse=reverse_sort)
    elif sort_by == "totalHours":
        if tech_filter_active:
            filtered_data.sort(key=lambda x: x.get("FilteredTechHours", 0), reverse=reverse_sort)
        else:
            filtered_data.sort(key=lambda x: x["TotalHours"], reverse=reverse_sort)
    else:
        # Sort by specific epic
        filtered_data.sort(
            key=lambda x: sum(e["Hours"] for e in x["EpicBreakdown"] if e["Epic"] == sort_by),
            reverse=reverse_sort
        )

    return filtered_data
419
+
420
def get_epic_totals(filtered_data, unique_epics):
    """Sum hours per epic across the given members.

    Every epic in ``unique_epics`` starts at 0; epics that only appear in a
    member's breakdown are added as they are encountered.
    """
    totals = dict.fromkeys(unique_epics, 0)

    for member in filtered_data:
        for entry in member["EpicBreakdown"]:
            name = entry["Epic"]
            totals[name] = totals.get(name, 0) + entry["Hours"]

    return totals
429
+
430
def get_user_chart_data(user_data, selected_month):
    """Collect one member's hours per epic as chart rows.

    Args:
        user_data: Member dict with an "EpicBreakdown" list.
        selected_month: Month to keep, or "All" for every entry.

    Returns:
        List of {"name", "value"} dicts in first-seen epic order.
    """
    every_month = selected_month == "All"
    hours_per_epic = {}

    for entry in user_data["EpicBreakdown"]:
        if not every_month and entry["Month"] != selected_month:
            continue
        label = entry["Epic"]
        hours_per_epic[label] = hours_per_epic.get(label, 0) + entry["Hours"]

    chart_rows = []
    for label, hours in hours_per_epic.items():
        chart_rows.append({"name": label, "value": hours})
    return chart_rows
445
+
446
def get_user_tech_categories(user_data, selected_month, min_percentage=1.0):
    """Collect one member's hours per technology category as chart rows.

    Entries with a missing or placeholder category ("nan", "null", "", None,
    "N/A") are ignored. Categories whose share of the counted hours is below
    ``min_percentage`` percent are dropped.

    Args:
        user_data: Member dict with a "TechCategories" list.
        selected_month: Month to keep, or "All" for every entry.
        min_percentage: Minimum share (in percent) a category needs to appear.

    Returns:
        List of {"name", "value"} dicts, largest value first.
    """
    placeholders = ["nan", "null", "", None, "N/A"]
    every_month = selected_month == "All"

    hours_per_category = {}
    counted_hours = 0

    for entry in user_data["TechCategories"]:
        if not every_month and entry["Month"] != selected_month:
            continue

        label = entry["TechCategory"]
        if pd.isna(label) or label in placeholders:
            continue

        spent = entry["Hours"]
        hours_per_category[label] = hours_per_category.get(label, 0) + spent
        counted_hours += spent

    # The share threshold only makes sense when something was counted
    if counted_hours > 0:
        hours_per_category = {
            label: spent for label, spent in hours_per_category.items()
            if (spent / counted_hours * 100) >= min_percentage
        }

    ranked = sorted(hours_per_category.items(), key=lambda pair: pair[1], reverse=True)
    return [{"name": label, "value": spent} for label, spent in ranked]
477
+
478
def get_user_monthly_data(user_data):
    """List one member's total hours for Nov, Dec, Jan, Feb and Mar, in that order.

    Months missing from the member's "MonthlyData" are reported with 0 hours.
    """
    per_month = user_data["MonthlyData"]
    series = []
    for month in ('Nov', 'Dec', 'Jan', 'Feb', 'Mar'):
        month_entry = per_month.get(month, {})
        series.append({
            "month": month,
            "hours": month_entry.get("total", 0)
        })
    return series
489
+
490
def get_user_tech_issues(user_data, tech_category, selected_month="All"):
    """List the issues a member logged under one technology category.

    Entries for the same issue are merged: hours are added up, while the
    worklog text and month of the first matching entry are kept.

    Args:
        user_data: Member dict with a "TechCategories" list.
        tech_category: Category the entries must match exactly.
        selected_month: Month to keep, or "All" for every entry.

    Returns:
        List of {"issue", "worklog", "hours", "month"} dicts, most hours first.
    """
    collected = []

    for entry in user_data["TechCategories"]:
        if not entry["TechCategory"] == tech_category:
            continue
        if selected_month != "All" and entry["Month"] != selected_month:
            continue

        # Merge into an already collected issue when there is one
        for seen in collected:
            if seen["issue"] == entry["Issue"]:
                seen["hours"] += entry["Hours"]
                break
        else:
            collected.append({
                "issue": entry["Issue"],
                "worklog": entry["Worklog"],
                "hours": entry["Hours"],
                "month": entry["Month"]
            })

    collected.sort(key=lambda record: record["hours"], reverse=True)
    return collected
512
+
513
def display_categorized_data_view(df):
    """Display a view of the categorized data with filtering options.

    Renders three filters (user, upskilling-only, technology), shows the
    matching worklog-level rows as a table and offers CSV downloads.

    Args:
        df: Categorized DataFrame, or None when nothing has been processed
            yet (a warning is shown and the function returns). The code
            reads the "User", "Issue", "Worklog", "TechCategory", "Logged",
            "Month" and "Level" columns.
    """
    # NOTE(review): indentation was reconstructed from a flattened listing;
    # confirm the placement of the two download buttons at the end.
    st.header("View Categorized Data")

    if df is None:
        st.warning("No categorized data available. Please upload and process a CSV file first.")
        return

    # Add filters for the data view
    col1, col2, col3 = st.columns(3)

    with col1:
        # Filter by user
        users = sorted(df["User"].dropna().unique())
        selected_user = st.selectbox("Filter by User:", ["All Users"] + list(users))

    with col2:
        # Filter by upskilling issues only
        show_upskilling_only = st.checkbox("Show Upskilling Issues Only", value=True)

    with col3:
        # Filter by tech category ("N/A" is not offered as a choice)
        tech_categories = sorted(df["TechCategory"].dropna().unique())
        tech_categories = [cat for cat in tech_categories if cat != "N/A"]
        selected_tech = st.selectbox("Filter by Technology:", ["All Technologies"] + tech_categories)

    # Apply filters
    filtered_df = df.copy()

    if selected_user != "All Users":
        filtered_df = filtered_df[filtered_df["User"] == selected_user]

    if show_upskilling_only:
        filtered_df = filtered_df[filtered_df["Issue"].apply(wc.is_upskilling_issue)]

    if selected_tech != "All Technologies":
        filtered_df = filtered_df[filtered_df["TechCategory"] == selected_tech]

    # Only show worklog level rows
    filtered_df = filtered_df[filtered_df["Level"] == "worklog"]

    # Select relevant columns to display
    display_columns = ["User", "Issue", "Worklog", "TechCategory", "Logged", "Month"]

    # Display the filtered data
    if filtered_df.empty:
        st.info("No data matches the selected filters.")
    else:
        st.write(f"Showing {len(filtered_df)} records. Use the filters above to narrow down the results.")

        # Rename columns for display
        display_df = filtered_df[display_columns].copy()
        display_df.columns = ["User", "Issue", "Worklog", "Technology", "Hours", "Month"]

        # Sort by user and hours
        display_df = display_df.sort_values(["User", "Hours"], ascending=[True, False])

        # Display as a table
        st.dataframe(display_df, use_container_width=True)

        # Add CSV download button.
        # The download contains every column of the filtered rows, not just
        # the renamed display columns.
        st.download_button(
            label="Download Filtered CSV",
            data=filtered_df.to_csv(index=False).encode('utf-8'),
            file_name="filtered_upskilling_data.csv",
            mime="text/csv",
        )

    # Download full categorized dataset
    st.download_button(
        label="Download Complete Categorized CSV",
        data=df.to_csv(index=False).encode('utf-8'),
        file_name="full_categorized_data.csv",
        mime="text/csv",
    )
588
+
589
def display_tech_category_analysis(team_data, tech_category_data, upskilling_count):
    """Display tech category analysis section for upskilling issues.

    Shows an overall technology pie chart and top-10 table, then either a
    per-technology ranking of team members (when a technology is selected)
    or one tab per team member with that member's technology distribution.

    Args:
        team_data: List of member dicts; the code reads "User",
            "TechCategories" and "UpskillIssues".
        tech_category_data: Dict whose "overall" entry is a list of
            {"Category", "Hours"} records.
        upskilling_count: Number shown in the introductory info box.
    """
    # NOTE(review): indentation was reconstructed from a flattened listing;
    # confirm the extent of the `if not overall_df.empty:` block below.
    st.header("Upskilling Technology Analysis")

    # Info about upskilling issues
    st.info(f"Found {upskilling_count} upskilling-related entries in the data. Technology categories shown below represent only upskilling activities.")

    # User filter for upskilling tech analysis with session state
    tech_user_filter = st.text_input(
        "Filter by Team Member Name:",
        key="tech_user_filter_input"
    )

    # Apply filter without reprocessing data (case-insensitive substring match)
    filtered_team_data = team_data
    if tech_user_filter:
        filtered_team_data = [user for user in team_data if tech_user_filter.lower() in user["User"].lower()]
        if not filtered_team_data:
            st.warning(f"No team members found matching '{tech_user_filter}'")
        else:
            st.success(f"Showing data for {len(filtered_team_data)} team members matching '{tech_user_filter}'")

    # Filter to show only users with actual upskilling data
    upskilling_team_data = [
        user for user in filtered_team_data
        if any(tech["TechCategory"] not in ["nan", "null", "", None, "N/A"] for tech in user["TechCategories"])
    ]

    # If no tech categories found
    if not tech_category_data["overall"] or not upskilling_team_data:
        st.warning("No technology categories found in upskilling data. This could be because there are no upskilling worklog entries or the categorization process failed.")
        return

    # Add overall tech category chart
    st.subheader("Overall Technology Distribution in Upskilling")

    # Convert to DataFrame for Plotly
    overall_df = pd.DataFrame(tech_category_data["overall"])

    # Filter out nan/null values from overall tech categories
    overall_df = overall_df[~overall_df["Category"].isin(["nan", "null", "", "N/A"])].copy()

    if not overall_df.empty:
        fig_tech = px.pie(
            overall_df,
            values="Hours",
            names="Category",
            color_discrete_sequence=COLORS,
            title="Hours by Technology Category in Upskilling Activities"
        )

        fig_tech.update_traces(
            textposition='inside',
            textinfo='percent+label',
            hovertemplate='%{label}: %{value:.1f} hours (%{percent})'
        )

        st.plotly_chart(fig_tech, use_container_width=True)

        # Show table of top categories (first 10 rows, hours as text)
        st.subheader("Top Technology Categories in Upskilling")
        top_tech_df = overall_df.head(10).copy()
        top_tech_df["Hours"] = top_tech_df["Hours"].map(lambda x: f"{x:.1f}")
        st.dataframe(top_tech_df, use_container_width=True)

    # Tech category filters
    st.subheader("Team Member Analysis by Technology")

    col1, col2 = st.columns(2)

    with col1:
        # Get all unique tech categories (excluding nan/null)
        all_categories = [
            item["Category"] for item in tech_category_data["overall"]
            if item["Category"] not in ["nan", "null", "", "N/A"]
        ]
        tech_filter_options = ["All"] + all_categories

        # NOTE(review): this session key is not read in this function; the
        # selectbox below uses the key "tech_category_selector".
        if "selected_tech" not in st.session_state:
            st.session_state.selected_tech = "All"

        selected_tech = st.selectbox(
            "Filter by Technology Category:",
            options=tech_filter_options,
            key="tech_category_selector"
        )

    with col2:
        users_count = len(upskilling_team_data)  # Use filtered upskilling users count
        default_value = min(5, max(1, users_count))

        if users_count <= 1:
            st.write(f"Showing data for {users_count} user")
            min_users = users_count
        else:
            # Despite its label, the value is used below as the number of
            # leading users to show (list slice [:min_users]).
            min_users = st.slider(
                "Minimum users to display:",
                min_value=1,
                max_value=max(2, users_count),
                value=default_value,
                key="min_users_slider"
            )

    # Filter team data by tech category
    if selected_tech != "All":
        tech_filtered_data = [
            user for user in upskilling_team_data
            if any(tc["TechCategory"] == selected_tech for tc in user["TechCategories"])
        ]

        # Calculate hours for each user in this tech category.
        # The dicts are the same objects held in team_data, so "TechHours"
        # is written onto the caller's records.
        for user in tech_filtered_data:
            user["TechHours"] = sum(
                tc["Hours"] for tc in user["TechCategories"]
                if tc["TechCategory"] == selected_tech
            )

        # Sort by tech category hours
        tech_filtered_data.sort(key=lambda x: x.get("TechHours", 0), reverse=True)

        # Create bar chart of users by tech hours
        if tech_filtered_data:
            # Take top users by hours in this category
            top_users = tech_filtered_data[:min_users]

            user_tech_df = pd.DataFrame([
                {"User": user["User"], "Hours": user["TechHours"]}
                for user in top_users
            ])

            fig_users = px.bar(
                user_tech_df,
                x="User",
                y="Hours",
                title=f"Top Users for {selected_tech} in Upskilling",
                color_discrete_sequence=['#8884d8']
            )

            st.plotly_chart(fig_users, use_container_width=True)

            # Team member breakdown (all matching users, not only the charted ones)
            st.subheader(f"Team Members Upskilling in {selected_tech}")

            for i, user in enumerate(tech_filtered_data):
                with st.expander(f"{user['User']} - {format_hours(user['TechHours'])} hours"):
                    # Get issues for this user in this tech category
                    issues = get_user_tech_issues(user, selected_tech)

                    if issues:
                        st.write(f"### Upskilling Issues for {user['User']} in {selected_tech}")

                        # Create an issues table
                        issues_df = pd.DataFrame([
                            {
                                "Issue": issue["issue"],
                                "Worklog": issue["worklog"],
                                "Hours": format_hours(issue["hours"]),
                                "Month": issue["month"]
                            }
                            for issue in issues
                        ])

                        st.dataframe(issues_df, use_container_width=True)
                    else:
                        st.write("No detailed issue information available.")
        else:
            st.info(f"No team members found upskilling in {selected_tech}.")
    else:
        # Show overall tech distribution by team member
        st.subheader("Technology Distribution by Team Member (Upskilling Only)")

        # Display upskilling users only
        if not upskilling_team_data:
            st.info("No team members found with upskilling entries.")
            return

        # Show top users based on selection
        display_users = upskilling_team_data[:min_users]

        # Create tabs for each team member
        tabs = st.tabs([user["User"] for user in display_users])

        for i, tab in enumerate(tabs):
            user = display_users[i]

            with tab:
                # Get tech categories for this user
                user_tech = get_user_tech_categories(user, "All")

                if user_tech:
                    # Convert to DataFrame for Plotly
                    user_tech_df = pd.DataFrame(user_tech)

                    fig_user_tech = px.pie(
                        user_tech_df,
                        values="value",
                        names="name",
                        color_discrete_sequence=COLORS,
                        title=f"Upskilling Technology Distribution for {user['User']}"
                    )

                    fig_user_tech.update_traces(
                        textposition='inside',
                        textinfo='percent+label',
                        hovertemplate='%{label}: %{value:.1f} hours (%{percent})'
                    )

                    st.plotly_chart(fig_user_tech, use_container_width=True)

                    # Show breakdown of upskilling issues
                    if user["UpskillIssues"]:
                        st.subheader(f"Upskilling Issues for {user['User']}")

                        # Filter out issues with only nan values
                        valid_issues = [
                            issue for issue in user["UpskillIssues"]
                            if any(tech not in ["nan", "null", "", None, "N/A"] for tech in issue["TechCategories"])
                        ]

                        if valid_issues:
                            issues_df = pd.DataFrame([
                                {
                                    "Issue": issue["Issue"],
                                    "Hours": format_hours(issue["Hours"]),
                                    "Technologies": ", ".join([
                                        str(tech) for tech in issue["TechCategories"]
                                        if tech not in ["nan", "null", "", None, "N/A"]
                                    ])
                                }
                                for issue in sorted(valid_issues, key=lambda x: x["Hours"], reverse=True)
                            ])

                            st.dataframe(issues_df, use_container_width=True)
                        else:
                            st.info("No upskilling issues with valid technology categories found.")
                else:
                    st.info(f"No upskilling technology categories found for {user['User']}.")
826
+
827
def _sort_indicator(column):
    """Return the arrow shown beside ``column`` when it is the active sort key, else ''."""
    if st.session_state.sort_by != column:
        return ""
    return {"desc": "▼", "asc": "▲"}.get(st.session_state.sort_order, "")


def _render_team_epic_sidebar(team_data, epic_data):
    """Draw the sidebar filters and keep ``st.session_state.selected_epics`` in sync.

    Returns:
        Tuple ``(search_term, display_count, selected_month)`` chosen in the sidebar.
    """
    st.sidebar.title("Filters")

    # Team Member Filters
    st.sidebar.header("Team Member Filters")
    search_term = st.sidebar.text_input("Search by Name:", "")

    member_total = len(team_data)
    display_count_options = [10, 20, 50]
    if member_total > 50:
        # Offer an explicit "All" entry only when the presets cannot cover everyone.
        display_count_options.append(member_total)

    display_count = st.sidebar.selectbox(
        "Team Members to Display:",
        options=display_count_options,
        format_func=lambda x: f"All ({member_total})" if x == member_total else str(x),
        index=1,  # Default to 20
    )

    month_options = ["All", 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
    selected_month = st.sidebar.selectbox(
        "Month:",
        options=month_options,
        index=0,  # Default to "All"
    )

    # Epic Filters
    st.sidebar.header("Epic Filters")
    epic_col1, epic_col2, epic_col3 = st.sidebar.columns(3)

    with epic_col1:
        if st.button("Select All"):
            st.session_state.selected_epics = [epic["Epic"] for epic in epic_data]

    with epic_col2:
        if st.button("Clear All"):
            st.session_state.selected_epics = []

    with epic_col3:
        if st.button("Top 5 Epics"):
            st.session_state.selected_epics = [epic["Epic"] for epic in epic_data[:5]]

    # Epic selection: one checkbox per epic
    st.sidebar.subheader("Select Epics")
    with st.sidebar.container():
        for i, epic in enumerate(epic_data):
            epic_name = epic["Epic"]
            checked = st.checkbox(
                f"{epic_name} ({int(epic['Hours'])}h)",
                value=epic_name in st.session_state.selected_epics,
                key=f"epic_{i}",
            )

            # Mirror the checkbox state into the shared selection list.
            if checked and epic_name not in st.session_state.selected_epics:
                st.session_state.selected_epics.append(epic_name)
            elif not checked and epic_name in st.session_state.selected_epics:
                st.session_state.selected_epics.remove(epic_name)

    return search_term, display_count, selected_month


def _render_team_epic_sort_controls():
    """Draw the "Sort by" / "Order" selectors and store the choice in session state."""
    sort_col1, sort_col2 = st.columns(2)

    with sort_col1:
        selected_epics = st.session_state.selected_epics
        sort_options = ["totalHours", "name"] + selected_epics
        sort_labels = {"totalHours": "Total Hours", "name": "Name"}
        for epic in selected_epics:
            sort_labels[epic] = epic

        # The stored key may be an epic that has since been deselected; without
        # this fallback ``sort_options.index`` raises ValueError on every rerun.
        if st.session_state.sort_by not in sort_options:
            st.session_state.sort_by = "totalHours"

        st.session_state.sort_by = st.selectbox(
            "Sort by:",
            options=sort_options,
            format_func=lambda x: sort_labels[x],
            index=sort_options.index(st.session_state.sort_by),
        )

    with sort_col2:
        sort_order_options = ["desc", "asc"]
        sort_order_labels = {"desc": "Descending", "asc": "Ascending"}

        # Same guard as above for an unexpected stored order value.
        if st.session_state.sort_order not in sort_order_options:
            st.session_state.sort_order = "desc"

        st.session_state.sort_order = st.selectbox(
            "Order:",
            options=sort_order_options,
            format_func=lambda x: sort_order_labels[x],
            index=sort_order_options.index(st.session_state.sort_order),
        )


def _render_team_member_detail(user, selected_month):
    """Render the expanded panel for one team member (charts, epic table, upskilling)."""
    invalid_tech = ["nan", "null", "", None, "N/A"]

    with st.expander("", expanded=True):
        st.subheader(f"{user['User']} - Detailed Breakdown")

        # Create tabs for different views
        user_tab1, user_tab2 = st.tabs(["Epic Distribution", "Upskilling Technologies"])

        with user_tab1:
            # Fetched once; feeds both the pie chart and the details table below.
            user_chart_data = get_user_chart_data(user, selected_month)
            chart_col1, chart_col2 = st.columns(2)

            # Epic Distribution Chart
            with chart_col1:
                st.markdown("#### Epic Distribution")

                if user_chart_data:
                    chart_rows = sorted(user_chart_data, key=lambda x: x["value"], reverse=True)
                    fig_pie = px.pie(
                        pd.DataFrame(chart_rows),
                        values="value",
                        names="name",
                        color_discrete_sequence=COLORS,
                    )
                    fig_pie.update_traces(
                        textposition='inside',
                        textinfo='percent+label',
                        hovertemplate='%{label}: %{value:.1f} hours (%{percent})'
                    )
                    fig_pie.update_layout(
                        height=400,
                        margin=dict(l=10, r=10, t=10, b=10)
                    )
                    st.plotly_chart(fig_pie, use_container_width=True)
                else:
                    st.info("No epic data available for the selected period.")

            # Monthly Distribution Chart
            with chart_col2:
                st.markdown("#### Monthly Distribution")

                # Months without logged hours are dropped from the chart.
                user_monthly = [m for m in get_user_monthly_data(user) if m["hours"] > 0]

                if user_monthly:
                    fig_bar = px.bar(
                        pd.DataFrame(user_monthly),
                        x="month",
                        y="hours",
                        color_discrete_sequence=['#82ca9d']
                    )
                    fig_bar.update_layout(
                        height=400,
                        margin=dict(l=10, r=10, t=10, b=10),
                        xaxis_title="Month",
                        yaxis_title="Hours"
                    )
                    st.plotly_chart(fig_bar, use_container_width=True)
                else:
                    st.info("No monthly data available.")

            # Epic Details Table
            st.markdown("#### Epic Details")

            if user_chart_data:
                total_hours = user["TotalHours"]
                epic_details = []

                for item in user_chart_data:
                    epic_name = item["name"]
                    hours = item["value"]

                    # First breakdown entry for this epic supplies the project name.
                    project = next(
                        (e["Project"] for e in user["EpicBreakdown"] if e["Epic"] == epic_name),
                        "-",
                    )
                    percent = (hours / total_hours * 100) if total_hours > 0 else 0

                    epic_details.append({
                        "Epic": epic_name,
                        "Project": project,
                        "Hours": hours,
                        "% of Total": f"{percent:.1f}%"
                    })

                # Sort by hours descending
                epic_details.sort(key=lambda x: x["Hours"], reverse=True)
                st.dataframe(pd.DataFrame(epic_details), use_container_width=True)
            else:
                st.info("No epic details available for this user.")

        with user_tab2:
            st.markdown("#### Upskilling Technology Distribution")

            user_tech_data = get_user_tech_categories(user, selected_month)

            if user_tech_data:
                fig_tech = px.pie(
                    pd.DataFrame(user_tech_data),
                    values="value",
                    names="name",
                    color_discrete_sequence=COLORS,
                )
                fig_tech.update_traces(
                    textposition='inside',
                    textinfo='percent+label',
                    hovertemplate='%{label}: %{value:.1f} hours (%{percent})'
                )
                st.plotly_chart(fig_tech, use_container_width=True)

                # Upskilling issues
                if user["UpskillIssues"]:
                    st.markdown("#### Upskilling Issues")

                    # Keep only issues that carry at least one usable technology label.
                    valid_issues = [
                        issue for issue in user["UpskillIssues"]
                        if any(tech not in invalid_tech for tech in issue["TechCategories"])
                    ]

                    if valid_issues:
                        issues_df = pd.DataFrame([
                            {
                                "Issue": issue["Issue"],
                                "Hours": format_hours(issue["Hours"]),
                                "Technologies": ", ".join([
                                    str(tech) for tech in issue["TechCategories"]
                                    if tech not in invalid_tech
                                ])
                            }
                            for issue in sorted(valid_issues, key=lambda x: x["Hours"], reverse=True)
                        ])
                        st.dataframe(issues_df, use_container_width=True)
                    else:
                        st.info("No upskilling issues with valid technology categories found.")
            else:
                st.info("No upskilling technology data found for this user.")


def display_team_epic_analysis(team_data, epic_data, monthly_data, unique_epics):
    """Display the team and epic analysis section.

    Renders, in order: the sidebar filters, the monthly overview bar chart, the
    sortable team-member table (with an expandable per-member detail panel and
    a totals row) and the pie chart of the selected epics.

    Args:
        team_data: Per-member records; the code reads "User", "TotalHours",
            "EpicBreakdown" and "UpskillIssues" from each.
        epic_data: Epic records with "Epic" and "Hours"; the first five are
            used for the "Top 5 Epics" button.
        monthly_data: Rows for the overview chart with "Month" and "Hours".
        unique_epics: Epic names passed through to ``get_epic_totals``.

    Session state read/written: ``selected_epics``, ``sort_by``,
    ``sort_order``, ``expanded_user``.
    """
    search_term, display_count, selected_month = _render_team_epic_sidebar(team_data, epic_data)

    # Monthly Overview Chart
    st.header("Monthly Non-Billable Hours Overview")

    monthly_df = pd.DataFrame(monthly_data)
    if not monthly_df.empty:
        fig_monthly = px.bar(
            monthly_df,
            x="Month",
            y="Hours",
            title="",
            labels={"Hours": "Non-Billable Hours", "Month": "Month"},
            color_discrete_sequence=['#8884d8']
        )
        fig_monthly.update_layout(
            plot_bgcolor='white',
            margin=dict(l=20, r=30, t=10, b=20),
        )
        st.plotly_chart(fig_monthly, use_container_width=True)

    # Team Members Table
    st.header("Team Member Breakdown")
    st.write("Click on a team member to see their detailed breakdown.")

    _render_team_epic_sort_controls()

    # Get filtered and sorted data, then apply the display count
    filtered_team_data = get_filtered_data(
        team_data,
        search_term,
        selected_month,
        st.session_state.sort_by,
        st.session_state.sort_order
    )
    table_data = filtered_team_data[:display_count]

    # Calculate epic totals for the rows actually shown
    epic_totals = get_epic_totals(table_data, unique_epics)

    selected_epics = st.session_state.selected_epics

    if not table_data:
        st.warning("No data matches your filters. Try adjusting your search criteria.")
    else:
        # One wide column for the name, then equal columns for totals and epics.
        header_cols = ["Team Member", "Total Hours"] + selected_epics
        header_col_sizes = [3] + [2] * (len(header_cols) - 1)

        header_row = st.columns(header_col_sizes)
        with header_row[0]:
            st.markdown(f"**Team Member {_sort_indicator('name')}**")
        with header_row[1]:
            st.markdown(f"**Total Hours {_sort_indicator('totalHours')}**")
        for i, epic in enumerate(selected_epics):
            with header_row[i+2]:
                st.markdown(f"**{epic} {_sort_indicator(epic)}**")

        # Display each team member as a row
        for user_idx, user in enumerate(table_data):
            with st.container():
                row_cols = st.columns(header_col_sizes)

                # User name cell - clickable to expand/collapse
                with row_cols[0]:
                    is_expanded = st.session_state.expanded_user == user["User"]
                    expand_icon = "🔽" if is_expanded else "🔼"

                    if st.button(f"{user['User']} {expand_icon}", key=f"user_btn_{user_idx}"):
                        st.session_state.expanded_user = None if is_expanded else user["User"]

                # Total hours cell
                with row_cols[1]:
                    st.write(format_hours(user["TotalHours"]))

                # Epic hours cells
                for i, epic in enumerate(selected_epics):
                    with row_cols[i+2]:
                        epic_hours = sum(e["Hours"] for e in user["EpicBreakdown"] if e["Epic"] == epic)
                        st.write(format_hours(epic_hours))

                # Expanded user detail (reads the state possibly updated by the button above)
                if st.session_state.expanded_user == user["User"]:
                    _render_team_member_detail(user, selected_month)

        # Totals row
        st.markdown("---")
        total_row = st.columns(header_col_sizes)

        with total_row[0]:
            st.markdown("**Total**")

        with total_row[1]:
            total_hours = sum(user["TotalHours"] for user in table_data)
            st.markdown(f"**{format_hours(total_hours)}**")

        for i, epic in enumerate(selected_epics):
            with total_row[i+2]:
                st.markdown(f"**{format_hours(epic_totals.get(epic, 0))}**")

    # Epic Distribution Chart
    st.header("Epic Distribution")

    # Filter epic data to only selected epics
    selected_epic_data = [epic for epic in epic_data if epic["Epic"] in selected_epics]

    if selected_epic_data:
        fig_pie = px.pie(
            pd.DataFrame(selected_epic_data),
            values="Hours",
            names="Epic",
            color_discrete_sequence=COLORS,
        )
        fig_pie.update_traces(
            textposition='inside',
            textinfo='percent+label',
            hovertemplate='%{label}: %{value:.1f} hours (%{percent})'
        )
        fig_pie.update_layout(
            height=500,
            margin=dict(l=20, r=20, t=20, b=20)
        )
        st.plotly_chart(fig_pie, use_container_width=True)
    else:
        st.info("Please select at least one epic to display the distribution chart.")
1237
+
1238
def main():
    """Entry point of the Streamlit dashboard.

    While ``st.session_state.processed_data`` is unset the page shows the
    loading flow: an uploaded CSV is processed from scratch (after the user
    has picked optional focus users and confirmed), otherwise a previously
    written ``categorized_data.csv`` is loaded without re-categorizing. Once
    data is available the three analysis tabs are rendered.
    """
    st.title("Non-Billable Time Analysis Dashboard")

    # Clear cache button at the top
    clear_cache_col, _title_col = st.columns([1, 5])
    with clear_cache_col:
        if st.button("🗑️ Clear Cache", key="clear_cache_button"):
            clear_session_and_cache()
            st.success("Cache and files cleared successfully! Please reload the page.")
            st.stop()  # Stop execution to force a reload

    # Check if we need to reload due to cache clearing
    if st.session_state.needs_rerun:
        st.session_state.needs_rerun = False
        st.rerun()

    if st.session_state.processed_data is None:
        # File uploader (outside any container to ensure it's always displayed)
        uploaded_file = st.file_uploader("Upload Project Time Logging CSV", type=["csv"])

        # Check for previously categorized data
        categorized_file = Path("categorized_data.csv")

        if uploaded_file is not None:
            # New file uploaded - process from scratch
            try:
                raw_data = pd.read_csv(uploaded_file)

                # Save uploaded file for reference
                with open("uploaded_data.csv", "wb") as f:
                    f.write(uploaded_file.getvalue())

                # Extract unique users for selection
                unique_users = sorted(raw_data["User"].dropna().unique())
            except Exception as e:
                st.error(f"Error processing uploaded file: {e}")
                return

            # Allow user to select focus users for tech categorization
            st.subheader("Select Users for Focus Tech Categorization")
            focus_users = st.multiselect(
                "Select users to prioritize for tech categorization (optional):",
                options=unique_users,
                default=[]
            )

            # Processing used to start in the same script pass that first drew
            # the multiselect, so it always ran with an empty selection. Wait
            # for an explicit confirmation instead.
            if not st.button("Process Data", key="process_uploaded_data_button"):
                return

            try:
                processed_data = process_data(raw_data, force_categorize=True, focus_users=focus_users)
            except Exception as e:
                st.error(f"Error processing uploaded file: {e}")
                return
        elif categorized_file.exists():
            # Use categorized data if available
            st.info("Using previously categorized data. Upload a new file to reprocess, or click 'Clear Cache' to start fresh.")
            try:
                raw_data = pd.read_csv(categorized_file)

                # Store the categorized DataFrame for download
                st.session_state.categorized_df = raw_data

                # Load data without reprocessing
                processed_data = process_data(raw_data, force_categorize=False)
            except Exception as e:
                st.error(f"Error loading categorized data: {e}")
                return
        else:
            st.error("Please upload a CSV file to begin analysis.")
            return

        if processed_data is None:
            st.error("Error processing data")
            return

        # Kept outside the try blocks so the rerun signal raised by st.rerun()
        # can never be intercepted by the error handlers above.
        st.session_state.processed_data = processed_data
        st.rerun()
    else:
        # Get processed data from session state
        processed = st.session_state.processed_data
        team_data = processed['team_data']
        epic_data = processed['epic_data']
        monthly_data = processed['monthly_data']
        unique_epics = processed['unique_epics']
        tech_category_data = processed['tech_category_data']
        upskilling_count = processed['upskilling_count']

        # Set default selected epics if none are selected
        if not st.session_state.selected_epics and epic_data:
            st.session_state.selected_epics = [epic["Epic"] for epic in epic_data[:5]]  # Top 5 epics

        # Create tabs for different analysis views
        tab1, tab2, tab3 = st.tabs([
            "📊 Team & Epic Analysis",
            "💻 Upskilling Technology Analysis",
            "📋 View & Download CSV Data"
        ])

        # NOTE(review): all three ``with`` bodies run on every script pass, so
        # ``active_tab`` always ends up as 'csv_view' - confirm whether any
        # reader depends on it.
        with tab1:
            st.session_state.active_tab = 'team_analysis'
            display_team_epic_analysis(team_data, epic_data, monthly_data, unique_epics)

        with tab2:
            st.session_state.active_tab = 'tech_analysis'
            display_tech_category_analysis(team_data, tech_category_data, upskilling_count)

        with tab3:
            st.session_state.active_tab = 'csv_view'
            display_categorized_data_view(st.session_state.categorized_df)
1348
+
1349
+ if __name__ == "__main__":
1350
+ main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ plotly
4
+ python-dateutil
5
+ google-generativeai
6
+ python-dotenv
worklog_categorizer.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import google.generativeai as genai
4
+ from functools import lru_cache
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ import pandas as pd
7
+ from pathlib import Path
8
+ import time
9
+ from tqdm import tqdm
10
+ import re
11
+
12
# Configure logging
# NOTE: basicConfig runs at import time, so importing this module sets up
# root-logger handling for the whole process.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Initialize Gemini API
# The key is read from the GEMINI_API_KEY environment variable (os.getenv
# returns None when it is unset).
# NOTE(review): nothing in this module loads a .env file - presumably the
# importing application does so before this import; confirm.
# If initialization raises, `model` is left as None and
# categorize_worklog_batch answers "Unknown" for every worklog.
try:
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
    model = genai.GenerativeModel("gemini-1.5-flash")
    logger.info("Gemini API initialized successfully")
except Exception as e:
    logger.error(f"Error initializing Gemini API: {e}")
    model = None

# Prompt for worklog categorization - modified for batch processing
# `{worklogs}` is the only str.format placeholder; categorize_worklog_batch
# fills it with a numbered list ("1. ...", "2. ...") and parses a numbered
# list back out of the response.
BATCH_CATEGORIZATION_PROMPT = """
You are a technology skill categorizer. Analyze each worklog entry and assign a single technology category word that best represents the technical skill or technology involved.

Guidelines:
1. Respond with ONLY a single word (or hyphenated term if necessary) for each worklog
2. Focus on the core technology, framework, or skill
3. Be specific when the technology is clear (e.g., "React", "Python", "AWS")
4. Use broader categories when specific technology isn't clear (e.g., "Frontend", "Backend", "DevOps")
5. Prefer standard technology names over abbreviations
6. Don't include unnecessary adjectives or descriptions
7. Respond in a numbered list format matching the input worklogs

Examples:
Worklog 1: "fixing issue in next js application" → "NextJS"
Worklog 2: "Task issue fixing - next js application" → "NextJS"
Worklog 3: "Debugging Python script for data analysis" → "Python"
Worklog 4: "Creating responsive CSS layout" → "CSS"
Worklog 5: "Implementing REST API endpoints" → "Backend"

Here are the worklogs to categorize:
{worklogs}

For each worklog, respond with a numbered list containing only the category word for each entry:
1. [category for worklog 1]
2. [category for worklog 2]
...and so on
"""
53
+
54
def is_upskilling_issue(issue_text):
    """
    Tell whether an issue title refers to upskilling.

    The match is case-insensitive and accepts an optional hyphen or single
    whitespace character between "up" and "skill", so "Upskilling",
    "UP-SKILLING" and "Up skilling" all qualify.

    Args:
        issue_text: The issue text to check

    Returns:
        True when the text mentions upskilling; False otherwise, including
        for empty or non-string input
    """
    # Empty values and non-strings (e.g. NaN cells) can never match.
    if not (issue_text and isinstance(issue_text, str)):
        return False

    found = re.search(r'up[-\s]?skill', issue_text, re.IGNORECASE)
    return found is not None
71
+
72
def estimate_token_count(text: str) -> int:
    """
    Roughly estimate how many tokens a text would use.

    Heuristic only: the result is the larger of
    - the number of whitespace-separated words, and
    - one token per four characters plus one token per punctuation/symbol
      character (anything that is neither a word character nor whitespace).

    Args:
        text: The text to estimate token count for

    Returns:
        Estimated token count (0 for empty input)
    """
    if not text:
        return 0

    word_total = len(text.split())

    # Punctuation and other symbols are counted as one token each.
    symbol_total = sum(1 for _ in re.finditer(r'[^\w\s]', text))
    char_based = len(text) // 4 + symbol_total

    return word_total if word_total >= char_based else char_based
104
+
105
def _clean_category(raw: str) -> str:
    """Reduce one response fragment to a single category token ("Unknown" if nothing is left)."""
    # Drop quoting / markdown / placeholder brackets the model may wrap around the word.
    category = raw.strip().strip("\"'`*[]").strip()
    if not category:
        return "Unknown"

    words = category.split()
    if len(words) > 1 and "-" not in category:
        logger.warning(f"Response '{category}' contains multiple words, taking first word")
        category = words[0]
    return category


def _parse_batch_categories(response_text: str, expected: int) -> List[str]:
    """Map a model response onto ``expected`` slots, keyed by each line's own number."""
    categories = ["Unknown"] * expected

    # "[ \t]*" (not "\s*") after the separator keeps an empty "2." line from
    # swallowing the following line as its category.
    numbered = re.findall(r'^\s*(\d+)[.):][ \t]*(.+?)\s*$', response_text, re.MULTILINE)

    if numbered:
        filled = set()
        for number, raw in numbered:
            position = int(number) - 1
            # Place by number so a skipped or repeated item cannot shift the
            # categories of every following worklog.
            if 0 <= position < expected and position not in filled:
                categories[position] = _clean_category(raw)
                filled.add(position)
        found = len(filled)
    else:
        # Fallback: no numbering at all - take non-empty lines in order.
        lines = [line.strip() for line in response_text.splitlines() if line.strip()]
        for position, line in enumerate(lines[:expected]):
            categories[position] = _clean_category(line)
        found = len(lines)

    if found != expected:
        logger.warning(f"Mismatch between number of worklogs ({expected}) and categories ({found})")

    return categories


def categorize_worklog_batch(worklogs: List[str]) -> List[str]:
    """
    Categorize multiple worklog entries with a single API call.

    Args:
        worklogs: List of worklog texts to categorize

    Returns:
        List of categories corresponding to each worklog, always the same
        length as ``worklogs``. Entries are "Unknown" when the model is not
        initialized, the API call fails, or the response has no usable
        answer for that position.
    """
    if not worklogs or model is None:
        return ["Unknown"] * len(worklogs)

    # Collapse whitespace so a multi-line worklog stays a single numbered item,
    # and coerce to str so a non-string cell cannot fail the whole batch.
    texts = [" ".join(str(worklog).split()) for worklog in worklogs]

    # Format worklogs as a numbered list for the prompt
    formatted_worklogs = "\n".join(f"{i}. {text}" for i, text in enumerate(texts, start=1))
    prompt = BATCH_CATEGORIZATION_PROMPT.format(worklogs=formatted_worklogs)

    # Estimate token usage (log output only)
    worklogs_token_count = sum(estimate_token_count(text) for text in texts)
    total_tokens = estimate_token_count(prompt)

    logger.info(f"Sending batch with {len(worklogs)} worklogs (~{worklogs_token_count} worklog tokens, ~{total_tokens} total tokens)")

    try:
        response = model.generate_content(prompt)
        response_text = response.text.strip()
    except Exception as e:
        # Best effort: a failed call degrades to "Unknown" rather than aborting the run.
        logger.error(f"Error categorizing worklog batch: {e}")
        return ["Unknown"] * len(worklogs)

    logger.info(f"Response received: {response_text}")

    categories = _parse_batch_categories(response_text, len(worklogs))

    # Log the results for verification
    for i, (text, category) in enumerate(zip(texts, categories), start=1):
        logger.info(f"Worklog {i}: '{text[:50]}{'...' if len(text) > 50 else ''}' → '{category}'")

    return categories
176
+
177
def batch_process_worklogs(worklogs: List[str], batch_size: int = 10,
                           pause_seconds: int = 5, show_progress: bool = True) -> List[str]:
    """
    Process multiple worklog entries in batches with pauses to avoid rate limits.
    Defaults to 10 worklogs per request with 5 seconds rest between batches.

    Args:
        worklogs: List of worklog texts to categorize
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)
        show_progress: Whether to show a progress bar

    Returns:
        List of categories corresponding to each worklog

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (a negative value used
            to yield an empty result silently).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_worklogs = len(worklogs)
    if total_worklogs == 0:
        logger.info(f"Completed processing {total_worklogs} worklogs")
        return []

    # Create batches
    batches = [worklogs[i:i + batch_size] for i in range(0, total_worklogs, batch_size)]

    # Compare against None explicitly below: a tqdm bar's truth value depends
    # on its total, not on whether the bar exists.
    progress_bar = tqdm(total=total_worklogs, desc="Categorizing worklogs") if show_progress else None

    results = []
    last_index = len(batches) - 1
    try:
        for i, batch in enumerate(batches):
            # Process current batch
            logger.info(f"Processing batch {i+1}/{len(batches)} with {len(batch)} worklogs")
            results.extend(categorize_worklog_batch(batch))

            # Update progress
            if progress_bar is not None:
                progress_bar.update(len(batch))

            # Pause between batches (except after the last batch)
            if i < last_index and pause_seconds > 0:
                logger.info(f"Pausing for {pause_seconds}s before next batch. Processed {len(results)}/{total_worklogs} worklogs")
                if progress_bar is not None:
                    # Sleep one second at a time so the bar can show a countdown.
                    for remaining in range(pause_seconds, 0, -1):
                        progress_bar.set_description(f"Waiting {remaining}s before next batch")
                        time.sleep(1)
                    progress_bar.set_description("Categorizing worklogs")
                else:
                    time.sleep(pause_seconds)
    finally:
        # Always release the bar, even if a batch raised.
        if progress_bar is not None:
            progress_bar.close()

    logger.info(f"Completed processing {total_worklogs} worklogs")
    return results
227
+
228
def process_dataframe(df: pd.DataFrame, worklog_column: str = "Worklog",
                      issue_column: str = "Issue", default_category: str = "N/A",
                      batch_size: int = 10, pause_seconds: int = 5,
                      show_progress: bool = True) -> pd.DataFrame:
    """
    Attach a 'TechCategory' column to a worklog dataframe.

    Rows whose issue text is an upskilling issue get the category produced
    for their worklog text; every other row keeps ``default_category``. Each
    distinct worklog text is categorized once and the result is reused for
    all rows sharing it. The dataframe is modified in place and returned.

    Args:
        df: Pandas DataFrame containing worklog data
        worklog_column: Name of the column containing worklog text
        issue_column: Name of the column containing issue text
        default_category: Value used for rows that are not categorized
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)
        show_progress: Whether to show a progress bar

    Returns:
        The same DataFrame with the additional 'TechCategory' column. If a
        required column is missing, the column holds only the default value.
    """
    # Start from the default everywhere; upskilling rows are overwritten below.
    df["TechCategory"] = default_category

    # Both input columns are required (worklog column is checked first).
    for required_column in (worklog_column, issue_column):
        if required_column not in df.columns:
            logger.error(f"Column '{required_column}' not found in DataFrame")
            return df

    # Select the rows that belong to upskilling issues
    upskilling_mask = df[issue_column].apply(is_upskilling_issue)
    upskilling_rows = df[upskilling_mask].copy()

    logger.info(f"Found {len(upskilling_rows)} rows with upskilling issues out of {len(df)} total rows")

    if upskilling_rows.empty:
        logger.info("No upskilling issues found, returning dataframe with default category values")
        return df

    # Distinct, non-null worklog texts - each is sent to the categorizer once
    unique_worklogs = upskilling_rows[worklog_column].dropna().unique().tolist()

    total_estimated_tokens = 0
    for worklog in unique_worklogs:
        total_estimated_tokens += estimate_token_count(worklog)

    logger.info(f"Processing {len(unique_worklogs)} unique upskilling worklog entries with approximately {total_estimated_tokens} tokens")

    # Build the worklog text -> category lookup
    worklog_to_category = {}
    if unique_worklogs:
        categories = batch_process_worklogs(
            unique_worklogs,
            batch_size=batch_size,
            pause_seconds=pause_seconds,
            show_progress=show_progress
        )
        worklog_to_category = dict(zip(unique_worklogs, categories))

    def _category_for(worklog):
        # Missing worklog text keeps the default; so does any text without a mapping.
        if pd.notna(worklog):
            return worklog_to_category.get(worklog, default_category)
        return default_category

    # Write categories back onto the upskilling rows only
    df.loc[upskilling_mask, "TechCategory"] = df.loc[upskilling_mask, worklog_column].apply(_category_for)

    # Rows whose category differs from the default count as categorized
    categorized_count = len(df[df["TechCategory"] != default_category])
    logger.info(f"Successfully categorized {categorized_count} worklog entries")

    return df
301
+
302
def process_csv_file(
    csv_path: str,
    worklog_column: str = "Worklog",
    issue_column: str = "Issue",
    default_category: str = "N/A",
    output_path: Optional[str] = None,
    overwrite: bool = False,
    batch_size: int = 10,
    pause_seconds: int = 5
) -> str:
    """
    Process a CSV file to add technology categories based on worklog entries.

    The file is loaded into a DataFrame, handed to ``process_dataframe`` (which
    only categorizes worklogs associated with upskilling issues), and the
    result is written back to disk. Worklogs are sent in batches of
    ``batch_size`` with a pause of ``pause_seconds`` between batches.

    Args:
        csv_path: Path to the CSV file to process
        worklog_column: Name of the column containing worklog text
        issue_column: Name of the column containing issue text
        default_category: Default value for non-upskilling worklogs
        output_path: Path to save the processed file (if None, creates a new
            file next to the input with a '_categorized' suffix on the stem)
        overwrite: If True, overwrite the original file; takes precedence
            over ``output_path``
        batch_size: Number of worklogs to process in each batch (default: 10)
        pause_seconds: Seconds to pause between batches (default: 5)

    Returns:
        Path to the saved CSV file, or an empty string if the input file is
        missing or any step of the processing fails (the error is logged,
        not raised).
    """
    try:
        source = Path(csv_path)

        # is_file() (rather than exists()) also rejects directories, which
        # would otherwise fail later inside read_csv with a less clear error.
        if not source.is_file():
            logger.error(f"CSV file not found: {csv_path}")
            return ""

        # Read CSV
        logger.info(f"Reading CSV file: {csv_path}")
        df = pd.read_csv(csv_path)

        # Process dataframe
        processed_df = process_dataframe(
            df,
            worklog_column=worklog_column,
            issue_column=issue_column,
            default_category=default_category,
            batch_size=batch_size,
            pause_seconds=pause_seconds
        )

        # Determine output path; overwrite wins over an explicit output_path.
        if overwrite:
            save_path = csv_path
        elif output_path:
            save_path = output_path
        else:
            # Same result as Path.with_stem(), but with_name() is also
            # available on Python versions older than 3.9.
            save_path = str(
                source.with_name(f"{source.stem}_categorized{source.suffix}")
            )

        # Make sure the destination directory exists so a custom output_path
        # in a not-yet-created folder does not abort the save.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)

        # Save processed dataframe
        processed_df.to_csv(save_path, index=False)
        logger.info(f"Saved categorized CSV to: {save_path}")

        return save_path

    except Exception as e:
        # Top-level boundary: keep the "return empty string on failure"
        # contract, but log the full traceback instead of only str(e).
        logger.exception(f"Error processing CSV file: {e}")
        return ""