File size: 36,386 Bytes
14df11b
 
 
 
 
 
 
3075c3f
 
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
 
 
 
 
 
 
3075c3f
 
14df11b
 
 
3075c3f
 
 
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
 
 
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
14df11b
 
 
3075c3f
14df11b
 
3075c3f
14df11b
 
 
 
 
 
 
 
3075c3f
14df11b
 
3075c3f
 
 
 
 
14df11b
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
14df11b
 
 
 
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
3075c3f
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3075c3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14df11b
 
 
 
 
 
3075c3f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
import streamlit as st
import pandas as pd
import numpy as np
import io
import base64
import plotly.express as px
import datetime
import re
from collections import Counter

def process_genai_data(df):
    """Aggregate raw worklog rows into one summary row per unique user.

    Parameters
    ----------
    df : pd.DataFrame
        Raw worklog with at least 'User', 'GenAI use case description',
        'GenAI Efficiency (Log time in hours)', 'Logged' and 'Required'
        columns; 'Date' and 'Project' are used when present.

    Returns
    -------
    pd.DataFrame
        One row per user (first-occurrence order) with concatenated
        descriptions, first efficiency value, logged/required hour totals,
        utilization %, date range, description quality score and, when a
        'Project' column exists, the user's project list.
    """
    # Preserve first-occurrence order of users.
    unique_users = df['User'].drop_duplicates().reset_index(drop=True)
    result_df = pd.DataFrame({'User': unique_users})

    # Group once instead of re-filtering df for every user
    # (the previous per-user boolean filters were O(users * rows)).
    groups = {user: sub for user, sub in df.groupby('User', sort=False)}
    empty = df.iloc[0:0]  # fallback for users groupby drops (e.g. NaN keys)

    def user_rows(user):
        return groups.get(user, empty)

    def get_descriptions(user):
        # Unique, non-empty descriptions formatted as a dashed list.
        descriptions = user_rows(user)['GenAI use case description'].dropna().unique()
        if len(descriptions) > 0:
            return "\n".join("- " + desc for desc in descriptions)
        return ""

    result_df['GenAI_Descriptions'] = result_df['User'].apply(get_descriptions)

    def get_first_efficiency(user):
        # Efficiency is taken from the user's first occurrence in the log.
        efficiency_values = user_rows(user)['GenAI Efficiency (Log time in hours)']
        return efficiency_values.iloc[0] if len(efficiency_values) > 0 else np.nan

    result_df['GenAI_Efficiency'] = result_df['User'].apply(get_first_efficiency)

    # Hour totals per user.
    result_df['Total_Logged_Hours'] = result_df['User'].apply(
        lambda u: user_rows(u)['Logged'].sum())
    result_df['Total_Required_Hours'] = result_df['User'].apply(
        lambda u: user_rows(u)['Required'].sum())

    # Guard against division by zero: a user with 0 required hours now gets
    # NaN instead of inf, so downstream means/rounding stay meaningful.
    required = result_df['Total_Required_Hours'].replace(0, np.nan)
    result_df['Utilization_Percentage'] = (
        result_df['Total_Logged_Hours'] / required * 100).round(2)

    def get_date_range(user):
        # "min to max" of the user's non-null dates, or "N/A".
        user_logs = user_rows(user)
        if 'Date' in user_logs.columns and not user_logs['Date'].empty:
            dates = user_logs['Date'].dropna()
            if len(dates) > 0:
                return f"{min(dates)} to {max(dates)}"
        return "N/A"

    result_df['Date_Range'] = result_df['User'].apply(get_date_range)

    # Quality score is computed from the concatenated descriptions built above.
    result_df['Description_Quality_Score'] = calculate_description_quality(result_df)

    if 'Project' in df.columns:
        # Distinct projects each user logged time against.
        result_df['Projects'] = result_df['User'].apply(
            lambda u: list(user_rows(u)['Project'].dropna().unique()))

    return result_df

def analyze_projects_by_genai_hours(df):
    """Rank projects by their total GenAI efficiency log hours.

    Hours are de-duplicated per (Project, User) pair first, so repeated
    rows for the same user on the same project are only counted once.

    Returns a dataframe with 'Project', 'Total_GenAI_Hours' (descending)
    and 'User_Count' columns, or None when no 'Project' column exists.
    """
    if 'Project' not in df.columns:
        return None

    hours_col = 'GenAI Efficiency (Log time in hours)'

    # One hours value per (Project, User) pair avoids double counting.
    per_pair = (
        df.groupby(['Project', 'User'])[hours_col]
        .first()
        .reset_index()
    )

    # Total hours per project, highest first.
    totals = (
        per_pair.groupby('Project')[hours_col]
        .sum()
        .reset_index()
        .sort_values(hours_col, ascending=False)
    )
    totals.columns = ['Project', 'Total_GenAI_Hours']

    # Distinct contributor count per project.
    contributors = df.groupby('Project')['User'].nunique().reset_index()
    contributors.columns = ['Project', 'User_Count']

    return pd.merge(totals, contributors, on='Project')

def extract_ai_tools_from_descriptions(df):
    """Extract and count AI tools mentioned in GenAI descriptions.

    Returns a Counter mapping normalized tool names to mention counts.

    Fix: longer tool names are matched first and their text is consumed,
    so e.g. "github copilot" no longer also counts as plain "copilot",
    and "gpt-4" no longer also counts as "gpt" (the hyphen creates a
    word boundary that made both patterns match before).
    """
    # Common AI tools and platforms to look for
    ai_tools = [
        'chatgpt', 'gpt-4', 'gpt-3', 'gpt', 'openai',
        'claude', 'anthropic',
        'gemini', 'bard', 'google ai',
        'copilot', 'github copilot', 'microsoft copilot',
        'dall-e', 'midjourney', 'stable diffusion',
        'hugging face', 'transformers',
        'bert', 'llama', 'mistral',
        'tensorflow', 'pytorch', 'ml',
        'jupyter', 'colab',
        'langchain', 'llm', 'rag'
    ]

    # Dictionary to store normalized tool names
    tool_mapping = {
        'gpt': 'ChatGPT/GPT',
        'gpt-3': 'ChatGPT/GPT',
        'gpt-4': 'ChatGPT/GPT',
        'chatgpt': 'ChatGPT/GPT',
        'openai': 'OpenAI',
        'claude': 'Claude',
        'anthropic': 'Claude',
        'gemini': 'Google AI',
        'bard': 'Google AI',
        'google ai': 'Google AI',
        'copilot': 'GitHub Copilot',
        'github copilot': 'GitHub Copilot',
        'microsoft copilot': 'Microsoft Copilot',
        'dall-e': 'DALL-E',
        'midjourney': 'Midjourney',
        'stable diffusion': 'Stable Diffusion',
        'hugging face': 'Hugging Face',
        'transformers': 'Transformers',
        'bert': 'BERT',
        'llama': 'LLaMA',
        'mistral': 'Mistral AI',
        'tensorflow': 'TensorFlow',
        'pytorch': 'PyTorch',
        'ml': 'Machine Learning',
        'jupyter': 'Jupyter',
        'colab': 'Google Colab',
        'langchain': 'LangChain',
        'llm': 'Large Language Models',
        'rag': 'Retrieval Augmented Generation'
    }

    # Concatenate all descriptions into one lowercase haystack.
    all_descriptions = " ".join(
        df['GenAI use case description'].dropna().astype(str).tolist()).lower()

    # Match longest names first and blank out each match so shorter
    # substrings (e.g. 'copilot' inside 'github copilot') can't re-match.
    tool_counts = Counter()
    for tool in sorted(ai_tools, key=len, reverse=True):
        pattern = re.compile(r'\b' + re.escape(tool) + r'\b')
        count = len(pattern.findall(all_descriptions))
        if count > 0:
            tool_counts[tool_mapping.get(tool, tool)] += count
            all_descriptions = pattern.sub(' ', all_descriptions)

    return tool_counts

def extract_use_cases_from_descriptions(df):
    """Identify common use-case categories in GenAI descriptions.

    Each category counts at most once per description. Returns a Counter
    mapping category name to the number of descriptions mentioning it.

    Fix: keywords are lowercased before matching — descriptions are
    compared in lowercase, so the uppercase keyword 'QA' could never
    match before.
    """
    # Common use case categories to look for
    use_case_keywords = {
        'Code Generation': ['code', 'coding', 'programming', 'script', 'develop', 'algorithm'],
        'Content Creation': ['content', 'write', 'writing', 'draft', 'article', 'blog'],
        'Data Analysis': ['data', 'analysis', 'analyze', 'analytics', 'statistics', 'insights'],
        'Documentation': ['document', 'documentation', 'manual', 'guide', 'readme'],
        'Research': ['research', 'study', 'investigate', 'explore', 'literature'],
        'Summarization': ['summary', 'summarize', 'summarization', 'extract key points'],
        'Translation': ['translate', 'translation', 'language', 'localize'],
        'Image Generation': ['image', 'picture', 'graphic', 'design', 'draw', 'art'],
        'Chatbot': ['chatbot', 'chat', 'conversation', 'dialogue', 'assistant'],
        'Automation': ['automate', 'automation', 'workflow', 'process', 'routine'],
        'Training': ['train', 'training', 'learn', 'learning', 'education'],
        'Testing': ['test', 'testing', 'QA', 'quality assurance', 'debug']
    }

    # Extract all GenAI descriptions
    descriptions = df['GenAI use case description'].dropna().astype(str).tolist()

    use_case_counts = Counter()
    for description in descriptions:
        description_lower = description.lower()
        for use_case, keywords in use_case_keywords.items():
            for keyword in keywords:
                # Lowercase the keyword to match the lowercased description.
                if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', description_lower):
                    use_case_counts[use_case] += 1
                    break  # Count each use case only once per description

    return use_case_counts

def calculate_description_quality(df):
    """Score each user's concatenated GenAI description on a 0-100 scale.

    The raw score combines three weighted factors:
      * length (up to 40 pts, relative to the longest description),
      * specificity (up to 30 pts for named tools, numbers, long text),
      * uniqueness (up to 30 pts for the ratio of distinct content words).
    Raw scores are then normalized so the best description scores 100.

    Fix: removed a dead `scores = pd.Series(0, index=df.index)` local
    that was initialized but never used.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'GenAI_Descriptions' string column.

    Returns
    -------
    pd.Series
        Quality scores rounded to one decimal, aligned to df.index.
    """
    descriptions = df['GenAI_Descriptions']

    # Factor 1: Length score (longer descriptions get more points)
    char_counts = descriptions.str.len()
    max_char_count = char_counts.max() if not char_counts.empty else 1
    length_score = (char_counts / max_char_count) * 40  # 40% weight to length

    # Factor 2: Specificity score (mentions of specific tools or numbers)
    def specificity_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0

        score = 0
        # Check for specific AI tools
        ai_tools = ['gpt', 'chatgpt', 'claude', 'gemini', 'copilot', 'dall-e', 'midjourney']
        for tool in ai_tools:
            if re.search(r'\b' + re.escape(tool) + r'\b', desc.lower()):
                score += 5

        # Numbers can indicate metrics or concrete examples.
        if re.search(r'\d+', desc):
            score += 5

        # Reward detailed explanations.
        if len(desc.split()) > 50:
            score += 10

        return min(score, 30)  # Cap at 30% weight

    specificity_scores = descriptions.apply(specificity_score)

    # Factor 3: Uniqueness score (distinct / total content words)
    def uniqueness_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0

        # Simple word tokenization by splitting on whitespace.
        words = desc.lower().split()

        # Remove common stop words and short words.
        common_stopwords = {"a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
                            "in", "on", "at", "to", "for", "with", "by", "about", "of", "this",
                            "that", "i", "we", "you", "he", "she", "they", "it", "have", "has"}
        filtered_words = [word for word in words if word not in common_stopwords and len(word) > 2]

        if filtered_words:
            uniqueness = len(set(filtered_words)) / len(filtered_words)
            return uniqueness * 30  # 30% weight to uniqueness
        return 0

    uniqueness_scores = descriptions.apply(uniqueness_score)

    # Combine and normalize to a 0-100 scale (best description == 100).
    total_scores = length_score + specificity_scores + uniqueness_scores
    max_score = total_scores.max() if not total_scores.empty else 1
    return (total_scores / max_score * 100).round(1)

def get_download_link(df, filename):
    """Build an HTML anchor that downloads *df* as a two-sheet Excel file.

    Sheet 'Processed Data' holds the full dataframe; sheet 'Summary'
    holds headline metrics. The workbook bytes are embedded base64 in a
    data: URI so no server-side file is needed.

    Fix: the `filename` argument is now used for the download attribute;
    it was previously accepted but ignored in favor of a hard-coded name.
    """
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Processed Data')

        # Headline metrics; idxmax is only valid when the column has
        # at least one non-null value, hence the isna().all() guards.
        summary = pd.DataFrame({
            'Metric': ['Total Users', 'Average GenAI Efficiency (hours)', 'Average Utilization (%)',
                      'Top GenAI User', 'Top Quality Score'],
            'Value': [
                len(df),
                round(df['GenAI_Efficiency'].mean(), 2),
                round(df['Utilization_Percentage'].mean(), 2),
                df.loc[df['GenAI_Efficiency'].idxmax(), 'User'] if not df['GenAI_Efficiency'].isna().all() else 'N/A',
                df.loc[df['Description_Quality_Score'].idxmax(), 'User'] if not df['Description_Quality_Score'].isna().all() else 'N/A'
            ]
        })
        summary.to_excel(writer, index=False, sheet_name='Summary')

    binary_data = output.getvalue()
    b64 = base64.b64encode(binary_data).decode()
    href = f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" download="{filename}" class="download-button">Download Excel file</a>'
    return href

# Custom CSS
def local_css():
    """Inject the app's shared CSS into the Streamlit page.

    Styles the download button, metric cards, highlight boxes, the
    quality-score color classes (high/medium/low) and the team-category
    tables rendered elsewhere via raw HTML.
    """
    css = """
    <style>
    .download-button {
        display: inline-block;
        padding: 0.5em 1em;
        color: #ffffff;
        background-color: #4CAF50;
        border-radius: 4px;
        text-decoration: none;
        font-weight: bold;
        transition: background-color 0.3s;
    }
    .download-button:hover {
        background-color: #45a049;
    }
    .stMetric {
        background-color: #f0f2f6;
        padding: 15px;
        border-radius: 5px;
    }
    .highlight-box {
        background-color: #f8f9fa;
        border-left: 5px solid #4CAF50;
        padding: 15px;
        margin: 10px 0;
        border-radius: 0 5px 5px 0;
    }
    .quality-high {
        color: #4CAF50;
        font-weight: bold;
    }
    .quality-medium {
        color: #FFC107;
        font-weight: bold;
    }
    .quality-low {
        color: #F44336;
        font-weight: bold;
    }
    /* Team category styling */
    table {
        width: 100%;
        border-collapse: collapse;
        margin-bottom: 20px;
    }
    th {
        background-color: #f2f2f2;
        padding: 8px;
        text-align: left;
        border: 1px solid #ddd;
    }
    td {
        padding: 8px;
        border: 1px solid #ddd;
    }
    tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    tr:hover {
        background-color: #f0f0f0;
    }
    </style>
    """
    # unsafe_allow_html is required for the raw <style> block to take effect.
    st.markdown(css, unsafe_allow_html=True)

# Main app
# Page setup: wide layout, shared CSS, title and intro copy.  Note that
# st.set_page_config must be the first Streamlit call on the page.
st.set_page_config(page_title="GenAI Worklog Processor", layout="wide")
local_css()

st.title("GenAI Worklog Data Processor")
st.markdown("""
This app processes worklog data to extract insights about GenAI usage:
1. Creates a list of unique users
2. Concatenates GenAI use case descriptions for each user with proper formatting
3. Captures GenAI efficiency values and other metrics
4. Identifies projects with highest GenAI usage
5. Analyzes most common AI tools and use cases
6. Identifies prompt champions based on quality metrics
""")

# File uploader
# Accepts a single worklog in CSV or Excel form; the rest of the script
# only runs once a file has been provided.
uploaded_file = st.file_uploader("Upload your worklog CSV or Excel file", type=["csv", "xlsx", "xls"])

# Main processing pipeline.  Flow: read the upload -> validate columns ->
# (on button press) aggregate per-user data, analyze projects/tools/use
# cases, render the download link, insight panels, charts and summary
# metrics.  The helper functions (process_genai_data,
# analyze_projects_by_genai_hours, extract_ai_tools_from_descriptions,
# extract_use_cases_from_descriptions) are defined earlier in this file.
# NOTE(review): the emoji string literals below ("πŸ”", "πŸ†", ...) look
# mojibake-encoded (UTF-8 bytes decoded under a legacy codepage) — verify
# the source file's encoding and restore the intended emoji.
if uploaded_file is not None:
    try:
        # Read the file — dispatch on extension: CSV via read_csv,
        # anything else (xlsx/xls per the uploader filter) via read_excel.
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
        
        # Display the original data
        st.subheader("Original Data Preview")
        st.dataframe(df.head())
        
        # Check if required columns exist.  Only the first three entries are
        # hard requirements; 'Required'/'Logged'/'Date'/'Project' are
        # appended opportunistically when present (they enable extra views).
        required_columns = ['User', 'GenAI use case description', 'GenAI Efficiency (Log time in hours)']
        for col in ['Required', 'Logged', 'Date', 'Project']:
            if col in df.columns:
                required_columns.append(col)
        
        # Only the three mandatory columns are checked for absence.
        missing_columns = [col for col in required_columns[:3] if col not in df.columns]
        
        if missing_columns:
            st.warning(f"The following required columns are missing: {', '.join(missing_columns)}")
            st.markdown("""
            For full functionality, your file should contain these columns:
            - User
            - GenAI use case description
            - GenAI Efficiency (Log time in hours)
            - Required
            - Logged
            - Date
            - Project (optional but recommended for project analysis)
            """)
            
            # Stop if essential columns are missing
            if any(col in missing_columns for col in ['User', 'GenAI use case description']):
                st.error("Cannot continue without essential columns.")
                st.stop()
            
            # Continue with available columns
            st.info("Continuing with available columns...")
        
        # Process the data
        if st.button("Process Data"):
            with st.spinner("Processing data..."):
                # Per-user aggregation (descriptions, efficiency, hours, ...).
                result_df = process_genai_data(df)
                
                # Get project analysis if available
                project_analysis = None
                if 'Project' in df.columns:
                    project_analysis = analyze_projects_by_genai_hours(df)
                
                # Get AI tools usage (tool name -> mention count).
                ai_tool_counts = extract_ai_tools_from_descriptions(df)
                
                # Get use case analysis (use case -> occurrence count).
                use_case_counts = extract_use_cases_from_descriptions(df)
            
            # Display the result
            st.subheader("Processed Data")
            st.dataframe(result_df)
            
            # Download link — timestamped filename so repeated exports
            # don't collide.
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            st.subheader("Download Processed Data")
            st.markdown(get_download_link(result_df, f"genai_processed_data_{timestamp}.xlsx"), unsafe_allow_html=True)
            
            # NEW INSIGHTS SECTION
            st.header("πŸ” Advanced GenAI Insights")
            
            # 1. Project with highest GenAI efficacy log hours.
            # project_analysis is assumed sorted descending by
            # Total_GenAI_Hours (iloc[0] is treated as the top project) —
            # this is established by analyze_projects_by_genai_hours.
            if project_analysis is not None and not project_analysis.empty:
                st.subheader("πŸ† Project with Highest GenAI Efficacy Hours")
                
                top_project = project_analysis.iloc[0]
                
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown(f"""
                    <div class="highlight-box">
                        <h3>{top_project['Project']}</h3>
                        <p>Total GenAI Hours: <b>{round(top_project['Total_GenAI_Hours'], 2)}</b></p>
                        <p>Number of Users: <b>{top_project['User_Count']}</b></p>
                        <p>Average Hours per User: <b>{round(top_project['Total_GenAI_Hours'] / top_project['User_Count'], 2)}</b></p>
                    </div>
                    """, unsafe_allow_html=True)
                
                with col2:
                    # Bar chart of top 5 projects
                    top_projects = project_analysis.head(5)
                    fig = px.bar(
                        top_projects, 
                        x='Project', 
                        y='Total_GenAI_Hours',
                        title='Top 5 Projects by GenAI Hours',
                        color='Total_GenAI_Hours',
                        color_continuous_scale='Viridis'
                    )
                    fig.update_layout(xaxis_title="Project", yaxis_title="Total GenAI Hours")
                    st.plotly_chart(fig, use_container_width=True)
                
                # Full project analysis
                st.markdown("### All Projects Analysis")
                st.dataframe(project_analysis)
            
            # 2. Most prominent use cases of AI tools
            st.subheader("πŸ“Š Most Prominent AI Use Cases")
            
            col1, col2 = st.columns(2)
            
            with col1:
                # AI Tools Analysis
                st.markdown("### Top AI Tools Mentioned")
                
                if ai_tool_counts:
                    # Convert to dataframe for visualization
                    ai_tools_df = pd.DataFrame({
                        'Tool': list(ai_tool_counts.keys()),
                        'Mentions': list(ai_tool_counts.values())
                    }).sort_values('Mentions', ascending=False)
                    
                    fig = px.bar(
                        ai_tools_df.head(10), 
                        x='Tool', 
                        y='Mentions',
                        title='Top 10 AI Tools Mentioned',
                        color='Mentions',
                        color_continuous_scale='Blues'
                    )
                    fig.update_layout(xaxis_title="AI Tool", yaxis_title="Number of Mentions")
                    st.plotly_chart(fig, use_container_width=True)
                    
                    # Top tool insight
                    if not ai_tools_df.empty:
                        top_tool = ai_tools_df.iloc[0]
                        st.markdown(f"""
                        <div class="highlight-box">
                            <p>Most used AI tool: <b>{top_tool['Tool']}</b> with {top_tool['Mentions']} mentions</p>
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.info("No specific AI tools were identified in the descriptions.")
            
            with col2:
                # Use Cases Analysis
                st.markdown("### Top Use Cases")
                
                if use_case_counts:
                    # Convert to dataframe for visualization
                    use_cases_df = pd.DataFrame({
                        'Use Case': list(use_case_counts.keys()),
                        'Count': list(use_case_counts.values())
                    }).sort_values('Count', ascending=False)
                    
                    fig = px.pie(
                        use_cases_df.head(5), 
                        names='Use Case', 
                        values='Count',
                        title='Top 5 GenAI Use Cases',
                        hole=0.4
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    
                    # Top use case insight
                    if not use_cases_df.empty:
                        top_use_case = use_cases_df.iloc[0]
                        st.markdown(f"""
                        <div class="highlight-box">
                            <p>Most common use case: <b>{top_use_case['Use Case']}</b> mentioned in {top_use_case['Count']} descriptions</p>
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.info("No specific use cases were identified in the descriptions.")
            
            # 3. Champion of the prompt with quality GenAI Description
            st.subheader("πŸ‘‘ GenAI Prompt Champions")
            
            if 'Description_Quality_Score' in result_df.columns:
                # Get top 3 users by quality score
                top_quality_users = result_df.sort_values('Description_Quality_Score', ascending=False).head(3)
                
                # Display top champion
                if not top_quality_users.empty:
                    champion = top_quality_users.iloc[0]
                    st.markdown(f"""
                    <div class="highlight-box">
                        <h3>πŸ† Prompt Champion: {champion['User']}</h3>
                        <p>Quality Score: <span class="quality-high">{champion['Description_Quality_Score']}/100</span></p>
                        <p>GenAI Efficiency: {round(champion['GenAI_Efficiency'], 2)} hours</p>
                        <p><b>GenAI Descriptions:</b></p>
                        <pre>{champion['GenAI_Descriptions']}</pre>
                    </div>
                    """, unsafe_allow_html=True)
                
                # Quality score distribution
                st.markdown("### Quality Score Distribution")
                fig = px.histogram(
                    result_df, 
                    x='Description_Quality_Score',
                    nbins=10,
                    title='Distribution of GenAI Description Quality Scores',
                    color_discrete_sequence=['#4CAF50']
                )
                fig.update_layout(xaxis_title="Quality Score", yaxis_title="Number of Users")
                st.plotly_chart(fig, use_container_width=True)
                
                # Quality score by user with team categorization
                st.markdown("### Quality Scores by User & Team Categories")
                
                # Create a more comprehensive dataframe for team identification
                team_df = result_df[['User', 'Description_Quality_Score', 'GenAI_Efficiency', 'Total_Logged_Hours']].copy()
                
                # Ensure we have numeric values for calculations; coercion
                # failures become NaN and are zero-filled.
                team_df['Description_Quality_Score'] = pd.to_numeric(team_df['Description_Quality_Score'], errors='coerce').fillna(0)
                team_df['GenAI_Efficiency'] = pd.to_numeric(team_df['GenAI_Efficiency'], errors='coerce').fillna(0)
                team_df['Total_Logged_Hours'] = pd.to_numeric(team_df['Total_Logged_Hours'], errors='coerce').fillna(0)
                
                # Calculate a combined score (weighted average of quality and hours)
                # Weight: 60% quality, 40% efficiency hours.
                # The fallbacks (100 / 1) avoid division by zero on empty or
                # all-zero data.
                max_quality = team_df['Description_Quality_Score'].max() if not team_df.empty and team_df['Description_Quality_Score'].max() > 0 else 100
                max_hours = team_df['GenAI_Efficiency'].max() if not team_df.empty and team_df['GenAI_Efficiency'].max() > 0 else 1
                
                team_df['Quality_Normalized'] = team_df['Description_Quality_Score'] / max_quality * 100
                team_df['Hours_Normalized'] = team_df['GenAI_Efficiency'] / max_hours * 100
                team_df['Combined_Score'] = (team_df['Quality_Normalized'] * 0.6) + (team_df['Hours_Normalized'] * 0.4)
                
                # Assign team categories based on combined score and individual metrics.
                # First matching rule wins; closes over max_hours from above.
                def assign_team_category(row):
                    quality = row['Description_Quality_Score']
                    hours = row['GenAI_Efficiency']
                    combined = row['Combined_Score']
                    
                    if quality >= 80 and hours >= (max_hours * 0.7):
                        return "πŸ”₯ GenAI Champion", "Masters of both quality and quantity"
                    elif quality >= 70:
                        return "✨ Prompt Expert", "High-quality prompt crafters"
                    elif hours >= (max_hours * 0.8):
                        return "⚑ Power User", "High volume GenAI users"
                    elif combined >= 60:
                        return "🌟 Balanced Performer", "Good balance of quality and usage"
                    elif quality >= 50:
                        return "πŸ“ Quality Focused", "Focuses on quality over quantity"
                    elif hours > 0:
                        return "πŸ” Exploring User", "Beginning GenAI journey"
                    else:
                        return "❓ Inactive", "Little to no GenAI usage"
                
                # Apply the team categorization; result_type='expand' splits
                # the returned 2-tuple into the two new columns.
                team_df[['Team_Category', 'Category_Description']] = team_df.apply(assign_team_category, axis=1, result_type='expand')
                
                # Sort by combined score
                team_df = team_df.sort_values('Combined_Score', ascending=False)
                
                # Add color coding based on quality score (maps to the CSS
                # classes injected by local_css).
                def quality_color(score):
                    if score >= 70:
                        return 'quality-high'
                    elif score >= 40:
                        return 'quality-medium'
                    else:
                        return 'quality-low'
                
                team_df['Score_Display'] = team_df['Description_Quality_Score'].apply(
                    lambda x: f'<span class="{quality_color(x)}">{x}</span>'
                )
                
                # Create a display dataframe with the relevant columns
                display_df = team_df[['User', 'Score_Display', 'GenAI_Efficiency', 'Team_Category', 'Category_Description']]
                display_df.columns = ['User', 'Quality Score', 'GenAI Hours', 'Team Category', 'Description']
                
                # Display as a styled dataframe; escape=False keeps the
                # <span> markup in Score_Display live.
                st.write(display_df.to_html(escape=False), unsafe_allow_html=True)
                
                # Team distribution pie chart
                st.markdown("### Team Category Distribution")
                team_counts = team_df['Team_Category'].value_counts().reset_index()
                team_counts.columns = ['Team_Category', 'Count']
                
                fig = px.pie(
                    team_counts, 
                    names='Team_Category', 
                    values='Count',
                    title='Distribution of Team Categories',
                    color_discrete_sequence=px.colors.qualitative.Bold
                )
                st.plotly_chart(fig, use_container_width=True)
                
                # Quality factors explanation
                st.markdown("""
                ### How Quality Scores Are Calculated
                
                The quality score is based on these factors:
                
                1. **Length & Detail (40%)**: Longer, more detailed descriptions score higher
                2. **Specificity (30%)**: Mentions of specific AI tools, metrics, and technical details
                3. **Uniqueness (30%)**: Variety of terms and concepts used
                
                Scores range from 0-100, with higher scores indicating more comprehensive and useful GenAI descriptions.
                """)
            
            # Data visualization section (original visualizations)
            st.header("πŸ“ˆ Data Visualization")
            
            # Tab layout for visualizations
            tab1, tab2, tab3, tab4 = st.tabs(["GenAI Efficiency", "Utilization", "User Analysis", "Tools & Use Cases"])
            
            with tab1:
                # GenAI Efficiency by User
                if 'GenAI_Efficiency' in result_df.columns:
                    st.subheader("GenAI Efficiency by User")
                    sorted_df = result_df.sort_values('GenAI_Efficiency', ascending=False)
                    fig = px.bar(
                        sorted_df, 
                        x='User', 
                        y='GenAI_Efficiency',
                        title='GenAI Efficiency Hours by User',
                        color='GenAI_Efficiency',
                        color_continuous_scale='Viridis'
                    )
                    fig.update_layout(xaxis_title="User", yaxis_title="Hours")
                    st.plotly_chart(fig, use_container_width=True)
            
            with tab2:
                # Utilization Percentage
                if 'Utilization_Percentage' in result_df.columns:
                    st.subheader("Utilization Percentage by User")
                    sorted_df = result_df.sort_values('Utilization_Percentage', ascending=False)
                    fig = px.bar(
                        sorted_df, 
                        x='User', 
                        y='Utilization_Percentage',
                        title='Utilization Percentage by User',
                        color='Utilization_Percentage',
                        color_continuous_scale='RdYlGn'
                    )
                    fig.update_layout(xaxis_title="User", yaxis_title="Utilization %")
                    st.plotly_chart(fig, use_container_width=True)
                    
                    # Required vs Logged Hours
                    if 'Total_Required_Hours' in result_df.columns and 'Total_Logged_Hours' in result_df.columns:
                        st.subheader("Required vs Logged Hours by User")
                        fig = px.bar(
                            result_df, 
                            x='User', 
                            y=['Total_Required_Hours', 'Total_Logged_Hours'],
                            title='Required vs Logged Hours by User',
                            barmode='group'
                        )
                        fig.update_layout(xaxis_title="User", yaxis_title="Hours")
                        st.plotly_chart(fig, use_container_width=True)
            
            with tab3:
                # User with GenAI descriptions — boolean mask splits users
                # into has-description / no-description counts for the pie.
                st.subheader("Users with GenAI Use Cases")
                has_description = result_df['GenAI_Descriptions'] != ""
                fig = px.pie(
                    names=['Has GenAI Use Cases', 'No GenAI Use Cases'],
                    values=[result_df[has_description].shape[0], result_df[~has_description].shape[0]],
                    title='Users with GenAI Use Cases'
                )
                st.plotly_chart(fig, use_container_width=True)
            
            with tab4:
                # Combined tools and use cases view
                st.subheader("AI Tools and Use Cases")
                
                if ai_tool_counts and use_case_counts:
                    col1, col2 = st.columns(2)
                    
                    with col1:
                        # Word cloud of AI tools (text representation:
                        # each tool repeated once per mention).
                        st.markdown("### AI Tools Word Cloud")
                        ai_tools_text = " ".join([f"{tool} " * count for tool, count in ai_tool_counts.items()])
                        st.text_area("", ai_tools_text, height=200)
                    
                    with col2:
                        # Use cases bar chart
                        use_cases_df = pd.DataFrame({
                            'Use Case': list(use_case_counts.keys()),
                            'Count': list(use_case_counts.values())
                        }).sort_values('Count', ascending=False)
                        
                        fig = px.bar(
                            use_cases_df, 
                            x='Use Case', 
                            y='Count',
                            title='All GenAI Use Cases',
                            color='Count',
                            color_continuous_scale='YlOrRd'
                        )
                        fig.update_layout(xaxis_title="Use Case", yaxis_title="Count")
                        st.plotly_chart(fig, use_container_width=True)
            
            # Summary statistics
            st.subheader("Summary Statistics")
            
            col1, col2, col3 = st.columns(3)
            
            with col1:
                st.metric("Total Users", len(result_df))
            
            with col2:
                # Average efficiency
                avg_efficiency = result_df['GenAI_Efficiency'].mean()
                if not pd.isna(avg_efficiency):
                    st.metric("Avg GenAI Efficiency (hours)", round(avg_efficiency, 2))
            
            with col3:
                # Average utilization
                if 'Utilization_Percentage' in result_df.columns:
                    avg_util = result_df['Utilization_Percentage'].mean()
                    st.metric("Avg Utilization %", f"{round(avg_util, 2)}%")
            
            # New row of metrics
            col1, col2, col3 = st.columns(3)
            
            with col1:
                if ai_tool_counts:
                    top_tool = max(ai_tool_counts.items(), key=lambda x: x[1])[0]
                    st.metric("Most Used AI Tool", top_tool)
            
            with col2:
                if use_case_counts:
                    top_use_case = max(use_case_counts.items(), key=lambda x: x[1])[0]
                    st.metric("Top Use Case", top_use_case)
            
            with col3:
                if 'Description_Quality_Score' in result_df.columns and not result_df.empty:
                    avg_quality = result_df['Description_Quality_Score'].mean()
                    st.metric("Avg Description Quality", f"{round(avg_quality, 1)}/100")
            
    # UI boundary: the broad catch turns any parse/processing failure into a
    # user-visible error message instead of a stack trace.
    # NOTE(review): this also hides programming errors — consider narrowing
    # (e.g. to ValueError / KeyError / pandas errors) or logging the traceback.
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.markdown("Please check your file format and try again.")

# Footer
st.markdown("---")
# BUG FIX (encoding): the separator was the mojibake sequence "β€’"
# (the UTF-8 bytes of the bullet decoded under a legacy codepage);
# restore the intended "•" bullet character.
st.markdown("**Enhanced GenAI Worklog Processor** • Built with Streamlit and Pandas")