import streamlit as st
import pandas as pd
import numpy as np
import io
import base64
import plotly.express as px
import datetime
import re
from collections import Counter

def process_genai_data(df):
    # Create a new dataframe with unique users
    unique_users = df['User'].drop_duplicates().reset_index(drop=True)
    result_df = pd.DataFrame(unique_users, columns=['User'])

    # For each unique user, find all their GenAI use case descriptions and join them
    def get_descriptions(user):
        # Filter descriptions for the current user and drop empty values
        descriptions = df[df['User'] == user]['GenAI use case description'].dropna().unique()
        # Join the descriptions with newlines, prefixing each line with "- "
        if len(descriptions) > 0:
            formatted_descriptions = ["- " + desc for desc in descriptions]
            return "\n".join(formatted_descriptions)
        return ""

    # Apply the function to get concatenated descriptions for each user
    result_df['GenAI_Descriptions'] = result_df['User'].apply(get_descriptions)

    # Get the GenAI Efficiency value for the first occurrence of each user
    def get_first_efficiency(user):
        efficiency_values = df[df['User'] == user]['GenAI Efficiency (Log time in hours)']
        if len(efficiency_values) > 0:
            return efficiency_values.iloc[0]
        return np.nan

    result_df['GenAI_Efficiency'] = result_df['User'].apply(get_first_efficiency)

    # Total logged and required hours per user; the optional columns fall back
    # to NaN when absent, so the app really can continue with available columns
    def column_sum(user, column):
        if column not in df.columns:
            return np.nan
        return df[df['User'] == user][column].sum()

    result_df['Total_Logged_Hours'] = result_df['User'].apply(lambda u: column_sum(u, 'Logged'))
    result_df['Total_Required_Hours'] = result_df['User'].apply(lambda u: column_sum(u, 'Required'))

    # Calculate the utilization percentage; replace a zero required total with
    # NaN so the division cannot produce inf
    required = result_df['Total_Required_Hours'].replace(0, np.nan)
    result_df['Utilization_Percentage'] = (result_df['Total_Logged_Hours'] / required * 100).round(2)
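    # Illustrative example: 152 logged hours against 160 required hours
    # yields 152 / 160 * 100 = 95.0.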
    # Get the date range for each user
    def get_date_range(user):
        user_logs = df[df['User'] == user]
        if 'Date' in user_logs.columns and not user_logs['Date'].empty:
            dates = user_logs['Date'].dropna()
            if len(dates) > 0:
                return f"{min(dates)} to {max(dates)}"
        return "N/A"

    result_df['Date_Range'] = result_df['User'].apply(get_date_range)

    # Add the description quality score
    result_df['Description_Quality_Score'] = calculate_description_quality(result_df)

    # Get project data if available
    if 'Project' in df.columns:
        # Get a list of projects for each user
        def get_projects(user):
            projects = df[df['User'] == user]['Project'].dropna().unique()
            return list(projects)

        result_df['Projects'] = result_df['User'].apply(get_projects)

    return result_df
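
# Illustrative output of process_genai_data (values invented for illustration):
#   User | GenAI_Descriptions              | GenAI_Efficiency | Total_Logged_Hours | Utilization_Percentage
#   ana  | "- Drafted tests with ChatGPT"  | 3.5              | 152.0              | 95.0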

def analyze_projects_by_genai_hours(df):
    """
    Analyzes which projects have the highest GenAI efficiency log hours.
    Returns a dataframe with projects and their total GenAI hours.
    Fix: avoid double-counting hours by first reducing to unique
    Project-User combinations.
    """
    if 'Project' not in df.columns:
        return None

    # First, get unique Project-User combinations with their GenAI hours.
    # This avoids double-counting hours for the same user on the same project.
    user_project_hours = df.groupby(['Project', 'User'])['GenAI Efficiency (Log time in hours)'].first().reset_index()
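    # Illustrative example: if user "ana" appears on project "Alpha" in three
    # log rows that each repeat the same 2.0 GenAI hours, .first() keeps a
    # single 2.0 for the (Alpha, ana) pair instead of contributing 6.0 below.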
    # Now sum up the hours by project
    project_hours = user_project_hours.groupby('Project')['GenAI Efficiency (Log time in hours)'].sum().reset_index()
    project_hours = project_hours.sort_values('GenAI Efficiency (Log time in hours)', ascending=False)
    project_hours.columns = ['Project', 'Total_GenAI_Hours']

    # Add the user count per project
    project_users = df.groupby('Project')['User'].nunique().reset_index()
    project_users.columns = ['Project', 'User_Count']

    # Merge the dataframes
    project_analysis = pd.merge(project_hours, project_users, on='Project')
    return project_analysis

def extract_ai_tools_from_descriptions(df):
    """
    Extracts and counts AI tools mentioned in GenAI descriptions.
    Returns a Counter mapping normalized tool names to frequencies.
    """
    # Common AI tools and platforms to look for
    ai_tools = [
        'chatgpt', 'gpt-4', 'gpt-3', 'gpt', 'openai',
        'claude', 'anthropic',
        'gemini', 'bard', 'google ai',
        'copilot', 'github copilot', 'microsoft copilot',
        'dall-e', 'midjourney', 'stable diffusion',
        'hugging face', 'transformers',
        'bert', 'llama', 'mistral',
        'tensorflow', 'pytorch', 'ml',
        'jupyter', 'colab',
        'langchain', 'llm', 'rag'
    ]
    # Mapping from raw mentions to normalized tool names
    tool_mapping = {
        'gpt': 'ChatGPT/GPT',
        'gpt-3': 'ChatGPT/GPT',
        'gpt-4': 'ChatGPT/GPT',
        'chatgpt': 'ChatGPT/GPT',
        'openai': 'OpenAI',
        'claude': 'Claude',
        'anthropic': 'Claude',
        'gemini': 'Google AI',
        'bard': 'Google AI',
        'google ai': 'Google AI',
        'copilot': 'GitHub Copilot',
        'github copilot': 'GitHub Copilot',
        'microsoft copilot': 'Microsoft Copilot',
        'dall-e': 'DALL-E',
        'midjourney': 'Midjourney',
        'stable diffusion': 'Stable Diffusion',
        'hugging face': 'Hugging Face',
        'transformers': 'Transformers',
        'bert': 'BERT',
        'llama': 'LLaMA',
        'mistral': 'Mistral AI',
        'tensorflow': 'TensorFlow',
        'pytorch': 'PyTorch',
        'ml': 'Machine Learning',
        'jupyter': 'Jupyter',
        'colab': 'Google Colab',
        'langchain': 'LangChain',
        'llm': 'Large Language Models',
        'rag': 'Retrieval Augmented Generation'
    }

    # Pool all GenAI descriptions into one lowercase string
    all_descriptions = " ".join(df['GenAI use case description'].dropna().astype(str).tolist()).lower()

    # Count occurrences of each tool. Longer names are matched first and then
    # masked out of the text so that, e.g., "github copilot" is not counted a
    # second time as a bare "copilot" mention.
    tool_counts = Counter()
    for tool in sorted(ai_tools, key=len, reverse=True):
        pattern = r'\b' + re.escape(tool) + r'\b'
        count = len(re.findall(pattern, all_descriptions))
        if count > 0:
            tool_counts[tool_mapping.get(tool, tool)] += count
            all_descriptions = re.sub(pattern, ' ', all_descriptions)
    return tool_counts
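
# Note on matching: the \b word boundaries in extract_ai_tools_from_descriptions
# keep substrings from matching, e.g. 'ml' matches "used ml daily" but not
# "html" or "xml".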

def extract_use_cases_from_descriptions(df):
    """
    Analyzes GenAI descriptions to identify common use cases.
    Returns a Counter mapping use case categories to frequencies.
    """
    # Common use case categories and the keywords that indicate them
    use_case_keywords = {
        'Code Generation': ['code', 'coding', 'programming', 'script', 'develop', 'algorithm'],
        'Content Creation': ['content', 'write', 'writing', 'draft', 'article', 'blog'],
        'Data Analysis': ['data', 'analysis', 'analyze', 'analytics', 'statistics', 'insights'],
        'Documentation': ['document', 'documentation', 'manual', 'guide', 'readme'],
        'Research': ['research', 'study', 'investigate', 'explore', 'literature'],
        'Summarization': ['summary', 'summarize', 'summarization', 'extract key points'],
        'Translation': ['translate', 'translation', 'language', 'localize'],
        'Image Generation': ['image', 'picture', 'graphic', 'design', 'draw', 'art'],
        'Chatbot': ['chatbot', 'chat', 'conversation', 'dialogue', 'assistant'],
        'Automation': ['automate', 'automation', 'workflow', 'process', 'routine'],
        'Training': ['train', 'training', 'learn', 'learning', 'education'],
        'Testing': ['test', 'testing', 'QA', 'quality assurance', 'debug']
    }

    # Extract all GenAI descriptions
    descriptions = df['GenAI use case description'].dropna().astype(str).tolist()

    # Count occurrences of each use case
    use_case_counts = Counter()
    for description in descriptions:
        description_lower = description.lower()
        for use_case, keywords in use_case_keywords.items():
            for keyword in keywords:
                # Lowercase the keyword too, so entries like 'QA' can match
                # against the lowercased description text
                if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', description_lower):
                    use_case_counts[use_case] += 1
                    break  # Count each use case at most once per description
    return use_case_counts
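
# Illustrative example: "used a script to automate the weekly report draft"
# increments Code Generation ('script'), Automation ('automate'), and
# Content Creation ('draft') once each; the break keeps any single
# description from counting twice toward the same category.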

def calculate_description_quality(df):
    """
    Calculates a quality score for each user's GenAI description.
    The score is based on length, specificity, and uniqueness.
    Returns a Series with quality scores.
    """
    descriptions = df['GenAI_Descriptions']

    # Factor 1: Length score (longer descriptions get more points)
    char_counts = descriptions.str.len()
    max_char_count = char_counts.max() if not char_counts.empty and char_counts.max() > 0 else 1
    length_score = (char_counts / max_char_count) * 40  # 40% weight for length

    # Factor 2: Specificity score (mentions of specific tools or numbers)
    def specificity_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        score = 0
        # Check for specific AI tools
        ai_tools = ['gpt', 'chatgpt', 'claude', 'gemini', 'copilot', 'dall-e', 'midjourney']
        for tool in ai_tools:
            if re.search(r'\b' + re.escape(tool) + r'\b', desc.lower()):
                score += 5
        # Check for numbers (could indicate metrics or specific examples)
        if re.search(r'\d+', desc):
            score += 5
        # Check for detailed explanations
        if len(desc.split()) > 50:  # Long descriptions
            score += 10
        return min(score, 30)  # Cap at 30% weight

    specificity_scores = descriptions.apply(specificity_score)

    # Factor 3: Uniqueness score
    def uniqueness_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        # Simple word tokenization by splitting on whitespace
        words = desc.lower().split()
        # Remove common stop words and short words
        common_stopwords = {"a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
                            "in", "on", "at", "to", "for", "with", "by", "about", "of", "this",
                            "that", "i", "we", "you", "he", "she", "they", "it", "have", "has"}
        filtered_words = [word for word in words if word not in common_stopwords and len(word) > 2]
        # Ratio of unique words to total (filtered) words
        if filtered_words:
            uniqueness = len(set(filtered_words)) / len(filtered_words)
            return uniqueness * 30  # 30% weight for uniqueness
        return 0

    uniqueness_scores = descriptions.apply(uniqueness_score)

    # Combine the scores and normalize to a 0-100 scale; guard against a zero
    # maximum when every description is empty
    total_scores = length_score + specificity_scores + uniqueness_scores
    max_score = total_scores.max() if not total_scores.empty and total_scores.max() > 0 else 1
    normalized_scores = (total_scores / max_score * 100).round(1)
    return normalized_scores
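
# Illustrative example: a 300-character description (longest in the set is
# 600) scores 20 length points; naming 'chatgpt' (+5) plus citing a number
# (+5) adds 10 specificity points; 20 distinct words out of 25 filtered
# words add 0.8 * 30 = 24 uniqueness points, for 54 raw points, which are
# then rescaled so the best description in the set shows 100.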

def get_download_link(df, filename):
    """Generate a download link for the dataframe as an Excel file."""
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Processed Data')
        # Add a summary sheet
        summary = pd.DataFrame({
            'Metric': ['Total Users', 'Average GenAI Efficiency (hours)', 'Average Utilization (%)',
                       'Top GenAI User', 'Top Quality Score'],
            'Value': [
                len(df),
                round(df['GenAI_Efficiency'].mean(), 2),
                round(df['Utilization_Percentage'].mean(), 2),
                df.loc[df['GenAI_Efficiency'].idxmax(), 'User'] if not df['GenAI_Efficiency'].isna().all() else 'N/A',
                df.loc[df['Description_Quality_Score'].idxmax(), 'User'] if not df['Description_Quality_Score'].isna().all() else 'N/A'
            ]
        })
        summary.to_excel(writer, index=False, sheet_name='Summary')
    binary_data = output.getvalue()
    b64 = base64.b64encode(binary_data).decode()
    href = (
        f'<a href="data:application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;base64,{b64}" '
        f'download="{filename}" class="download-button">Download Excel file</a>'
    )
    return href
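
# Note: the file is embedded in the page as a base64 data URI inside an <a>
# tag, which keeps styling control via the .download-button CSS class. For
# large exports, Streamlit's built-in st.download_button avoids inflating the
# rendered HTML with the encoded payload.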

# Custom CSS
def local_css():
    st.markdown("""
    <style>
    .download-button {
        display: inline-block;
        padding: 0.5em 1em;
        color: #ffffff;
        background-color: #4CAF50;
        border-radius: 4px;
        text-decoration: none;
        font-weight: bold;
        transition: background-color 0.3s;
    }
    .download-button:hover {
        background-color: #45a049;
    }
    .stMetric {
        background-color: #f0f2f6;
        padding: 15px;
        border-radius: 5px;
    }
    .highlight-box {
        background-color: #f8f9fa;
        border-left: 5px solid #4CAF50;
        padding: 15px;
        margin: 10px 0;
        border-radius: 0 5px 5px 0;
    }
    .quality-high {
        color: #4CAF50;
        font-weight: bold;
    }
    .quality-medium {
        color: #FFC107;
        font-weight: bold;
    }
    .quality-low {
        color: #F44336;
        font-weight: bold;
    }
    /* Team category styling */
    table {
        width: 100%;
        border-collapse: collapse;
        margin-bottom: 20px;
    }
    th {
        background-color: #f2f2f2;
        padding: 8px;
        text-align: left;
        border: 1px solid #ddd;
    }
    td {
        padding: 8px;
        border: 1px solid #ddd;
    }
    tr:nth-child(even) {
        background-color: #f9f9f9;
    }
    tr:hover {
        background-color: #f0f0f0;
    }
    </style>
    """, unsafe_allow_html=True)

# Main app
st.set_page_config(page_title="GenAI Worklog Processor", layout="wide")
local_css()

st.title("GenAI Worklog Data Processor")
st.markdown("""
This app processes worklog data to extract insights about GenAI usage:
1. Creates a list of unique users
2. Concatenates GenAI use case descriptions for each user with proper formatting
3. Captures GenAI efficiency values and other metrics
4. Identifies projects with the highest GenAI usage
5. Analyzes the most common AI tools and use cases
6. Identifies prompt champions based on quality metrics
""")

# File uploader
uploaded_file = st.file_uploader("Upload your worklog CSV or Excel file", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    try:
        # Read the file
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        # Display the original data
        st.subheader("Original Data Preview")
        st.dataframe(df.head())

        # Check that the required columns exist
        required_columns = ['User', 'GenAI use case description', 'GenAI Efficiency (Log time in hours)']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            st.warning(f"The following required columns are missing: {', '.join(missing_columns)}")
            st.markdown("""
            For full functionality, your file should contain these columns:
            - User
            - GenAI use case description
            - GenAI Efficiency (Log time in hours)
            - Required
            - Logged
            - Date
            - Project (optional but recommended for project analysis)
            """)
            # Stop if essential columns are missing
            if any(col in missing_columns for col in ['User', 'GenAI use case description']):
                st.error("Cannot continue without essential columns.")
                st.stop()
            # Continue with available columns
            st.info("Continuing with available columns...")
        # Process the data
        if st.button("Process Data"):
            with st.spinner("Processing data..."):
                result_df = process_genai_data(df)

                # Get the project analysis if available
                project_analysis = None
                if 'Project' in df.columns:
                    project_analysis = analyze_projects_by_genai_hours(df)

                # Get AI tools usage
                ai_tool_counts = extract_ai_tools_from_descriptions(df)
                # Get the use case analysis
                use_case_counts = extract_use_cases_from_descriptions(df)

            # Display the result
            st.subheader("Processed Data")
            st.dataframe(result_df)

            # Download link
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            st.subheader("Download Processed Data")
            st.markdown(get_download_link(result_df, f"genai_processed_data_{timestamp}.xlsx"), unsafe_allow_html=True)
            # NEW INSIGHTS SECTION
            st.header("🔍 Advanced GenAI Insights")

            # 1. Project with the highest GenAI efficiency log hours
            if project_analysis is not None and not project_analysis.empty:
                st.subheader("🏆 Project with Highest GenAI Efficiency Hours")
                top_project = project_analysis.iloc[0]
                col1, col2 = st.columns(2)
                with col1:
                    st.markdown(f"""
                    <div class="highlight-box">
                        <h3>{top_project['Project']}</h3>
                        <p>Total GenAI Hours: <b>{round(top_project['Total_GenAI_Hours'], 2)}</b></p>
                        <p>Number of Users: <b>{top_project['User_Count']}</b></p>
                        <p>Average Hours per User: <b>{round(top_project['Total_GenAI_Hours'] / top_project['User_Count'], 2)}</b></p>
                    </div>
                    """, unsafe_allow_html=True)
                with col2:
                    # Bar chart of the top 5 projects
                    top_projects = project_analysis.head(5)
                    fig = px.bar(
                        top_projects,
                        x='Project',
                        y='Total_GenAI_Hours',
                        title='Top 5 Projects by GenAI Hours',
                        color='Total_GenAI_Hours',
                        color_continuous_scale='Viridis'
                    )
                    fig.update_layout(xaxis_title="Project", yaxis_title="Total GenAI Hours")
                    st.plotly_chart(fig, use_container_width=True)

                # Full project analysis
                st.markdown("### All Projects Analysis")
                st.dataframe(project_analysis)
            # 2. Most prominent use cases of AI tools
            st.subheader("📊 Most Prominent AI Use Cases")
            col1, col2 = st.columns(2)
            with col1:
                # AI tools analysis
                st.markdown("### Top AI Tools Mentioned")
                if ai_tool_counts:
                    # Convert to a dataframe for visualization
                    ai_tools_df = pd.DataFrame({
                        'Tool': list(ai_tool_counts.keys()),
                        'Mentions': list(ai_tool_counts.values())
                    }).sort_values('Mentions', ascending=False)
                    fig = px.bar(
                        ai_tools_df.head(10),
                        x='Tool',
                        y='Mentions',
                        title='Top 10 AI Tools Mentioned',
                        color='Mentions',
                        color_continuous_scale='Blues'
                    )
                    fig.update_layout(xaxis_title="AI Tool", yaxis_title="Number of Mentions")
                    st.plotly_chart(fig, use_container_width=True)

                    # Top tool insight
                    if not ai_tools_df.empty:
                        top_tool = ai_tools_df.iloc[0]
                        st.markdown(f"""
                        <div class="highlight-box">
                            <p>Most used AI tool: <b>{top_tool['Tool']}</b> with {top_tool['Mentions']} mentions</p>
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.info("No specific AI tools were identified in the descriptions.")
            with col2:
                # Use cases analysis
                st.markdown("### Top Use Cases")
                if use_case_counts:
                    # Convert to a dataframe for visualization
                    use_cases_df = pd.DataFrame({
                        'Use Case': list(use_case_counts.keys()),
                        'Count': list(use_case_counts.values())
                    }).sort_values('Count', ascending=False)
                    fig = px.pie(
                        use_cases_df.head(5),
                        names='Use Case',
                        values='Count',
                        title='Top 5 GenAI Use Cases',
                        hole=0.4
                    )
                    st.plotly_chart(fig, use_container_width=True)

                    # Top use case insight
                    if not use_cases_df.empty:
                        top_use_case = use_cases_df.iloc[0]
                        st.markdown(f"""
                        <div class="highlight-box">
                            <p>Most common use case: <b>{top_use_case['Use Case']}</b> mentioned in {top_use_case['Count']} descriptions</p>
                        </div>
                        """, unsafe_allow_html=True)
                else:
                    st.info("No specific use cases were identified in the descriptions.")
            # 3. Prompt champions with the highest-quality GenAI descriptions
            st.subheader("👑 GenAI Prompt Champions")
            if 'Description_Quality_Score' in result_df.columns:
                # Get the top 3 users by quality score
                top_quality_users = result_df.sort_values('Description_Quality_Score', ascending=False).head(3)

                # Display the top champion
                if not top_quality_users.empty:
                    champion = top_quality_users.iloc[0]
                    st.markdown(f"""
                    <div class="highlight-box">
                        <h3>🏆 Prompt Champion: {champion['User']}</h3>
                        <p>Quality Score: <span class="quality-high">{champion['Description_Quality_Score']}/100</span></p>
                        <p>GenAI Efficiency: {round(champion['GenAI_Efficiency'], 2)} hours</p>
                        <p><b>GenAI Descriptions:</b></p>
                        <pre>{champion['GenAI_Descriptions']}</pre>
                    </div>
                    """, unsafe_allow_html=True)

                # Quality score distribution
                st.markdown("### Quality Score Distribution")
                fig = px.histogram(
                    result_df,
                    x='Description_Quality_Score',
                    nbins=10,
                    title='Distribution of GenAI Description Quality Scores',
                    color_discrete_sequence=['#4CAF50']
                )
                fig.update_layout(xaxis_title="Quality Score", yaxis_title="Number of Users")
                st.plotly_chart(fig, use_container_width=True)

                # Quality scores by user with team categorization
                st.markdown("### Quality Scores by User & Team Categories")
                # Build a dataframe for team identification
                team_df = result_df[['User', 'Description_Quality_Score', 'GenAI_Efficiency', 'Total_Logged_Hours']].copy()
                # Ensure we have numeric values for the calculations
                team_df['Description_Quality_Score'] = pd.to_numeric(team_df['Description_Quality_Score'], errors='coerce').fillna(0)
                team_df['GenAI_Efficiency'] = pd.to_numeric(team_df['GenAI_Efficiency'], errors='coerce').fillna(0)
                team_df['Total_Logged_Hours'] = pd.to_numeric(team_df['Total_Logged_Hours'], errors='coerce').fillna(0)

                # Calculate a combined score: a weighted average of 60% quality
                # and 40% efficiency hours, each normalized to a 0-100 scale
                max_quality = team_df['Description_Quality_Score'].max() if not team_df.empty and team_df['Description_Quality_Score'].max() > 0 else 100
                max_hours = team_df['GenAI_Efficiency'].max() if not team_df.empty and team_df['GenAI_Efficiency'].max() > 0 else 1
                team_df['Quality_Normalized'] = team_df['Description_Quality_Score'] / max_quality * 100
                team_df['Hours_Normalized'] = team_df['GenAI_Efficiency'] / max_hours * 100
                team_df['Combined_Score'] = (team_df['Quality_Normalized'] * 0.6) + (team_df['Hours_Normalized'] * 0.4)
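                # Illustrative example: quality 75 (max 100) and 4.0 GenAI hours
                # (max 8.0) give a combined score of 0.6 * 75 + 0.4 * 50 = 65.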
                # Assign team categories based on the combined score and individual metrics
                def assign_team_category(row):
                    quality = row['Description_Quality_Score']
                    hours = row['GenAI_Efficiency']
                    combined = row['Combined_Score']
                    if quality >= 80 and hours >= (max_hours * 0.7):
                        return "🔥 GenAI Champion", "Masters of both quality and quantity"
                    elif quality >= 70:
                        return "✨ Prompt Expert", "High-quality prompt crafters"
                    elif hours >= (max_hours * 0.8):
                        return "⚡ Power User", "High-volume GenAI users"
                    elif combined >= 60:
                        return "🌟 Balanced Performer", "Good balance of quality and usage"
                    elif quality >= 50:
                        return "📝 Quality Focused", "Focuses on quality over quantity"
                    elif hours > 0:
                        return "🔍 Exploring User", "Beginning their GenAI journey"
                    else:
                        return "❓ Inactive", "Little to no GenAI usage"
                # Apply the team categorization
                team_df[['Team_Category', 'Category_Description']] = team_df.apply(assign_team_category, axis=1, result_type='expand')
                # Sort by combined score
                team_df = team_df.sort_values('Combined_Score', ascending=False)

                # Add color coding based on quality score
                def quality_color(score):
                    if score >= 70:
                        return 'quality-high'
                    elif score >= 40:
                        return 'quality-medium'
                    else:
                        return 'quality-low'

                team_df['Score_Display'] = team_df['Description_Quality_Score'].apply(
                    lambda x: f'<span class="{quality_color(x)}">{x}</span>'
                )

                # Create a display dataframe with the relevant columns
                display_df = team_df[['User', 'Score_Display', 'GenAI_Efficiency', 'Team_Category', 'Category_Description']].copy()
                display_df.columns = ['User', 'Quality Score', 'GenAI Hours', 'Team Category', 'Description']

                # Display as a styled HTML table
                st.write(display_df.to_html(escape=False), unsafe_allow_html=True)

                # Team distribution pie chart
                st.markdown("### Team Category Distribution")
                team_counts = team_df['Team_Category'].value_counts().reset_index()
                team_counts.columns = ['Team_Category', 'Count']
                fig = px.pie(
                    team_counts,
                    names='Team_Category',
                    values='Count',
                    title='Distribution of Team Categories',
                    color_discrete_sequence=px.colors.qualitative.Bold
                )
                st.plotly_chart(fig, use_container_width=True)

                # Quality factors explanation
                st.markdown("""
                ### How Quality Scores Are Calculated
                The quality score is based on these factors:
                1. **Length & Detail (40%)**: Longer, more detailed descriptions score higher
                2. **Specificity (30%)**: Mentions of specific AI tools, metrics, and technical details
                3. **Uniqueness (30%)**: Variety of terms and concepts used

                Scores range from 0-100, with higher scores indicating more comprehensive and useful GenAI descriptions.
                """)
            # Data visualization section (original visualizations)
            st.header("📈 Data Visualization")

            # Tab layout for the visualizations
            tab1, tab2, tab3, tab4 = st.tabs(["GenAI Efficiency", "Utilization", "User Analysis", "Tools & Use Cases"])

            with tab1:
                # GenAI efficiency by user
                if 'GenAI_Efficiency' in result_df.columns:
                    st.subheader("GenAI Efficiency by User")
                    sorted_df = result_df.sort_values('GenAI_Efficiency', ascending=False)
                    fig = px.bar(
                        sorted_df,
                        x='User',
                        y='GenAI_Efficiency',
                        title='GenAI Efficiency Hours by User',
                        color='GenAI_Efficiency',
                        color_continuous_scale='Viridis'
                    )
                    fig.update_layout(xaxis_title="User", yaxis_title="Hours")
                    st.plotly_chart(fig, use_container_width=True)

            with tab2:
                # Utilization percentage
                if 'Utilization_Percentage' in result_df.columns:
                    st.subheader("Utilization Percentage by User")
                    sorted_df = result_df.sort_values('Utilization_Percentage', ascending=False)
                    fig = px.bar(
                        sorted_df,
                        x='User',
                        y='Utilization_Percentage',
                        title='Utilization Percentage by User',
                        color='Utilization_Percentage',
                        color_continuous_scale='RdYlGn'
                    )
                    fig.update_layout(xaxis_title="User", yaxis_title="Utilization %")
                    st.plotly_chart(fig, use_container_width=True)

                # Required vs logged hours
                if 'Total_Required_Hours' in result_df.columns and 'Total_Logged_Hours' in result_df.columns:
                    st.subheader("Required vs Logged Hours by User")
                    fig = px.bar(
                        result_df,
                        x='User',
                        y=['Total_Required_Hours', 'Total_Logged_Hours'],
                        title='Required vs Logged Hours by User',
                        barmode='group'
                    )
                    fig.update_layout(xaxis_title="User", yaxis_title="Hours")
                    st.plotly_chart(fig, use_container_width=True)

            with tab3:
                # Users with GenAI descriptions
                st.subheader("Users with GenAI Use Cases")
                has_description = result_df['GenAI_Descriptions'] != ""
                fig = px.pie(
                    names=['Has GenAI Use Cases', 'No GenAI Use Cases'],
                    values=[result_df[has_description].shape[0], result_df[~has_description].shape[0]],
                    title='Users with GenAI Use Cases'
                )
                st.plotly_chart(fig, use_container_width=True)

            with tab4:
                # Combined tools and use cases view
                st.subheader("AI Tools and Use Cases")
                if ai_tool_counts and use_case_counts:
                    col1, col2 = st.columns(2)
                    with col1:
                        # Frequency-weighted text stand-in for a word cloud
                        st.markdown("### AI Tools Word Cloud")
                        ai_tools_text = " ".join([f"{tool} " * count for tool, count in ai_tool_counts.items()])
                        st.text_area("AI tools word cloud", ai_tools_text, height=200, label_visibility="collapsed")
                    with col2:
                        # Use cases bar chart
                        use_cases_df = pd.DataFrame({
                            'Use Case': list(use_case_counts.keys()),
                            'Count': list(use_case_counts.values())
                        }).sort_values('Count', ascending=False)
                        fig = px.bar(
                            use_cases_df,
                            x='Use Case',
                            y='Count',
                            title='All GenAI Use Cases',
                            color='Count',
                            color_continuous_scale='YlOrRd'
                        )
                        fig.update_layout(xaxis_title="Use Case", yaxis_title="Count")
                        st.plotly_chart(fig, use_container_width=True)
            # Summary statistics
            st.subheader("Summary Statistics")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total Users", len(result_df))
            with col2:
                # Average efficiency
                avg_efficiency = result_df['GenAI_Efficiency'].mean()
                if not pd.isna(avg_efficiency):
                    st.metric("Avg GenAI Efficiency (hours)", round(avg_efficiency, 2))
            with col3:
                # Average utilization (skipped when no utilization data exists)
                if 'Utilization_Percentage' in result_df.columns:
                    avg_util = result_df['Utilization_Percentage'].mean()
                    if not pd.isna(avg_util):
                        st.metric("Avg Utilization %", f"{round(avg_util, 2)}%")

            # Second row of metrics
            col1, col2, col3 = st.columns(3)
            with col1:
                if ai_tool_counts:
                    top_tool = max(ai_tool_counts.items(), key=lambda x: x[1])[0]
                    st.metric("Most Used AI Tool", top_tool)
            with col2:
                if use_case_counts:
                    top_use_case = max(use_case_counts.items(), key=lambda x: x[1])[0]
                    st.metric("Top Use Case", top_use_case)
            with col3:
                if 'Description_Quality_Score' in result_df.columns and not result_df.empty:
                    avg_quality = result_df['Description_Quality_Score'].mean()
                    st.metric("Avg Description Quality", f"{round(avg_quality, 1)}/100")
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.markdown("Please check your file format and try again.")

# Footer
st.markdown("---")
st.markdown("**Enhanced GenAI Worklog Processor** • Built with Streamlit and Pandas")