import streamlit as st
import pandas as pd
import numpy as np
import io
import base64
import plotly.express as px
import datetime
import re
from collections import Counter


def process_genai_data(df):
    """Build a one-row-per-user summary of GenAI usage from raw worklog rows.

    For each unique user the summary contains: a bullet list of their unique
    GenAI use case descriptions, the first recorded GenAI efficiency value,
    total logged/required hours, utilization percentage, the date range of
    their entries, a description quality score and (when a 'Project' column
    exists) the list of projects they appear on.

    Parameters
    ----------
    df : pd.DataFrame
        Raw worklog data. Must contain 'User', 'GenAI use case description'
        and 'GenAI Efficiency (Log time in hours)'; 'Logged', 'Required',
        'Date' and 'Project' are optional.

    Returns
    -------
    pd.DataFrame
        One row per unique user, in first-appearance order.
    """
    # One row per unique user, preserving first-appearance order.
    unique_users = df['User'].drop_duplicates().reset_index(drop=True)
    result_df = pd.DataFrame(unique_users, columns=['User'])

    def get_descriptions(user):
        # Unique non-null descriptions for this user, one "- " bullet per line.
        descriptions = df[df['User'] == user]['GenAI use case description'].dropna().unique()
        if len(descriptions) > 0:
            return "\n".join("- " + desc for desc in descriptions)
        return ""

    result_df['GenAI_Descriptions'] = result_df['User'].apply(get_descriptions)

    def get_first_efficiency(user):
        # Efficiency is taken from the user's first row only (assumed to be
        # recorded once per user rather than per log entry).
        efficiency_values = df[df['User'] == user]['GenAI Efficiency (Log time in hours)']
        if len(efficiency_values) > 0:
            return efficiency_values.iloc[0]
        return np.nan

    result_df['GenAI_Efficiency'] = result_df['User'].apply(get_first_efficiency)

    def sum_user_column(user, column):
        # FIX: 'Logged' and 'Required' are optional upstream (the uploader
        # only warns when they are missing); the original raised KeyError here.
        if column not in df.columns:
            return np.nan
        return df[df['User'] == user][column].sum()

    result_df['Total_Logged_Hours'] = result_df['User'].apply(
        lambda u: sum_user_column(u, 'Logged'))
    result_df['Total_Required_Hours'] = result_df['User'].apply(
        lambda u: sum_user_column(u, 'Required'))

    # FIX: map +/-inf (required hours == 0) to NaN so the UI never shows "inf%".
    result_df['Utilization_Percentage'] = (
        result_df['Total_Logged_Hours'] / result_df['Total_Required_Hours'] * 100
    ).replace([np.inf, -np.inf], np.nan).round(2)

    def get_date_range(user):
        # Human-readable "min to max" span of this user's non-null dates.
        user_logs = df[df['User'] == user]
        if 'Date' in user_logs.columns and not user_logs['Date'].empty:
            dates = user_logs['Date'].dropna()
            if len(dates) > 0:
                return f"{min(dates)} to {max(dates)}"
        return "N/A"

    result_df['Date_Range'] = result_df['User'].apply(get_date_range)

    # Quality score is computed on the aggregated per-user descriptions.
    result_df['Description_Quality_Score'] = calculate_description_quality(result_df)

    if 'Project' in df.columns:
        def get_projects(user):
            return list(df[df['User'] == user]['Project'].dropna().unique())

        result_df['Projects'] = result_df['User'].apply(get_projects)

    return result_df


def analyze_projects_by_genai_hours(df):
    """Rank projects by total GenAI efficiency log hours.

    To avoid double-counting, only the first efficiency value per unique
    (Project, User) pair is summed. Returns a dataframe with columns
    'Project', 'Total_GenAI_Hours' and 'User_Count', sorted descending by
    hours, or None when the data has no 'Project' column.
    """
    if 'Project' not in df.columns:
        return None

    # First efficiency value per unique Project-User combination; this avoids
    # counting the same user's hours multiple times within one project.
    user_project_hours = (
        df.groupby(['Project', 'User'])['GenAI Efficiency (Log time in hours)']
        .first()
        .reset_index()
    )

    # Total the per-user hours by project, highest first.
    project_hours = (
        user_project_hours.groupby('Project')['GenAI Efficiency (Log time in hours)']
        .sum()
        .reset_index()
    )
    project_hours = project_hours.sort_values(
        'GenAI Efficiency (Log time in hours)', ascending=False)
    project_hours.columns = ['Project', 'Total_GenAI_Hours']

    # Distinct contributor count per project.
    project_users = df.groupby('Project')['User'].nunique().reset_index()
    project_users.columns = ['Project', 'User_Count']

    project_analysis = pd.merge(project_hours, project_users, on='Project')
    return project_analysis


def extract_ai_tools_from_descriptions(df):
    """Count mentions of known AI tools in the GenAI descriptions.

    Returns a Counter mapping a normalized tool name (e.g. 'ChatGPT/GPT') to
    its total number of whole-word mentions across all descriptions.

    NOTE(review): overlapping keywords such as 'gpt' and 'gpt-4' both match
    the text "gpt-4" (hyphen is a word boundary), so such mentions are
    counted more than once under the same normalized name — confirm whether
    that inflation is intended.
    """
    # Common AI tools and platforms to look for.
    ai_tools = [
        'chatgpt', 'gpt-4', 'gpt-3', 'gpt', 'openai', 'claude', 'anthropic',
        'gemini', 'bard', 'google ai', 'copilot', 'github copilot',
        'microsoft copilot', 'dall-e', 'midjourney', 'stable diffusion',
        'hugging face', 'transformers', 'bert', 'llama', 'mistral',
        'tensorflow', 'pytorch', 'ml', 'jupyter', 'colab', 'langchain',
        'llm', 'rag'
    ]

    # Normalize keyword variants to one display name.
    tool_mapping = {
        'gpt': 'ChatGPT/GPT',
        'gpt-3': 'ChatGPT/GPT',
        'gpt-4': 'ChatGPT/GPT',
        'chatgpt': 'ChatGPT/GPT',
        'openai': 'OpenAI',
        'claude': 'Claude',
        'anthropic': 'Claude',
        'gemini': 'Google AI',
        'bard': 'Google AI',
        'google ai': 'Google AI',
        'copilot': 'GitHub Copilot',
        'github copilot': 'GitHub Copilot',
        'microsoft copilot': 'Microsoft Copilot',
        'dall-e': 'DALL-E',
        'midjourney': 'Midjourney',
        'stable diffusion': 'Stable Diffusion',
        'hugging face': 'Hugging Face',
        'transformers': 'Transformers',
        'bert': 'BERT',
        'llama': 'LLaMA',
        'mistral': 'Mistral AI',
        'tensorflow': 'TensorFlow',
        'pytorch': 'PyTorch',
        'ml': 'Machine Learning',
        'jupyter': 'Jupyter',
        'colab': 'Google Colab',
        'langchain': 'LangChain',
        'llm': 'Large Language Models',
        'rag': 'Retrieval Augmented Generation'
    }

    # One lowercase haystack containing every description.
    all_descriptions = " ".join(
        df['GenAI use case description'].dropna().astype(str).tolist()
    ).lower()

    tool_counts = Counter()
    for tool in ai_tools:
        # Whole-word match; re.escape keeps 'dall-e' etc. literal.
        count = len(re.findall(r'\b' + re.escape(tool) + r'\b', all_descriptions))
        if count > 0:
            normalized_tool = tool_mapping.get(tool, tool)
            tool_counts[normalized_tool] += count

    return tool_counts


def extract_use_cases_from_descriptions(df):
    """Categorize GenAI descriptions into common use cases by keyword.

    Returns a Counter mapping a use case category to the number of
    descriptions mentioning at least one of its keywords (each category is
    counted at most once per description).
    """
    # Common use case categories and the keywords that signal them.
    use_case_keywords = {
        'Code Generation': ['code', 'coding', 'programming', 'script', 'develop', 'algorithm'],
        'Content Creation': ['content', 'write', 'writing', 'draft', 'article', 'blog'],
        'Data Analysis': ['data', 'analysis', 'analyze', 'analytics', 'statistics', 'insights'],
        'Documentation': ['document', 'documentation', 'manual', 'guide', 'readme'],
        'Research': ['research', 'study', 'investigate', 'explore', 'literature'],
        'Summarization': ['summary', 'summarize', 'summarization', 'extract key points'],
        'Translation': ['translate', 'translation', 'language', 'localize'],
        'Image Generation': ['image', 'picture', 'graphic', 'design', 'draw', 'art'],
        'Chatbot': ['chatbot', 'chat', 'conversation', 'dialogue', 'assistant'],
        'Automation': ['automate', 'automation', 'workflow', 'process', 'routine'],
        'Training': ['train', 'training', 'learn', 'learning', 'education'],
        'Testing': ['test', 'testing', 'QA', 'quality assurance', 'debug']
    }

    descriptions = df['GenAI use case description'].dropna().astype(str).tolist()

    use_case_counts = Counter()
    for description in descriptions:
        description_lower = description.lower()
        for use_case, keywords in use_case_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + re.escape(keyword) + r'\b', description_lower):
                    use_case_counts[use_case] += 1
                    break  # Count each use case only once per description

    return use_case_counts


def calculate_description_quality(df):
    """Score each user's aggregated GenAI description on a 0-100 scale.

    The raw score combines length (40% weight), specificity — tool mentions,
    numbers, long explanations — (capped at 30), and vocabulary uniqueness
    (30% weight); the result is normalized so the best description scores 100.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'GenAI_Descriptions' string column (as produced by
        process_genai_data).

    Returns
    -------
    pd.Series of floats rounded to one decimal, aligned with df.index.
    """
    descriptions = df['GenAI_Descriptions']

    # Factor 1: length (longer descriptions earn more of the 40% weight).
    char_counts = descriptions.str.len()
    max_char_count = char_counts.max() if not char_counts.empty else 1
    # FIX: when every description is empty, max is 0 and the original
    # produced NaN scores via 0/0.
    if not max_char_count or pd.isna(max_char_count):
        max_char_count = 1
    length_score = (char_counts / max_char_count) * 40

    # Factor 2: specificity (tool mentions, numbers, detailed explanations).
    def specificity_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        score = 0
        # Mentions of specific AI tools.
        ai_tools = ['gpt', 'chatgpt', 'claude', 'gemini', 'copilot', 'dall-e', 'midjourney']
        for tool in ai_tools:
            if re.search(r'\b' + re.escape(tool) + r'\b', desc.lower()):
                score += 5
        # Numbers may indicate metrics or concrete examples.
        if re.search(r'\d+', desc):
            score += 5
        # Long descriptions suggest detailed explanations.
        if len(desc.split()) > 50:
            score += 10
        return min(score, 30)  # Cap at 30% weight

    specificity_scores = descriptions.apply(specificity_score)

    # Factor 3: uniqueness of vocabulary.
    def uniqueness_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        # Simple whitespace tokenization.
        words = desc.lower().split()
        common_stopwords = {
            "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
            "in", "on", "at", "to", "for", "with", "by", "about", "of",
            "this", "that", "i", "we", "you", "he", "she", "they", "it",
            "have", "has"
        }
        # Drop stopwords and very short tokens before measuring variety.
        filtered_words = [w for w in words if w not in common_stopwords and len(w) > 2]
        if filtered_words:
            uniqueness = len(set(filtered_words)) / len(filtered_words)
            return uniqueness * 30  # 30% weight to uniqueness
        return 0

    uniqueness_scores = descriptions.apply(uniqueness_score)

    total_scores = length_score + specificity_scores + uniqueness_scores

    # Normalize to a 0-100 scale relative to the best description.
    max_score = total_scores.max() if not total_scores.empty else 1
    # FIX: guard the degenerate all-zero case (0/0 -> NaN in the original).
    if not max_score or pd.isna(max_score):
        max_score = 1
    normalized_scores = (total_scores / max_score * 100).round(1)

    return normalized_scores


def get_download_link(df, filename):
    """Generate an HTML download link for the dataframe as an Excel file.

    Writes the data plus a 'Summary' sheet into an in-memory workbook and
    returns an <a> tag carrying the file as a base64 data URI.
    """
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Processed Data')

        # Add a summary sheet with headline metrics.
        summary = pd.DataFrame({
            'Metric': ['Total Users', 'Average GenAI Efficiency (hours)',
                       'Average Utilization (%)', 'Top GenAI User', 'Top Quality Score'],
            'Value': [
                len(df),
                round(df['GenAI_Efficiency'].mean(), 2),
                round(df['Utilization_Percentage'].mean(), 2),
                df.loc[df['GenAI_Efficiency'].idxmax(), 'User']
                if not df['GenAI_Efficiency'].isna().all() else 'N/A',
                df.loc[df['Description_Quality_Score'].idxmax(), 'User']
                if not df['Description_Quality_Score'].isna().all() else 'N/A'
            ]
        })
        summary.to_excel(writer, index=False, sheet_name='Summary')

    binary_data = output.getvalue()
    b64 = base64.b64encode(binary_data).decode()
    # FIX: the anchor tag was stripped from the source, leaving plain text
    # instead of a working download link; restored with the xlsx MIME type.
    href = (
        f'<a href="data:application/vnd.openxmlformats-officedocument.'
        f'spreadsheetml.sheet;base64,{b64}" download="{filename}">'
        f'Download Excel file</a>'
    )
    return href


def local_css():
    """Inject custom CSS into the page.

    NOTE(review): the original stylesheet content was stripped from the
    source (only an empty markdown call survives) — restore the CSS rules
    from version control if styling is expected.
    """
    st.markdown("""
    """, unsafe_allow_html=True)


# ---------------------------------------------------------------------------
# Main app
# ---------------------------------------------------------------------------
st.set_page_config(page_title="GenAI Worklog Processor", layout="wide")
local_css()

st.title("GenAI Worklog Data Processor")
st.markdown("""
This app processes worklog data to extract insights about GenAI usage:
1. Creates a list of unique users
2. Concatenates GenAI use case descriptions for each user with proper formatting
3. Captures GenAI efficiency values and other metrics
4. Identifies projects with highest GenAI usage
5. Analyzes most common AI tools and use cases
6. Identifies prompt champions based on quality metrics
""")

# File uploader
uploaded_file = st.file_uploader(
    "Upload your worklog CSV or Excel file", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    try:
        # Read the file by extension.
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        st.subheader("Original Data Preview")
        st.dataframe(df.head())

        # Check that required columns exist; optional columns are appended
        # only when present so later analyses can rely on the list.
        required_columns = ['User', 'GenAI use case description',
                            'GenAI Efficiency (Log time in hours)']
        for col in ['Required', 'Logged', 'Date', 'Project']:
            if col in df.columns:
                required_columns.append(col)

        missing_columns = [col for col in required_columns[:3] if col not in df.columns]
        if missing_columns:
            st.warning(f"The following required columns are missing: {', '.join(missing_columns)}")
            st.markdown("""
            For full functionality, your file should contain these columns:
            - User
            - GenAI use case description
            - GenAI Efficiency (Log time in hours)
            - Required
            - Logged
            - Date
            - Project (optional but recommended for project analysis)
            """)
            # Stop if essential columns are missing.
            if any(col in missing_columns for col in ['User', 'GenAI use case description']):
                st.error("Cannot continue without essential columns.")
                st.stop()
            # Continue with available columns.
            st.info("Continuing with available columns...")

        # Process the data
        if st.button("Process Data"):
            with st.spinner("Processing data..."):
                result_df = process_genai_data(df)

                # Project analysis only when a Project column is available.
                project_analysis = None
                if 'Project' in df.columns:
                    project_analysis = analyze_projects_by_genai_hours(df)

                ai_tool_counts = extract_ai_tools_from_descriptions(df)
                use_case_counts = extract_use_cases_from_descriptions(df)

                st.subheader("Processed Data")
                st.dataframe(result_df)

                # Timestamped download link for the processed workbook.
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                st.subheader("Download Processed Data")
                st.markdown(
                    get_download_link(result_df, f"genai_processed_data_{timestamp}.xlsx"),
                    unsafe_allow_html=True)

                # NEW INSIGHTS SECTION
                st.header("🔍 Advanced GenAI Insights")

                # 1. Project with highest GenAI efficacy log hours
                if project_analysis is not None and not project_analysis.empty:
                    st.subheader("🏆 Project with Highest GenAI Efficacy Hours")
                    top_project = project_analysis.iloc[0]
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"""
**{top_project['Project']}**

Total GenAI Hours: {round(top_project['Total_GenAI_Hours'], 2)}

Number of Users: {top_project['User_Count']}

Average Hours per User: {round(top_project['Total_GenAI_Hours'] / top_project['User_Count'], 2)}
""")
                    with col2:
                        st.dataframe(project_analysis)

                # NOTE(review): everything below was reconstructed — the
                # original code selecting top_tool, top_use_case and the
                # prompt champion was lost in the mangled source and only the
                # display f-string fragments survived. Verify against intent.

                # 2. Most used AI tool
                if ai_tool_counts:
                    st.subheader("🤖 Most Used AI Tools")
                    tool_name, tool_mentions = ai_tool_counts.most_common(1)[0]
                    top_tool = {'Tool': tool_name, 'Mentions': tool_mentions}
                    st.markdown(
                        f"Most used AI tool: **{top_tool['Tool']}** "
                        f"with {top_tool['Mentions']} mentions")

                # 3. Most common use case
                if use_case_counts:
                    st.subheader("💡 Most Common Use Cases")
                    case_name, case_count = use_case_counts.most_common(1)[0]
                    top_use_case = {'Use Case': case_name, 'Count': case_count}
                    st.markdown(
                        f"Most common use case: **{top_use_case['Use Case']}** "
                        f"mentioned in {top_use_case['Count']} descriptions")

                # 4. Prompt champion: highest description quality score.
                if not result_df['Description_Quality_Score'].isna().all():
                    st.subheader("🏅 Prompt Champion")
                    champion = result_df.loc[result_df['Description_Quality_Score'].idxmax()]
                    st.markdown(f"""
**{champion['User']}**

Quality Score: {champion['Description_Quality_Score']}/100

GenAI Efficiency: {round(champion['GenAI_Efficiency'], 2)} hours

GenAI Descriptions:

{champion['GenAI_Descriptions']}
""")
    except Exception as e:
        # NOTE(review): reconstructed handler — the original 'except' clause
        # for this try block was lost in the mangled source.
        st.error(f"Error processing file: {e}")