import streamlit as st
import pandas as pd
import numpy as np
import io
import base64
import plotly.express as px
import datetime
import re
from collections import Counter


def process_genai_data(df):
    """Build a one-row-per-user summary of GenAI usage from raw worklog rows.

    For each unique user the summary contains: a bullet list of their unique
    GenAI use case descriptions, the first recorded GenAI efficiency value,
    total logged/required hours, utilization percentage, the date range of
    their entries, a description quality score and (when a 'Project' column
    exists) the list of projects they appear on.

    Parameters
    ----------
    df : pd.DataFrame
        Raw worklog data. Must contain 'User', 'GenAI use case description'
        and 'GenAI Efficiency (Log time in hours)'; 'Logged', 'Required',
        'Date' and 'Project' are optional.

    Returns
    -------
    pd.DataFrame
        One row per unique user, in first-appearance order.
    """
    # One row per unique user, preserving first-appearance order.
    unique_users = df['User'].drop_duplicates().reset_index(drop=True)
    result_df = pd.DataFrame(unique_users, columns=['User'])

    def get_descriptions(user):
        # Unique non-null descriptions for this user, one "- " bullet per line.
        descriptions = df[df['User'] == user]['GenAI use case description'].dropna().unique()
        if len(descriptions) > 0:
            return "\n".join("- " + desc for desc in descriptions)
        return ""

    result_df['GenAI_Descriptions'] = result_df['User'].apply(get_descriptions)

    def get_first_efficiency(user):
        # Efficiency is taken from the user's first row only (assumed to be
        # recorded once per user rather than per log entry).
        efficiency_values = df[df['User'] == user]['GenAI Efficiency (Log time in hours)']
        if len(efficiency_values) > 0:
            return efficiency_values.iloc[0]
        return np.nan

    result_df['GenAI_Efficiency'] = result_df['User'].apply(get_first_efficiency)

    def sum_user_column(user, column):
        # FIX: 'Logged' and 'Required' are optional upstream (the uploader
        # only warns when they are missing); the original raised KeyError here.
        if column not in df.columns:
            return np.nan
        return df[df['User'] == user][column].sum()

    result_df['Total_Logged_Hours'] = result_df['User'].apply(
        lambda u: sum_user_column(u, 'Logged'))
    result_df['Total_Required_Hours'] = result_df['User'].apply(
        lambda u: sum_user_column(u, 'Required'))

    # FIX: map +/-inf (required hours == 0) to NaN so the UI never shows "inf%".
    result_df['Utilization_Percentage'] = (
        result_df['Total_Logged_Hours'] / result_df['Total_Required_Hours'] * 100
    ).replace([np.inf, -np.inf], np.nan).round(2)

    def get_date_range(user):
        # Human-readable "min to max" span of this user's non-null dates.
        user_logs = df[df['User'] == user]
        if 'Date' in user_logs.columns and not user_logs['Date'].empty:
            dates = user_logs['Date'].dropna()
            if len(dates) > 0:
                return f"{min(dates)} to {max(dates)}"
        return "N/A"

    result_df['Date_Range'] = result_df['User'].apply(get_date_range)

    # Quality score is computed on the aggregated per-user descriptions.
    result_df['Description_Quality_Score'] = calculate_description_quality(result_df)

    if 'Project' in df.columns:
        def get_projects(user):
            return list(df[df['User'] == user]['Project'].dropna().unique())

        result_df['Projects'] = result_df['User'].apply(get_projects)

    return result_df


def analyze_projects_by_genai_hours(df):
    """Rank projects by total GenAI efficiency log hours.

    To avoid double-counting, only the first efficiency value per unique
    (Project, User) pair is summed. Returns a dataframe with columns
    'Project', 'Total_GenAI_Hours' and 'User_Count', sorted descending by
    hours, or None when the data has no 'Project' column.
    """
    if 'Project' not in df.columns:
        return None

    # First efficiency value per unique Project-User combination; this avoids
    # counting the same user's hours multiple times within one project.
    user_project_hours = (
        df.groupby(['Project', 'User'])['GenAI Efficiency (Log time in hours)']
        .first()
        .reset_index()
    )

    # Total the per-user hours by project, highest first.
    project_hours = (
        user_project_hours.groupby('Project')['GenAI Efficiency (Log time in hours)']
        .sum()
        .reset_index()
    )
    project_hours = project_hours.sort_values(
        'GenAI Efficiency (Log time in hours)', ascending=False)
    project_hours.columns = ['Project', 'Total_GenAI_Hours']

    # Distinct contributor count per project.
    project_users = df.groupby('Project')['User'].nunique().reset_index()
    project_users.columns = ['Project', 'User_Count']

    project_analysis = pd.merge(project_hours, project_users, on='Project')
    return project_analysis


def extract_ai_tools_from_descriptions(df):
    """Count mentions of known AI tools in the GenAI descriptions.

    Returns a Counter mapping a normalized tool name (e.g. 'ChatGPT/GPT') to
    its total number of whole-word mentions across all descriptions.

    NOTE(review): overlapping keywords such as 'gpt' and 'gpt-4' both match
    the text "gpt-4" (hyphen is a word boundary), so such mentions are
    counted more than once under the same normalized name — confirm whether
    that inflation is intended.
    """
    # Common AI tools and platforms to look for.
    ai_tools = [
        'chatgpt', 'gpt-4', 'gpt-3', 'gpt', 'openai', 'claude', 'anthropic',
        'gemini', 'bard', 'google ai', 'copilot', 'github copilot',
        'microsoft copilot', 'dall-e', 'midjourney', 'stable diffusion',
        'hugging face', 'transformers', 'bert', 'llama', 'mistral',
        'tensorflow', 'pytorch', 'ml', 'jupyter', 'colab', 'langchain',
        'llm', 'rag'
    ]

    # Normalize keyword variants to one display name.
    tool_mapping = {
        'gpt': 'ChatGPT/GPT',
        'gpt-3': 'ChatGPT/GPT',
        'gpt-4': 'ChatGPT/GPT',
        'chatgpt': 'ChatGPT/GPT',
        'openai': 'OpenAI',
        'claude': 'Claude',
        'anthropic': 'Claude',
        'gemini': 'Google AI',
        'bard': 'Google AI',
        'google ai': 'Google AI',
        'copilot': 'GitHub Copilot',
        'github copilot': 'GitHub Copilot',
        'microsoft copilot': 'Microsoft Copilot',
        'dall-e': 'DALL-E',
        'midjourney': 'Midjourney',
        'stable diffusion': 'Stable Diffusion',
        'hugging face': 'Hugging Face',
        'transformers': 'Transformers',
        'bert': 'BERT',
        'llama': 'LLaMA',
        'mistral': 'Mistral AI',
        'tensorflow': 'TensorFlow',
        'pytorch': 'PyTorch',
        'ml': 'Machine Learning',
        'jupyter': 'Jupyter',
        'colab': 'Google Colab',
        'langchain': 'LangChain',
        'llm': 'Large Language Models',
        'rag': 'Retrieval Augmented Generation'
    }

    # One lowercase haystack containing every description.
    all_descriptions = " ".join(
        df['GenAI use case description'].dropna().astype(str).tolist()
    ).lower()

    tool_counts = Counter()
    for tool in ai_tools:
        # Whole-word match; re.escape keeps 'dall-e' etc. literal.
        count = len(re.findall(r'\b' + re.escape(tool) + r'\b', all_descriptions))
        if count > 0:
            normalized_tool = tool_mapping.get(tool, tool)
            tool_counts[normalized_tool] += count

    return tool_counts


def extract_use_cases_from_descriptions(df):
    """Categorize GenAI descriptions into common use cases by keyword.

    Returns a Counter mapping a use case category to the number of
    descriptions mentioning at least one of its keywords (each category is
    counted at most once per description).
    """
    # Common use case categories and the keywords that signal them.
    use_case_keywords = {
        'Code Generation': ['code', 'coding', 'programming', 'script', 'develop', 'algorithm'],
        'Content Creation': ['content', 'write', 'writing', 'draft', 'article', 'blog'],
        'Data Analysis': ['data', 'analysis', 'analyze', 'analytics', 'statistics', 'insights'],
        'Documentation': ['document', 'documentation', 'manual', 'guide', 'readme'],
        'Research': ['research', 'study', 'investigate', 'explore', 'literature'],
        'Summarization': ['summary', 'summarize', 'summarization', 'extract key points'],
        'Translation': ['translate', 'translation', 'language', 'localize'],
        'Image Generation': ['image', 'picture', 'graphic', 'design', 'draw', 'art'],
        'Chatbot': ['chatbot', 'chat', 'conversation', 'dialogue', 'assistant'],
        'Automation': ['automate', 'automation', 'workflow', 'process', 'routine'],
        'Training': ['train', 'training', 'learn', 'learning', 'education'],
        'Testing': ['test', 'testing', 'QA', 'quality assurance', 'debug']
    }

    descriptions = df['GenAI use case description'].dropna().astype(str).tolist()

    use_case_counts = Counter()
    for description in descriptions:
        description_lower = description.lower()
        for use_case, keywords in use_case_keywords.items():
            for keyword in keywords:
                if re.search(r'\b' + re.escape(keyword) + r'\b', description_lower):
                    use_case_counts[use_case] += 1
                    break  # Count each use case only once per description

    return use_case_counts


def calculate_description_quality(df):
    """Score each user's aggregated GenAI description on a 0-100 scale.

    The raw score combines length (40% weight), specificity — tool mentions,
    numbers, long explanations — (capped at 30), and vocabulary uniqueness
    (30% weight); the result is normalized so the best description scores 100.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'GenAI_Descriptions' string column (as produced by
        process_genai_data).

    Returns
    -------
    pd.Series of floats rounded to one decimal, aligned with df.index.
    """
    descriptions = df['GenAI_Descriptions']

    # Factor 1: length (longer descriptions earn more of the 40% weight).
    char_counts = descriptions.str.len()
    max_char_count = char_counts.max() if not char_counts.empty else 1
    # FIX: when every description is empty, max is 0 and the original
    # produced NaN scores via 0/0.
    if not max_char_count or pd.isna(max_char_count):
        max_char_count = 1
    length_score = (char_counts / max_char_count) * 40

    # Factor 2: specificity (tool mentions, numbers, detailed explanations).
    def specificity_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        score = 0
        # Mentions of specific AI tools.
        ai_tools = ['gpt', 'chatgpt', 'claude', 'gemini', 'copilot', 'dall-e', 'midjourney']
        for tool in ai_tools:
            if re.search(r'\b' + re.escape(tool) + r'\b', desc.lower()):
                score += 5
        # Numbers may indicate metrics or concrete examples.
        if re.search(r'\d+', desc):
            score += 5
        # Long descriptions suggest detailed explanations.
        if len(desc.split()) > 50:
            score += 10
        return min(score, 30)  # Cap at 30% weight

    specificity_scores = descriptions.apply(specificity_score)

    # Factor 3: uniqueness of vocabulary.
    def uniqueness_score(desc):
        if not isinstance(desc, str) or desc.strip() == "":
            return 0
        # Simple whitespace tokenization.
        words = desc.lower().split()
        common_stopwords = {
            "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
            "in", "on", "at", "to", "for", "with", "by", "about", "of",
            "this", "that", "i", "we", "you", "he", "she", "they", "it",
            "have", "has"
        }
        # Drop stopwords and very short tokens before measuring variety.
        filtered_words = [w for w in words if w not in common_stopwords and len(w) > 2]
        if filtered_words:
            uniqueness = len(set(filtered_words)) / len(filtered_words)
            return uniqueness * 30  # 30% weight to uniqueness
        return 0

    uniqueness_scores = descriptions.apply(uniqueness_score)

    total_scores = length_score + specificity_scores + uniqueness_scores

    # Normalize to a 0-100 scale relative to the best description.
    max_score = total_scores.max() if not total_scores.empty else 1
    # FIX: guard the degenerate all-zero case (0/0 -> NaN in the original).
    if not max_score or pd.isna(max_score):
        max_score = 1
    normalized_scores = (total_scores / max_score * 100).round(1)

    return normalized_scores


def get_download_link(df, filename):
    """Generate an HTML download link for the dataframe as an Excel file.

    Writes the data plus a 'Summary' sheet into an in-memory workbook and
    returns an <a> tag carrying the file as a base64 data URI.
    """
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Processed Data')

        # Add a summary sheet with headline metrics.
        summary = pd.DataFrame({
            'Metric': ['Total Users', 'Average GenAI Efficiency (hours)',
                       'Average Utilization (%)', 'Top GenAI User', 'Top Quality Score'],
            'Value': [
                len(df),
                round(df['GenAI_Efficiency'].mean(), 2),
                round(df['Utilization_Percentage'].mean(), 2),
                df.loc[df['GenAI_Efficiency'].idxmax(), 'User']
                if not df['GenAI_Efficiency'].isna().all() else 'N/A',
                df.loc[df['Description_Quality_Score'].idxmax(), 'User']
                if not df['Description_Quality_Score'].isna().all() else 'N/A'
            ]
        })
        summary.to_excel(writer, index=False, sheet_name='Summary')

    binary_data = output.getvalue()
    b64 = base64.b64encode(binary_data).decode()
    # FIX: the anchor tag was stripped from the source, leaving plain text
    # instead of a working download link; restored with the xlsx MIME type.
    href = (
        f'<a href="data:application/vnd.openxmlformats-officedocument.'
        f'spreadsheetml.sheet;base64,{b64}" download="{filename}">'
        f'Download Excel file</a>'
    )
    return href


def local_css():
    """Inject custom CSS into the page.

    NOTE(review): the original stylesheet content was stripped from the
    source (only an empty markdown call survives) — restore the CSS rules
    from version control if styling is expected.
    """
    st.markdown("""
    """, unsafe_allow_html=True)


# ---------------------------------------------------------------------------
# Main app
# ---------------------------------------------------------------------------
st.set_page_config(page_title="GenAI Worklog Processor", layout="wide")
local_css()

st.title("GenAI Worklog Data Processor")
st.markdown("""
This app processes worklog data to extract insights about GenAI usage:
1. Creates a list of unique users
2. Concatenates GenAI use case descriptions for each user with proper formatting
3. Captures GenAI efficiency values and other metrics
4. Identifies projects with highest GenAI usage
5. Analyzes most common AI tools and use cases
6. Identifies prompt champions based on quality metrics
""")

# File uploader
uploaded_file = st.file_uploader(
    "Upload your worklog CSV or Excel file", type=["csv", "xlsx", "xls"])

if uploaded_file is not None:
    try:
        # Read the file by extension.
        if uploaded_file.name.endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)

        st.subheader("Original Data Preview")
        st.dataframe(df.head())

        # Check that required columns exist; optional columns are appended
        # only when present so later analyses can rely on the list.
        required_columns = ['User', 'GenAI use case description',
                            'GenAI Efficiency (Log time in hours)']
        for col in ['Required', 'Logged', 'Date', 'Project']:
            if col in df.columns:
                required_columns.append(col)

        missing_columns = [col for col in required_columns[:3] if col not in df.columns]
        if missing_columns:
            st.warning(f"The following required columns are missing: {', '.join(missing_columns)}")
            st.markdown("""
            For full functionality, your file should contain these columns:
            - User
            - GenAI use case description
            - GenAI Efficiency (Log time in hours)
            - Required
            - Logged
            - Date
            - Project (optional but recommended for project analysis)
            """)
            # Stop if essential columns are missing.
            if any(col in missing_columns for col in ['User', 'GenAI use case description']):
                st.error("Cannot continue without essential columns.")
                st.stop()
            # Continue with available columns.
            st.info("Continuing with available columns...")

        # Process the data
        if st.button("Process Data"):
            with st.spinner("Processing data..."):
                result_df = process_genai_data(df)

                # Project analysis only when a Project column is available.
                project_analysis = None
                if 'Project' in df.columns:
                    project_analysis = analyze_projects_by_genai_hours(df)

                ai_tool_counts = extract_ai_tools_from_descriptions(df)
                use_case_counts = extract_use_cases_from_descriptions(df)

                st.subheader("Processed Data")
                st.dataframe(result_df)

                # Timestamped download link for the processed workbook.
                timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                st.subheader("Download Processed Data")
                st.markdown(
                    get_download_link(result_df, f"genai_processed_data_{timestamp}.xlsx"),
                    unsafe_allow_html=True)

                # NEW INSIGHTS SECTION
                st.header("🔍 Advanced GenAI Insights")

                # 1. Project with highest GenAI efficacy log hours
                if project_analysis is not None and not project_analysis.empty:
                    st.subheader("🏆 Project with Highest GenAI Efficacy Hours")
                    top_project = project_analysis.iloc[0]
                    col1, col2 = st.columns(2)
                    with col1:
                        st.markdown(f"""
**{top_project['Project']}**

Total GenAI Hours: {round(top_project['Total_GenAI_Hours'], 2)}

Number of Users: {top_project['User_Count']}

Average Hours per User: {round(top_project['Total_GenAI_Hours'] / top_project['User_Count'], 2)}
""")
                    with col2:
                        st.dataframe(project_analysis)

                # NOTE(review): everything below was reconstructed — the
                # original code selecting top_tool, top_use_case and the
                # prompt champion was lost in the mangled source and only the
                # display f-string fragments survived. Verify against intent.

                # 2. Most used AI tool
                if ai_tool_counts:
                    st.subheader("🤖 Most Used AI Tools")
                    tool_name, tool_mentions = ai_tool_counts.most_common(1)[0]
                    top_tool = {'Tool': tool_name, 'Mentions': tool_mentions}
                    st.markdown(
                        f"Most used AI tool: **{top_tool['Tool']}** "
                        f"with {top_tool['Mentions']} mentions")

                # 3. Most common use case
                if use_case_counts:
                    st.subheader("💡 Most Common Use Cases")
                    case_name, case_count = use_case_counts.most_common(1)[0]
                    top_use_case = {'Use Case': case_name, 'Count': case_count}
                    st.markdown(
                        f"Most common use case: **{top_use_case['Use Case']}** "
                        f"mentioned in {top_use_case['Count']} descriptions")

                # 4. Prompt champion: highest description quality score.
                if not result_df['Description_Quality_Score'].isna().all():
                    st.subheader("🏅 Prompt Champion")
                    champion = result_df.loc[result_df['Description_Quality_Score'].idxmax()]
                    st.markdown(f"""
**{champion['User']}**

Quality Score: {champion['Description_Quality_Score']}/100

GenAI Efficiency: {round(champion['GenAI_Efficiency'], 2)} hours

GenAI Descriptions:

{champion['GenAI_Descriptions']}
""")
    except Exception as e:
        # NOTE(review): reconstructed handler — the original 'except' clause
        # for this try block was lost in the mangled source.
        st.error(f"Error processing file: {e}")