Spaces:

hansche
/

Foci_of_Interest

No application file

App Files Files Community

hansche committed on Apr 10, 2025

Commit

dd83f16

verified ·

1 Parent(s): 155e783

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -438

app.py DELETED Viewed

@@ -1,438 +0,0 @@
-import streamlit as st
-st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-import preprocessor, helper
-from sentiment import predict_sentiment_batch
-import os
-os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"
-# Theme customization
-st.markdown(  # inject a <style> block that sets the background color of the .main container
-    """
-    <style>
-    .main {background-color: #f0f2f6;}
-    </style>
-    """,
-    unsafe_allow_html=True  # the payload is raw HTML/CSS, not Markdown text
-)
-# Set seaborn style
-sns.set_theme(style="whitegrid")
-st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
-st.subheader('Instructions')  # the four numbered steps below are static help text
-st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
-st.markdown("2. Wait for the initial processing (minimal delay).")
-st.markdown("3. Customize the analysis by selecting users or filters.")
-st.markdown("4. Click 'Show Analysis' for detailed results.")
-st.sidebar.title("Whatsapp Chat Analyzer")
-uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")  # compared against None before use; only .txt uploads are accepted
-@st.cache_data  # cache the parsed result so reruns with the same chat text skip preprocessing
-def load_and_preprocess(file_content):  # file_content: the chat export text; the caller passes the UTF-8-decoded upload
-    return preprocessor.preprocess(file_content)  # thin wrapper; the caller unpacks the result as (df, _)
-if uploaded_file is not None:
-    raw_data = uploaded_file.read().decode("utf-8")
-    with st.spinner("Loading chat data..."):
-        df, _ = load_and_preprocess(raw_data)
-    st.session_state.df = df
-    st.sidebar.header("🔍 Filters")
-    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
-    selected_user = st.sidebar.selectbox("Select User", user_list)
-    df_filtered = df if selected_user == "Overall" else df[df["user"] == selected_user]
-    if st.sidebar.button("Show Analysis"):
-        if df_filtered.empty:
-            st.warning(f"No data found for user: {selected_user}")
-        else:
-            with st.spinner("Analyzing..."):
-                if 'sentiment' not in df_filtered.columns:
-                    try:
-                        print("Starting sentiment analysis...")
-                        # Get messages as clean strings
-                        message_list = df_filtered["message"].astype(str).tolist()
-                        message_list = [msg for msg in message_list if msg.strip()]
-                        print(f"Processing {len(message_list)} messages")
-                        print(f"Sample messages: {message_list[:5]}")
-                        # Directly call the sentiment analysis function
-                        df_filtered['sentiment'] = predict_sentiment_batch(message_list)
-                        print("Sentiment analysis completed successfully")
-                    except Exception as e:
-                        st.error(f"Sentiment analysis failed: {str(e)}")
-                        print(f"Full error: {str(e)}")
-                    st.session_state.df_filtered = df_filtered
-                else:
-                    st.session_state.df_filtered = df_filtered
-                # Display statistics and visualizations
-                num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
-                st.title("Top Statistics")
-                col1, col2, col3, col4 = st.columns(4)
-                with col1:
-                    st.header("Total Messages")
-                    st.title(num_messages)
-                with col2:
-                    st.header("Total Words")
-                    st.title(words)
-                with col3:
-                    st.header("Media Shared")
-                    st.title(num_media)
-                with col4:
-                    st.header("Links Shared")
-                    st.title(num_links)
-                st.title("Monthly Timeline")
-                timeline = helper.monthly_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
-                if not timeline.empty:
-                    plt.figure(figsize=(10, 5))
-                    sns.lineplot(data=timeline, x='time', y='message', color='green')
-                    plt.title("Monthly Timeline")
-                    plt.xlabel("Date")
-                    plt.ylabel("Messages")
-                    st.pyplot(plt)
-                    plt.clf()
-                st.title("Daily Timeline")
-                daily_timeline = helper.daily_timeline(selected_user, df_filtered.sample(min(5000, len(df_filtered))))
-                if not daily_timeline.empty:
-                    plt.figure(figsize=(10, 5))
-                    sns.lineplot(data=daily_timeline, x='date', y='message', color='black')
-                    plt.title("Daily Timeline")
-                    plt.xlabel("Date")
-                    plt.ylabel("Messages")
-                    st.pyplot(plt)
-                    plt.clf()
-                st.title("Activity Map")
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.header("Most Busy Day")
-                    busy_day = helper.week_activity_map(selected_user, df_filtered)
-                    if not busy_day.empty:
-                        plt.figure(figsize=(10, 5))
-                        sns.barplot(x=busy_day.index, y=busy_day.values, palette="Purples_r")
-                        plt.title("Most Busy Day")
-                        plt.xlabel("Day of Week")
-                        plt.ylabel("Message Count")
-                        st.pyplot(plt)
-                        plt.clf()
-                with col2:
-                    st.header("Most Busy Month")
-                    busy_month = helper.month_activity_map(selected_user, df_filtered)
-                    if not busy_month.empty:
-                        plt.figure(figsize=(10, 5))
-                        sns.barplot(x=busy_month.index, y=busy_month.values, palette="Oranges_r")
-                        plt.title("Most Busy Month")
-                        plt.xlabel("Month")
-                        plt.ylabel("Message Count")
-                        st.pyplot(plt)
-                        plt.clf()
-                if selected_user == 'Overall':
-                    st.title("Most Busy Users")
-                    x, new_df = helper.most_busy_users(df_filtered)
-                    if not x.empty:
-                        plt.figure(figsize=(10, 5))
-                        sns.barplot(x=x.index, y=x.values, palette="Reds_r")
-                        plt.title("Most Busy Users")
-                        plt.xlabel("User")
-                        plt.ylabel("Message Count")
-                        plt.xticks(rotation=45)
-                        st.pyplot(plt)
-                        st.title("Word Count by User")
-                        plt.clf()
-                        st.dataframe(new_df)
-                # Most common words analysis
-                st.title("Most Common Words")
-                most_common_df = helper.most_common_words(selected_user, df_filtered)
-                if not most_common_df.empty:
-                    fig, ax = plt.subplots(figsize=(10, 6))
-                    sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
-                    ax.set_title("Top 20 Most Common Words")
-                    ax.set_xlabel("Frequency")
-                    ax.set_ylabel("Words")
-                    plt.xticks(rotation='vertical')
-                    st.pyplot(fig)
-                    plt.clf()
-                else:
-                    st.warning("No data available for most common words.")
-                # Emoji analysis
-                st.title("Emoji Analysis")
-                emoji_df = helper.emoji_helper(selected_user, df_filtered)
-                if not emoji_df.empty:
-                    col1, col2 = st.columns(2)
-                    with col1:
-                        st.subheader("Top Emojis Used")
-                        st.dataframe(emoji_df)
-                    with col2:
-                        fig, ax = plt.subplots(figsize=(8, 8))
-                        ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(),
-                              autopct="%0.2f%%", startangle=90,
-                              colors=sns.color_palette("pastel"))
-                        ax.set_title("Top Emoji Distribution")
-                        st.pyplot(fig)
-                        plt.clf()
-                else:
-                    st.warning("No data available for emoji analysis.")
-                # Sentiment Analysis Visualizations
-                st.title("📈 Sentiment Analysis")
-                # Convert month names to abbreviated format
-                month_map = {
-                    'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
-                    'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
-                    'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
-                }
-                df_filtered['month'] = df_filtered['month'].map(month_map)
-                # Group by month and sentiment
-                monthly_sentiment = df_filtered.groupby(['month', 'sentiment']).size().unstack(fill_value=0)
-                # Plotting: Histogram (Bar Chart) for each sentiment
-                st.write("### Sentiment Count by Month (Histogram)")
-                # Create a figure with subplots for each sentiment
-                fig, axes = plt.subplots(1, 3, figsize=(18, 5))
-                # Plot Positive Sentiment
-                if 'positive' in monthly_sentiment:
-                    axes[0].bar(monthly_sentiment.index, monthly_sentiment['positive'], color='green')
-                axes[0].set_title('Positive Sentiment')
-                axes[0].set_xlabel('Month')
-                axes[0].set_ylabel('Count')
-                # Plot Neutral Sentiment
-                if 'neutral' in monthly_sentiment:
-                    axes[1].bar(monthly_sentiment.index, monthly_sentiment['neutral'], color='blue')
-                axes[1].set_title('Neutral Sentiment')
-                axes[1].set_xlabel('Month')
-                axes[1].set_ylabel('Count')
-                # Plot Negative Sentiment
-                if 'negative' in monthly_sentiment:
-                    axes[2].bar(monthly_sentiment.index, monthly_sentiment['negative'], color='red')
-                axes[2].set_title('Negative Sentiment')
-                axes[2].set_xlabel('Month')
-                axes[2].set_ylabel('Count')
-                # Display the plots in Streamlit
-                st.pyplot(fig)
-                plt.clf()
-                # Count sentiments per day of the week
-                sentiment_counts = df_filtered.groupby(['day_of_week', 'sentiment']).size().unstack(fill_value=0)
-                # Sort days correctly
-                day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
-                sentiment_counts = sentiment_counts.reindex(day_order)
-                # Daily Sentiment Analysis
-                st.write("### Daily Sentiment Analysis")
-                # Create a Matplotlib figure
-                fig, ax = plt.subplots(figsize=(10, 5))
-                sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=['red', 'blue', 'green'])
-                # Customize the plot
-                ax.set_xlabel("Day of the Week")
-                ax.set_ylabel("Count")
-                ax.set_title("Sentiment Distribution per Day of the Week")
-                ax.legend(title="Sentiment")
-                # Display the plot in Streamlit
-                st.pyplot(fig)
-                plt.clf()
-                # Count messages per user per sentiment (only for Overall view)
-                if selected_user == 'Overall':
-                    sentiment_counts = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')
-                    # Calculate total messages per sentiment
-                    total_per_sentiment = df_filtered['sentiment'].value_counts().to_dict()
-                    # Add percentage column
-                    sentiment_counts['Percentage'] = sentiment_counts.apply(
-                        lambda row: (row['Count'] / total_per_sentiment[row['sentiment']]) * 100, axis=1
-                    )
-                    # Separate tables for each sentiment
-                    positive_df = sentiment_counts[sentiment_counts['sentiment'] == 'positive'].sort_values(by='Count', ascending=False).head(10)
-                    neutral_df = sentiment_counts[sentiment_counts['sentiment'] == 'neutral'].sort_values(by='Count', ascending=False).head(10)
-                    negative_df = sentiment_counts[sentiment_counts['sentiment'] == 'negative'].sort_values(by='Count', ascending=False).head(10)
-                    # Sentiment Contribution Analysis
-                    st.write("### Sentiment Contribution by User")
-                    # Create three columns for side-by-side display
-                    col1, col2, col3 = st.columns(3)
-                    # Display Positive Table
-                    with col1:
-                        st.subheader("Top Positive Contributors")
-                        if not positive_df.empty:
-                            st.dataframe(positive_df[['user', 'Count', 'Percentage']])
-                        else:
-                            st.warning("No positive sentiment data")
-                    # Display Neutral Table
-                    with col2:
-                        st.subheader("Top Neutral Contributors")
-                        if not neutral_df.empty:
-                            st.dataframe(neutral_df[['user', 'Count', 'Percentage']])
-                        else:
-                            st.warning("No neutral sentiment data")
-                    # Display Negative Table
-                    with col3:
-                        st.subheader("Top Negative Contributors")
-                        if not negative_df.empty:
-                            st.dataframe(negative_df[['user', 'Count', 'Percentage']])
-                        else:
-                            st.warning("No negative sentiment data")
-                             # Topic Analysis Section
-                st.title("🔍 Area of Focus: Topic Analysis")
-                # Check if topic column exists, otherwise perform topic modeling
-                # if 'topic' not in df_filtered.columns:
-                #     with st.spinner("Performing topic modeling..."):
-                #         try:
-                #             # Add topic modeling here or ensure your helper functions handle it
-                #             df_filtered = helper.perform_topic_modeling(df_filtered)
-                #         except Exception as e:
-                #             st.error(f"Topic modeling failed: {str(e)}")
-                #             st.stop()
-                # Plot Topic Distribution
-                st.header("Topic Distribution")
-                try:
-                    fig = helper.plot_topic_distribution(df_filtered)
-                    st.pyplot(fig)
-                    plt.clf()
-                except Exception as e:
-                    st.warning(f"Could not display topic distribution: {str(e)}")
-                # Display Sample Messages for Each Topic
-                st.header("Sample Messages for Each Topic")
-                if 'topic' in df_filtered.columns:
-                    for topic_id in sorted(df_filtered['topic'].unique()):
-                        st.subheader(f"Topic {topic_id}")
-                        # Get messages for the current topic
-                        filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']
-                        # Determine sample size
-                        sample_size = min(5, len(filtered_messages))
-                        if sample_size > 0:
-                            sample_messages = filtered_messages.sample(sample_size, replace=False).tolist()
-                            for msg in sample_messages:
-                                st.write(f"- {msg}")
-                        else:
-                            st.write("No messages available for this topic.")
-                else:
-                    st.warning("Topic information not available")
-                # Topic Distribution Over Time
-                st.header("📅 Topic Trends Over Time")
-                # Add time frequency selector
-                time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')
-                # Plot topic trends
-                try:
-                    freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
-                    topic_distribution = helper.topic_distribution_over_time(df_filtered, time_freq=freq_map[time_freq])
-                    # Choose between static and interactive plot
-                    use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')
-                    if use_plotly:
-                        fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
-                        st.plotly_chart(fig, use_container_width=True)
-                    else:
-                        fig = helper.plot_topic_distribution_over_time(topic_distribution)
-                        st.pyplot(fig)
-                        plt.clf()
-                except Exception as e:
-                    st.warning(f"Could not display topic trends: {str(e)}")
-                # Clustering Analysis Section
-                st.title("🧩 Conversation Clusters")
-                # Number of clusters input
-                n_clusters = st.slider("Select number of clusters",
-                                       min_value=2,
-                                       max_value=10,
-                                       value=5,
-                                       key='n_clusters')
-                # Perform clustering
-                with st.spinner("Analyzing conversation clusters..."):
-                    try:
-                        df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(df_filtered, n_clusters=n_clusters)
-                        # Plot clusters
-                        st.header("Cluster Visualization")
-                        fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
-                        st.pyplot(fig)
-                        plt.clf()
-                        # Cluster Insights
-                        st.header("📌 Cluster Insights")
-                        # 1. Dominant Conversation Themes
-                        st.subheader("1. Dominant Themes")
-                        cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
-                        for cluster_id, label in cluster_labels.items():
-                            st.write(f"**Cluster {cluster_id}**: {label}")
-                        # 2. Temporal Patterns
-                        st.subheader("2. Temporal Patterns")
-                        temporal_trends = helper.get_temporal_trends(df_clustered)
-                        for cluster_id, trend in temporal_trends.items():
-                            st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")
-                        # 3. User Contributions
-                        if selected_user == 'Overall':
-                            st.subheader("3. Top Contributors")
-                            user_contributions = helper.get_user_contributions(df_clustered)
-                            for cluster_id, users in user_contributions.items():
-                                st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")
-                        # 4. Sentiment by Cluster
-                        st.subheader("4. Sentiment Analysis")
-                        sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
-                        for cluster_id, sentiment in sentiment_by_cluster.items():
-                            st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")
-                        # Sample messages from each cluster
-                        st.subheader("Sample Messages")
-                        for cluster_id in sorted(df_clustered['cluster'].unique()):
-                            with st.expander(f"Cluster {cluster_id} Messages"):
-                                cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
-                                sample_size = min(3, len(cluster_msgs))
-                                if sample_size > 0:
-                                    for msg in cluster_msgs.sample(sample_size, replace=False):
-                                        st.write(f"- {msg}")
-                                else:
-                                    st.write("No messages available")
-                    except Exception as e:
-                        st.error(f"Clustering failed: {str(e)}")