# Source: Hugging Face Space "Mars203020/bertopic" (Space status: Sleeping)
# File size: 31,545 Bytes
# Revision: b7b041e

import streamlit as st
import pandas as pd
import numpy as np
import time

import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Import custom modules
from text_preprocessor import MultilingualPreprocessor
from topic_modeling import perform_topic_modeling
from gini_calculator import calculate_gini_per_user, calculate_gini_per_topic
from topic_evolution import analyze_general_topic_evolution
from narrative_similarity import calculate_narrative_similarity, calculate_text_similarity_tfidf 

# --- Page Configuration ---
# Wide layout gives the multi-column metric rows and charts room to breathe.
st.set_page_config(
    layout="wide",
    page_icon="📊",
    page_title="Social Media Topic Modeling System",
)

# --- Custom CSS ---
# Heading styles injected once per run; the "sub-header" class is applied to
# the section titles rendered in the results area.
_CUSTOM_CSS = """
<style>
    .main-header { font-size: 2.5rem; color: #1f77b4; text-align: center; margin-bottom: 1rem; }
    .sub-header { font-size: 1.75rem; color: #2c3e50; border-bottom: 2px solid #f0f2f6; padding-bottom: 0.3rem; margin-top: 2rem; margin-bottom: 1rem;}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# --- Session State Initialization ---
# Seed each key only when it is missing so existing values survive reruns.
_SESSION_DEFAULTS = (
    ('results', None),                # output of the last completed analysis
    ('df_raw', None),                 # DataFrame parsed from the uploaded CSV
    ('custom_stopwords_text', ""),    # comma-separated stopwords (also a widget key)
    ('topics_info_for_sync', []),     # topic IDs whose multiselects are on screen
)
for _state_key, _state_default in _SESSION_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default


# --- Helper Functions ---
@st.cache_data
def _render_word_cloud(word_freq_items):
    """Render a word-cloud figure from a hashable tuple of (word, weight) pairs.

    The cache key is the word/weight content itself, so two different models
    that happen to share a topic ID can never be served each other's figure.
    """
    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        colormap="viridis",
        max_words=50,
    ).generate_from_frequencies(dict(word_freq_items))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    # Detach the figure from pyplot's registry so figures do not pile up across
    # reruns; the Figure object itself is still returned to the caller.
    plt.close(fig)
    return fig


def create_word_cloud(_topic_model, topic_id):
    """Return a matplotlib figure with the word cloud of one topic.

    Args:
        _topic_model: Fitted topic model exposing ``get_topic(topic_id)``,
            which is expected to yield (word, weight) pairs.
        topic_id: ID of the topic to draw.

    Returns:
        The rendered figure, or ``None`` when the model reports no words for
        the topic.

    The previous version put ``st.cache_data`` directly on this function.
    Streamlit does not hash parameters whose name starts with an underscore,
    so the cache was keyed on ``topic_id`` alone and kept returning the word
    cloud of an earlier model after a new analysis. The words are now looked
    up on every call and only the rendering is cached.
    """
    word_freq = _topic_model.get_topic(topic_id)
    if not word_freq:
        return None
    # Plain Python floats keep the cache key simple and stable.
    return _render_word_cloud(tuple((word, float(score)) for word, score in word_freq))



def interpret_gini(gini_score):
    """Translate a Gini impurity score into a short diversity label.

    Higher impurity means the user's posts are spread over more topics.

    Args:
        gini_score: Numeric score, or a missing value (``None`` / NaN).

    Returns:
        ``"N/A"`` for missing scores, otherwise ``"Diverse Interests"``
        (>= 0.6), ``"Moderately Focused"`` (>= 0.3) or
        ``"Highly Specialized"`` (below 0.3).
    """
    # pd.isna recognises None, float('nan') and NumPy NaN scalars of any width.
    # The earlier isinstance(..., float) test missed np.float32 NaN, which then
    # fell through every comparison and was labelled "Highly Specialized".
    if pd.isna(gini_score):
        return "N/A"
    if gini_score >= 0.6:
        return "Diverse Interests"
    if gini_score >= 0.3:
        return "Moderately Focused"
    return "Highly Specialized"

# --- START OF DEFINITIVE FIX: Centralized Callback Function ---
def sync_stopwords():
    """
    Merge every stopword source into ``st.session_state.custom_stopwords_text``.

    Used as the ``on_change`` callback of the stopword text area and of the
    per-topic multiselects. It reads the multiselect widgets whose topic IDs
    are listed in ``st.session_state.topics_info_for_sync`` plus the current
    text-area content, and writes back the sorted, comma-separated union.

    NOTE(review): because the result is a union with the text already in the
    text area, deselecting a word in a multiselect does not remove it from the
    list; it has to be deleted from the text area as well.
    """
    state = st.session_state

    # 1. Words ticked in the per-topic multiselects. Options are formatted as
    #    "word (score)", so the word is everything before the first space.
    selected_from_lists = set()
    for topic_id in state.topics_info_for_sync:
        key = f"multiselect_topic_{topic_id}"
        if key in state:
            selected_from_lists.update(s.split(' ')[0] for s in state[key])

    # 2. Words typed in the text area. Strip *before* filtering so that blank
    #    entries such as the middle one in "a, , b" are dropped instead of
    #    becoming an empty-string stopword.
    typed_stopwords = {
        word
        for word in (s.strip() for s in state.custom_stopwords_text.split(','))
        if word
    }

    # 3. Combine and publish. An empty word would otherwise render as a
    #    dangling ", " at the start of the text area.
    combined_stopwords = typed_stopwords | selected_from_lists
    combined_stopwords.discard("")
    state.custom_stopwords_text = ", ".join(sorted(combined_stopwords))


# --- Main Page Layout ---
st.title("🌍 Multilingual Topic Modeling Dashboard")
st.markdown("Analyze textual data in multiple languages to discover topics and user trends.")

# The explicit key lets the uploader keep its state across reruns.
uploaded_file = st.file_uploader("Upload your CSV data", type="csv", key="csv_uploader")

# Parse only when a file is present and differs from the one handled last time;
# otherwise every rerun would re-read the CSV and wipe the current results.
previously_loaded = st.session_state.get('last_uploaded_file')
is_new_upload = uploaded_file is not None and uploaded_file != previously_loaded
if is_new_upload:
    try:
        parsed_frame = pd.read_csv(uploaded_file)
    except Exception as e:
        st.error(f"Could not read CSV file. Error: {e}")
        st.session_state.df_raw = None
        st.session_state.last_uploaded_file = None
    else:
        st.session_state.df_raw = parsed_frame
        # A new dataset invalidates earlier results and the stopword list.
        st.session_state.results = None
        st.session_state.custom_stopwords_text = ""
        st.session_state.last_uploaded_file = uploaded_file
        st.success("CSV file loaded successfully!")

if st.session_state.df_raw is not None:
    df_raw = st.session_state.df_raw
    # Default picks are the 1st/2nd/3rd column, clamped to the last column
    # when the CSV has fewer than three.
    last_col_idx = len(df_raw.columns) - 1
    col1, col2, col3 = st.columns(3)

    with col1:
        user_id_col = st.selectbox(
            "User ID Column", df_raw.columns, index=0, key="user_id_col"
        )
    with col2:
        post_content_col = st.selectbox(
            "Post Content Column", df_raw.columns, index=min(1, last_col_idx), key="post_content_col"
        )
    with col3:
        timestamp_col = st.selectbox(
            "Timestamp Column", df_raw.columns, index=min(2, last_col_idx), key="timestamp_col"
        )

    st.subheader("Topic Modeling Settings")
    lang_col, topics_col = st.columns(2)
    with lang_col:
        language = st.selectbox(
            "Language Model", ["english", "multilingual"], key="language_model"
        )
    with topics_col:
        # -1 is both the minimum and the initial value of this input.
        num_topics = st.number_input(
            "Number of Topics", -1, help="Use -1 for automatic detection", key="num_topics"
        )
    
    # Collects every text-cleaning switch into the `opts` dict that the
    # processing step hands to the preprocessor.
    with st.expander("Advanced: Text Cleaning & Preprocessing Options", expanded=False):
        c1, c2 = st.columns(2)
        with c1:
            opts = {
                'lowercase': st.checkbox("Convert to Lowercase", True, key="opt_lowercase"),
                'lemmatize': st.checkbox("Lemmatize words", False, key="opt_lemmatize"),
                'remove_urls': st.checkbox("Remove URLs", False, key="opt_remove_urls"),
                'remove_html': st.checkbox("Remove HTML Tags", False, key="opt_remove_html")
            }
        with c2:
            opts.update({
                'remove_special_chars': st.checkbox("Remove Special Characters", False, key="opt_remove_special_chars"),
                'remove_punctuation': st.checkbox("Remove Punctuation", False, key="opt_remove_punctuation"),
                'remove_numbers': st.checkbox("Remove Numbers", False, key="opt_remove_numbers")
            })
        st.markdown("---")
        c1_emoji, c2_hashtag, c3_mention = st.columns(3)
        with c1_emoji:
            opts['handle_emojis'] = st.radio("Emoji Handling", ["Keep Emojis", "Remove Emojis", "Convert Emojis to Text"], index=0, key="opt_handle_emojis")
        with c2_hashtag:
            opts['handle_hashtags'] = st.radio("Hashtag (#) Handling", ["Keep Hashtags", "Remove Hashtags", "Extract Hashtags"], index=0, key="opt_handle_hashtags")
        with c3_mention:
            opts['handle_mentions'] = st.radio("Mention (@) Handling", ["Keep Mentions", "Remove Mentions", "Extract Mentions"], index=0, key="opt_handle_mentions")
        st.markdown("---")
        opts['remove_stopwords'] = st.checkbox("Remove Stopwords", True, key="opt_remove_stopwords")

        # The widget key doubles as the master stopword state, so callbacks
        # that rewrite st.session_state.custom_stopwords_text update this box.
        st.text_area(
            "Custom Stopwords (comma-separated)",
            key="custom_stopwords_text",
            on_change=sync_stopwords
        )
        # Strip before filtering: with the former `if s` test, input such as
        # "a, , b" produced an empty-string entry in the stopword list.
        opts['custom_stopwords'] = [
            word
            for word in (s.strip().lower() for s in st.session_state.custom_stopwords_text.split(','))
            if word
        ]

    st.subheader("User Similarity Analysis")
    enable_similarity = st.checkbox(
        "Enable User Similarity Analysis",
        value=True,
        help="Find users with similar interests based on topics or text content",
        key="enable_similarity"
    )

    if enable_similarity:
        similarity_method = st.radio(
            "Similarity Method",
            options=["Topic-Based", "Text Similarity (TF-IDF)"],
            index=0,
            help="Topic-Based: Compare topic distributions. TF-IDF: Compare actual text content.",
            key="similarity_method",
            horizontal=True
        )
    else:
        similarity_method = None

    st.divider()
    process_button = st.button("🚀 Run Full Analysis", type="primary", use_container_width=True)
else:
    process_button = False

st.divider()

# --- Main Processing Logic ---
# Runs the full pipeline once per button click and stores the outcome in
# st.session_state.results, which the display section below reads.
if process_button:
    st.session_state.results = None
    start_time = time.time()
    with st.spinner("Processing your data... This may take a few minutes."):
        try:
            # Work on a copy with canonical column names so the rest of the
            # pipeline does not depend on the CSV's own headers.
            df = df_raw[[user_id_col, post_content_col, timestamp_col]].copy()
            df.columns = ['user_id', 'post_content', 'timestamp']
            df.dropna(subset=['user_id', 'post_content', 'timestamp'], inplace=True)
            try:
                # Unparseable values become NaT and are reported, then dropped.
                df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
                invalid_timestamps = df['timestamp'].isna().sum()
                if invalid_timestamps > 0:
                    st.warning(f"Warning: {invalid_timestamps} rows have invalid timestamps and will be excluded.")
                    df = df.dropna(subset=['timestamp'])
            except Exception as e:
                st.error(f"Could not parse timestamp column: {e}")
                st.stop()
            if opts['handle_hashtags'] == 'Extract Hashtags':
                df['hashtags'] = df['post_content'].str.findall(r'#\w+')
            if opts['handle_mentions'] == 'Extract Mentions':
                df['mentions'] = df['post_content'].str.findall(r'@\w+')

            # 1. Capture the user's actual choice about stopwords.
            user_wants_stopwords_removed = opts.get("remove_stopwords", False)
            custom_stopwords_list = opts.get("custom_stopwords", [])

            # 2. Tell the preprocessor to KEEP stopwords in the text; the
            #    user's choice is forwarded to the topic-modeling step instead.
            opts_for_preprocessor = opts.copy()
            opts_for_preprocessor['remove_stopwords'] = False

            st.info("⚙️ Initializing preprocessor and cleaning text (keeping stopwords for now)...")
            preprocessor = MultilingualPreprocessor(language=language)
            df['processed_content'] = preprocessor.preprocess_series(
                df['post_content'],
                opts_for_preprocessor,
                n_process_spacy=-1  # Use all CPU cores for faster processing
            )

            st.info("🔍 Performing topic modeling...")
            # Add +1 because BERTopic creates an outlier topic (-1), so to get
            # N meaningful topics, request N+1. Zero or negative means "auto".
            if num_topics > 0:
                bertopic_nr_topics = num_topics + 1
            else:
                bertopic_nr_topics = "auto"

            # Only documents that still have text after cleaning are modeled;
            # df_with_content keeps the matching rows (and index) of df.
            docs_series = df['processed_content'].fillna('').astype(str)
            has_content = docs_series.str.len() > 0
            docs_to_model = docs_series[has_content].tolist()
            df_with_content = df[has_content].copy()

            if not docs_to_model:
                st.error("❌ After preprocessing, no documents were left to analyze. Please adjust your cleaning options.")
                st.stop()

            # 3. Pass the user's choice and stopwords list to BERTopic.
            topic_model, topics, probs, coherence_score = perform_topic_modeling(
                docs=docs_to_model,
                language=language,
                nr_topics=bertopic_nr_topics,
                remove_stopwords_bertopic=user_wants_stopwords_removed,
                custom_stopwords=custom_stopwords_list
            )

            # Attach the assignments to the modeled rows, then join back on the
            # index; rows that were not modeled get the outlier topic -1.
            df_with_content['topic_id'] = topics
            df_with_content['probability'] = probs
            df = pd.merge(df, df_with_content[['topic_id', 'probability']], left_index=True, right_index=True, how='left')
            df['topic_id'] = df['topic_id'].fillna(-1).astype(int)

            st.info("📊 Calculating user engagement metrics...")
            all_unique_topics = sorted(df[df['topic_id'] != -1]['topic_id'].unique().tolist())
            all_unique_users = sorted(df['user_id'].unique().tolist())

            gini_per_user = calculate_gini_per_user(df[['user_id', 'topic_id']], all_topics=all_unique_topics)
            gini_per_topic = calculate_gini_per_topic(df[['user_id', 'topic_id']], all_users=all_unique_users)

            st.info("📈 Analyzing topic evolution...")
            general_evolution = analyze_general_topic_evolution(topic_model, docs_to_model, df_with_content['timestamp'].tolist())

            end_time = time.time()
            elapsed_time = end_time - start_time

            # Format elapsed time nicely
            if elapsed_time >= 60:
                minutes = int(elapsed_time // 60)
                seconds = elapsed_time % 60
                time_str = f"{minutes} min {seconds:.1f} sec"
            else:
                time_str = f"{elapsed_time:.1f} sec"

            # Cache df_meaningful for reuse (avoids repeated filtering)
            df_meaningful = df[df['topic_id'] != -1].copy()

            st.session_state.results = {
                'topic_model': topic_model,
                'topic_info': topic_model.get_topic_info(),
                'df': df,
                'df_meaningful': df_meaningful,  # Cached for performance
                'gini_per_user': gini_per_user,
                'gini_per_topic': gini_per_topic,
                'general_evolution': general_evolution,
                'coherence_score': coherence_score,
                'processing_time': elapsed_time
            }

            st.success(f"✅ Analysis complete! Processing time: {time_str}")
        except OSError as e:
            # A missing spaCy model is the expected cause, but any other
            # OSError lands here too, so the real error is shown as well
            # instead of being discarded.
            st.error(
                "spaCy Model Error: Could not load model. Please run "
                "`python -m spacy download en_core_web_sm` and "
                "`python -m spacy download xx_ent_wiki_sm` from your terminal. "
                f"(Underlying error: {e})"
            )
        except Exception as e:
            st.error(f"❌ An error occurred during processing: {e}")
            st.exception(e)
# --- Display Results ---
# Everything below re-renders from the stored results on each rerun.
if st.session_state.results:
    results = st.session_state.results
    df = results['df']
    topic_model = results['topic_model']
    topic_info = results['topic_info']

    st.markdown('<h2 class="sub-header">📋 Overview & Preprocessing</h2>', unsafe_allow_html=True)

    coherence = results['coherence_score']
    score_text = "N/A" if coherence is None else f"{coherence:.3f}"

    num_users = df['user_id'].nunique()
    avg_posts = len(df) / num_users if num_users > 0 else 0

    # Compact date range: the year is printed on the start date only when the
    # data spans more than one calendar year.
    start_date = df['timestamp'].min()
    end_date = df['timestamp'].max()
    start_format = '%b %d' if start_date.year == end_date.year else '%b %d, %Y'
    time_range_str = f"{start_date.strftime(start_format)} - {end_date.strftime('%b %d, %Y')}"

    # Format processing time for display
    proc_time = results.get('processing_time', 0)
    whole_minutes, leftover_seconds = divmod(proc_time, 60)
    if proc_time >= 60:
        proc_time_str = f"{int(whole_minutes)}m {leftover_seconds:.1f}s"
    else:
        proc_time_str = f"{proc_time:.1f}s"

    # One metric tile per column, in display order.
    overview_metrics = [
        ("Total Posts", len(df)),
        ("Unique Users", num_users),
        ("Avg Posts / User", f"{avg_posts:.1f}"),
        ("Time Range", time_range_str),
        ("Topic Coherence", score_text),
        ("Processing Time", proc_time_str),
    ]
    for metric_col, (metric_label, metric_value) in zip(st.columns(6), overview_metrics):
        metric_col.metric(metric_label, metric_value)

    st.markdown("#### Preprocessing Results (Sample)")
    st.dataframe(df[['post_content', 'processed_content']].head())

    with st.expander("📊 Topic Model Evaluation Metrics"):
        st.write("""
        ### 🔹Coherence Score
        - measures how well the discovered topics make sense:
        - **> 0.6**: Excellent - Topics are very distinct and meaningful
        - **0.5 - 0.6**: Good - Topics are generally clear and interpretable  
        - **0.4 - 0.5**: Fair - Topics are somewhat meaningful but may overlap
        - **< 0.4**: Poor - Topics may be unclear or too similar
        
        💡 **Tip**: If coherence is low, try adjusting the number of topics or cleaning options.
        """)
    
    st.markdown('<h2 class="sub-header">🎯 Topic Visualization & Refinement</h2>', unsafe_allow_html=True)
    # Meaningful topics only (outlier topic -1 removed), largest first.
    topic_options = topic_info[topic_info.Topic != -1].sort_values('Count', ascending=False)
    # Both tabs show the same nine largest topics in a three-column grid.
    topics_to_show = topic_options.head(9)
    num_cols = 3

    view1, view2 = st.tabs(["Word Clouds", "Interactive Word Lists & Refinement"])

    with view1:
        st.info("Visual representation of the most important words for each topic.")
        cols = st.columns(num_cols)
        for i, row in enumerate(topics_to_show.itertuples()):
            with cols[i % num_cols]:
                st.markdown(f"##### Topic {row.Topic}: {row.Name}")
                fig = create_word_cloud(topic_model, row.Topic)
                if fig:
                    st.pyplot(fig, use_container_width=True)

    with view2:
        st.info("Select or deselect words from the lists below to instantly update the custom stopwords list in the configuration section above.")
        # Store the topic IDs we are showing so the callback can find the right widgets
        st.session_state.topics_info_for_sync = [row.Topic for row in topics_to_show.itertuples()]

        cols = st.columns(num_cols)

        # Words already in the stopword list are pre-selected below. Strip
        # before filtering so blank entries ("a, , b") are ignored.
        current_stopwords_set = {
            word
            for word in (s.strip() for s in st.session_state.custom_stopwords_text.split(','))
            if word
        }

        for i, row in enumerate(topics_to_show.itertuples()):
            with cols[i % num_cols]:
                st.markdown(f"##### Topic {row.Topic}")
                # `or []` guards against a falsy result for an unknown topic.
                topic_words = topic_model.get_topic(row.Topic) or []

                # Options look like "word (0.123)". Entries with an empty word
                # are skipped (presumably padding for topics with few words);
                # they would show up as blank, possibly duplicate, options.
                formatted_options = [
                    f"{word} ({score:.3f})"
                    for word, score in topic_words[:15]
                    if word
                ]

                # Pre-select the options whose word is already a stopword.
                default_selection = [
                    option
                    for option in formatted_options
                    if option.split(' ')[0] in current_stopwords_set
                ]

                st.multiselect(
                    f"Select words from Topic {row.Topic}",
                    options=formatted_options,
                    default=default_selection,
                    key=f"multiselect_topic_{row.Topic}",
                    on_change=sync_stopwords,  # The callback synchronizes everything
                    label_visibility="collapsed"
                )
    



    st.markdown('<h2 class="sub-header">📈 Topic Evolution</h2>', unsafe_allow_html=True)
    evo = results['general_evolution']
    # Guard against a missing result as well as an empty one.
    if evo is not None and not evo.empty:
        # 1. Filter out the outlier topic (-1) and ensure Timestamp is a datetime object
        evo_filtered = evo[evo.Topic != -1].copy()
        evo_filtered['Timestamp'] = pd.to_datetime(evo_filtered['Timestamp'])

        if not evo_filtered.empty:
            # 2. Pivot the data to get topics as columns and aggregate frequencies
            evo_pivot = evo_filtered.pivot_table(
                index='Timestamp',
                columns='Topic',
                values='Frequency',
                aggfunc='sum'
            ).fillna(0)

            # 3. Pick the bin width from the covered time span. The lowercase
            #    'h' alias is used because pandas deprecated 'H' in 2.2;
            #    'h' is also accepted by older releases.
            time_delta = evo_pivot.index.max() - evo_pivot.index.min()
            if time_delta.days > 60:
                resample_freq, freq_label = 'W', 'Weekly'
            elif time_delta.days > 5:
                resample_freq, freq_label = 'D', 'Daily'
            else:
                resample_freq, freq_label = 'h', 'Hourly'

            # Resample the data into the chosen time bins by summing up the frequencies
            evo_resampled = evo_pivot.resample(resample_freq).sum()

            # 4. One line per topic. The index is named 'Timestamp', so that
            #    key is mapped too to get the intended "Time" axis title.
            fig_evo = px.line(
                evo_resampled,
                x=evo_resampled.index,
                y=evo_resampled.columns,
                title=f"Topic Frequency Over Time ({freq_label} Line Chart)",
                labels={
                    'value': 'Total Frequency',
                    'variable': 'Topic ID',
                    'index': 'Time',
                    'Timestamp': 'Time',
                },
                height=500
            )
            # Make the topic IDs in the legend categorical for better color mapping
            fig_evo.for_each_trace(lambda t: t.update(name=str(t.name)))
            fig_evo.update_layout(legend_title_text='Topic')

            st.plotly_chart(fig_evo, use_container_width=True)
        else:
            st.info("No topic evolution data available to display (all posts may have been outliers).")
    else:
        st.warning("Could not compute topic evolution (requires more data points over time).")


    


    st.markdown('<h2 class="sub-header">🧑‍🤝‍🧑 User Engagement Profile</h2>', unsafe_allow_html=True)

    # 1. Prefer the pre-filtered frame stored with the results. The fallback
    #    filter is only computed when that entry is missing; passing it as the
    #    default of dict.get() evaluated it on every rerun.
    df_meaningful = results.get('df_meaningful')
    if df_meaningful is None:
        df_meaningful = df[df['topic_id'] != -1]

    # 2. Get post counts based on this meaningful data.
    meaningful_post_counts = df_meaningful.groupby('user_id').size().reset_index(name='post_count')

    # 3. Merge with the Gini results. The inner merge keeps only users that
    #    appear in both frames, i.e. users with at least one meaningful post.
    user_metrics_df = pd.merge(
        meaningful_post_counts,
        results['gini_per_user'],
        on='user_id',
        how='inner'
    )

    # 4. Filter to include only users with more than one MEANINGFUL post.
    metrics_to_plot = user_metrics_df[user_metrics_df['post_count'] > 1].copy()

    total_meaningful_users = len(user_metrics_df)
    st.info(f"Displaying engagement profile for {len(metrics_to_plot)} users out of {total_meaningful_users} who contributed to meaningful topics.")

    # 5. Add jitter so overlapping points stay visible. A private RandomState
    #    with a fixed seed yields the same values as seeding the global
    #    generator with 42, without resetting NumPy's global random state.
    jitter_strength = 0.02
    jitter_rng = np.random.RandomState(42)
    metrics_to_plot['gini_jittered'] = metrics_to_plot['gini_coefficient'] + jitter_rng.uniform(
        -jitter_strength, jitter_strength, size=len(metrics_to_plot)
    )

    # 6. Plot the jittered value; the hover text shows the original score.
    fig = px.scatter(
        metrics_to_plot,
        x='post_count',
        y='gini_jittered',
        title='User Engagement Profile (based on posts in meaningful topics)',
        labels={
            'post_count': 'Number of Posts in Meaningful Topics',
            'gini_jittered': 'Gini Index (Topic Diversity)'
        },
        custom_data=['user_id', 'gini_coefficient']
    )
    fig.update_traces(
        marker=dict(opacity=0.5),
        hovertemplate="<b>User</b>: %{customdata[0]}<br><b>Meaningful Posts</b>: %{x}<br><b>Gini (Original)</b>: %{customdata[1]:.3f}<extra></extra>"
    )
    # Slight padding beyond [0, 1] so jittered points are not clipped.
    fig.update_yaxes(range=[-0.05, 1.05])
    st.plotly_chart(fig, use_container_width=True)

    st.markdown('<h2 class="sub-header">👤 User Deep Dive</h2>', unsafe_allow_html=True)
    selected_user = st.selectbox("Select a User to Analyze", options=sorted(df['user_id'].unique()), key="selected_user_dropdown")

    # Compare against None explicitly: a legitimate user ID such as 0 is falsy
    # and a plain truthiness test would hide the whole section for that user.
    if selected_user is not None:
        user_df = df[df['user_id'] == selected_user]
        matching_users = user_metrics_df[user_metrics_df['user_id'] == selected_user]

        if matching_users.empty:
            st.warning("This user has no posts in meaningful topics (all posts were classified as outliers).")
            st.metric("Total Posts by User", len(user_df))
        else:
            user_gini_info = matching_users.iloc[0]

            # Display the top-level metrics for the user first
            c1, c2 = st.columns(2)
            with c1:
                st.metric("Total Posts by User", len(user_df))
            with c2:
                st.metric("Topic Diversity (Gini)", f"{user_gini_info['gini_coefficient']:.3f}", help=interpret_gini(user_gini_info['gini_coefficient']))

        st.markdown("---")  # Add a visual separator

        col1, col2 = st.columns(2)

        with col1:
            # --- Chart 1: Topic Distribution Pie Chart (outliers excluded) ---
            user_topic_counts = user_df['topic_id'].value_counts().reset_index()
            user_topic_counts.columns = ['topic_id', 'count']

            fig_pie = px.pie(
                user_topic_counts[user_topic_counts.topic_id != -1],
                names='topic_id',
                values='count',
                title=f"Overall Topic Distribution for {selected_user}",
                hole=0.4
            )
            fig_pie.update_layout(margin=dict(l=0, r=0, t=40, b=0))
            st.plotly_chart(fig_pie, use_container_width=True)

        with col2:
            # --- Chart 2: Topic Evolution for User ---
            if len(user_df) > 1:
                user_evo_df = user_df[user_df['topic_id'] != -1].copy()
                user_evo_df['timestamp'] = pd.to_datetime(user_evo_df['timestamp'])

                if not user_evo_df.empty and user_evo_df['timestamp'].nunique() > 1:
                    user_pivot = user_evo_df.pivot_table(index='timestamp', columns='topic_id', aggfunc='size', fill_value=0)

                    # Bin width follows the user's activity span. 'h' and
                    # 'min' replace the aliases 'H' and 'T', which pandas
                    # deprecated in 2.2; both are accepted by older releases.
                    time_delta = user_pivot.index.max() - user_pivot.index.min()
                    if time_delta.days > 30:
                        resample_freq = 'D'
                    elif time_delta.days > 2:
                        resample_freq = 'h'
                    else:
                        resample_freq = 'min'

                    # Convert per-bin counts to per-bin shares; bins without
                    # posts divide by zero and are reset to 0.
                    user_resampled = user_pivot.resample(resample_freq).sum()
                    row_sums = user_resampled.sum(axis=1)
                    user_proportions = user_resampled.div(row_sums, axis=0).fillna(0)

                    # Show topic names instead of numeric IDs in the legend.
                    topic_name_map = topic_info.set_index('Topic')['Name'].to_dict()
                    user_proportions.rename(columns=topic_name_map, inplace=True)

                    fig_user_evo = px.area(
                        user_proportions,
                        x=user_proportions.index,
                        y=user_proportions.columns,
                        title=f"Topic Proportion Over Time for {selected_user}",
                        labels={'value': 'Topic Proportion', 'variable': 'Topic', 'index': 'Time'},
                    )
                    fig_user_evo.update_layout(margin=dict(l=0, r=0, t=40, b=0))
                    st.plotly_chart(fig_user_evo, use_container_width=True)
                else:
                    st.info("This user has no posts in meaningful topics or all posts occurred at the same time.")
            else:
                st.info("Topic evolution requires more than one post to display.")

        st.markdown("#### User's Most Recent Posts")
        user_posts_table = user_df[['post_content', 'timestamp', 'topic_id']] \
            .sort_values(by='timestamp', ascending=False) \
            .head(100)
        user_posts_table.columns = ['Post Content', 'Timestamp', 'Assigned Topic']
        st.dataframe(user_posts_table, use_container_width=True)

        with st.expander("Show User Distribution by Post Count"):
            # We use 'user_metrics_df' because it's based on meaningful posts
            post_distribution = user_metrics_df['post_count'].value_counts().reset_index()
            # Columns are renamed by position: first the post count, then how
            # many users have that count.
            post_distribution.columns = ['Number of Posts', 'Number of Users']
            post_distribution = post_distribution.sort_values(by='Number of Posts')

            # Create a bar chart for the distribution
            fig_dist = px.bar(
                post_distribution,
                x='Number of Posts',
                y='Number of Users',
                title='User Distribution by Number of Meaningful Posts'
            )
            st.plotly_chart(fig_dist, use_container_width=True)

            # Display the raw data in a table
            st.write("Data Table: User Distribution")
            st.dataframe(post_distribution, use_container_width=True)

    # --- User Similarity Analysis Section ---
    # Rendered only while the "Enable User Similarity Analysis" box is ticked.
    if st.session_state.get('enable_similarity', True):
        st.markdown('<h2 class="sub-header">🤝 User Similarity Analysis</h2>', unsafe_allow_html=True)

        # Falls back to the topic-based method when the radio has no stored value.
        selected_method = st.session_state.get('similarity_method', 'Topic-Based')

        if selected_method == "Topic-Based":
            st.info("Finding users with similar **topic interests** based on their topic distributions.")
            # Compute the fallback filter only when the stored frame is
            # missing; as a dict.get() default it was evaluated every time.
            df_for_similarity = results.get('df_meaningful')
            if df_for_similarity is None:
                df_for_similarity = df[df['topic_id'] != -1]
            similarity_df = calculate_narrative_similarity(df_for_similarity)
        else:  # TF-IDF
            st.info("Finding users with similar **text content** using TF-IDF word analysis.")
            with st.spinner("Calculating text similarity (this may take a moment)..."):
                similarity_df = calculate_text_similarity_tfidf(df)

        if similarity_df.empty:
            st.warning("Not enough data to calculate similarity. Need at least 2 users with content.")
        else:
            # User selection for similarity analysis
            similarity_user = st.selectbox(
                "Select a User to Find Similar Users",
                options=sorted(similarity_df.index.tolist()),
                key="similarity_user_dropdown"
            )

            # Similarity threshold slider
            similarity_threshold = st.slider(
                "Similarity Threshold",
                min_value=0.0,
                max_value=1.0,
                value=0.5,
                step=0.05,
                help="Only show users with similarity score above this threshold"
            )

            # Explicit None check so a falsy but valid ID (e.g. 0) still works.
            if similarity_user is not None:
                # Scores of every other user against the selected one.
                user_similarities = similarity_df[similarity_user].drop(similarity_user)  # Exclude self

                # Filter by threshold
                similar_users = user_similarities[user_similarities >= similarity_threshold].sort_values(ascending=False)

                if similar_users.empty:
                    st.info(f"No users found with similarity >= {similarity_threshold}. Try lowering the threshold.")
                else:
                    similar_users_df = pd.DataFrame({
                        'User ID': similar_users.index,
                        'Similarity Score': similar_users.values
                    })

                    # Add post count for context (all posts, outliers included)
                    post_counts = df.groupby('user_id').size()
                    similar_users_df['Post Count'] = similar_users_df['User ID'].map(post_counts).fillna(0).astype(int)

                    # Format the similarity score
                    similar_users_df['Similarity Score'] = similar_users_df['Similarity Score'].apply(lambda x: f"{x:.3f}")

                    method_label = "topic interests" if selected_method == "Topic-Based" else "text content"
                    st.write(f"**Found {len(similar_users_df)} users** with similar {method_label} to **{similarity_user}**:")
                    st.dataframe(similar_users_df, use_container_width=True, hide_index=True)