# Streamlit app: predicts how many shares a news article will get from its
# headline, a content snippet, and a few structural inputs, then plots the
# prediction against the Online News Popularity dataset.

import hashlib
from datetime import datetime

import altair as alt
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from textblob import TextBlob

data_url = "https://raw.githubusercontent.com/KeeganBarbee/KeeganBarbee.github.io/refs/heads/main/OnlineNewsPopularity.csv"

# Interval shared by the data cache TTL and the sidebar auto-refresh option.
REFRESH_INTERVAL_SECONDS = 3600

# Title-cased weekday names in calendar order; lower-cased they match the
# `weekday_is_<day>` indicator columns.
DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

CHANNEL_PREFIX = 'data_channel_is_'
CHANNEL_COLUMNS = [
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_tech',
    'data_channel_is_world',
    'data_channel_is_bus',
    'data_channel_is_socmed',
]


def _first_active_flag(row, columns):
    """Return the first name in *columns* whose value in *row* equals 1, else None."""
    return next((name for name in columns if row[name] == 1), None)


@st.cache_data(ttl=REFRESH_INTERVAL_SECONDS)
def load_data():
    """Download the dataset and derive readable `channel` and `day_of_week` columns.

    Column names are stripped and have spaces removed. Rows with
    ``shares >= 20000`` are dropped from the returned frame.

    Returns:
        The filtered DataFrame with the two derived columns added.
    """
    df = pd.read_csv(data_url)
    df.columns = df.columns.str.strip().str.replace(' ', '')

    def get_channel(row):
        active = _first_active_flag(row, CHANNEL_COLUMNS)
        return 'other' if active is None else active.replace(CHANNEL_PREFIX, '')

    weekday_columns = {f'weekday_is_{day.lower()}': day for day in DAY_ORDER}

    def get_day(row):
        active = _first_active_flag(row, weekday_columns)
        return 'Unknown' if active is None else weekday_columns[active]

    df['channel'] = df.apply(get_channel, axis=1)
    df['day_of_week'] = df.apply(get_day, axis=1)
    return df[df['shares'] < 20000]


def check_for_new_data(url):
    """Report whether the remote CSV looks different from the last check.

    A fingerprint of the file is kept in ``st.session_state.data_hash`` and the
    time of the last successful check in ``st.session_state.last_check``.

    Args:
        url: Location of the CSV file to probe.

    Returns:
        True when the fingerprint differs from the stored one. False on the
        first call in a session, when nothing changed, or when the check fails.
    """
    try:
        # NOTE(review): only the header and the first data row are read, and
        # the hash is taken over the DataFrame's printed form, so rows appended
        # to the file are not detected. Kept as-is to keep the probe cheap.
        response = pd.read_csv(url, nrows=1)
        current_hash = hashlib.md5(str(response).encode()).hexdigest()
    except Exception as exc:
        # Best effort: a failed probe must not take the app down, but the
        # failure is surfaced instead of being silently swallowed.
        st.warning(f"Could not check for new data: {exc}")
        return False

    previous_hash = st.session_state.get('data_hash')
    st.session_state.data_hash = current_hash
    st.session_state.last_check = datetime.now()
    return previous_hash is not None and previous_hash != current_hash


@st.cache_resource
def load_model_artifacts():
    """Load the fitted model and its feature list once per server process.

    Without caching, both pickles would be re-read on every script rerun,
    i.e. on every widget interaction.

    Returns:
        A ``(model, feature_columns)`` tuple; feature names are stripped and
        have spaces removed, matching the column cleanup in ``load_data``.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only ship trusted files.
    loaded_model = joblib.load('popularity_model.pkl')
    raw_columns = joblib.load('model_features.pkl')
    return loaded_model, [col.strip().replace(' ', '') for col in raw_columns]


df = load_data()
df_display = df.copy()
model, feature_columns = load_model_artifacts()


def analyze_and_predict(headline_text, content_text, num_images, publish_day, num_videos, num_hrefs):
    """Build the model input from the article inputs and predict its shares.

    Args:
        headline_text: Article headline.
        content_text: Article body or snippet used for sentiment analysis.
        num_images: Number of images in the article.
        publish_day: Weekday name such as ``'Monday'``; unrecognized or empty
            values leave every weekday flag at 0.
        num_videos: Number of videos in the article.
        num_hrefs: Number of links in the article.

    Returns:
        A tuple of display strings: predicted shares (``"~1,234"``), content
        polarity, content subjectivity, content word count, title word count.

    Raises:
        KeyError: If the model expects a feature this function does not compute.
    """
    title_blob = TextBlob(headline_text)
    content_blob = TextBlob(content_text)

    n_tokens_title = len(headline_text.split())
    n_tokens_content = len(content_text.split())
    global_sentiment_polarity = content_blob.sentiment.polarity
    global_subjectivity = content_blob.sentiment.subjectivity
    title_sentiment_polarity = title_blob.sentiment.polarity

    weekday_flags = {f'weekday_is_{day.lower()}': 0 for day in DAY_ORDER}
    if publish_day:
        weekday_key = f'weekday_is_{publish_day.lower()}'
        if weekday_key in weekday_flags:
            weekday_flags[weekday_key] = 1

    input_data = {
        'n_tokens_title': n_tokens_title,
        'n_tokens_content': n_tokens_content,
        'global_sentiment_polarity': global_sentiment_polarity,
        'global_subjectivity': global_subjectivity,
        'title_sentiment_polarity': title_sentiment_polarity,
        'num_imgs': num_images,
        'num_videos': num_videos,
        'num_hrefs': num_hrefs,
        **weekday_flags,
    }

    missing = [col for col in feature_columns if col not in input_data]
    if missing:
        raise KeyError(f"Model expects features this app does not compute: {missing}")

    X_pred = np.array([input_data[col] for col in feature_columns]).reshape(1, -1)
    # The prediction is inverted with expm1, i.e. treated as log1p(shares).
    log_pred_shares = model.predict(X_pred)[0]
    predicted_shares = np.expm1(log_pred_shares)

    return (
        f"~{int(predicted_shares):,}",
        f"{global_sentiment_polarity:.3f}",
        f"{global_subjectivity:.3f}",
        f"{n_tokens_content}",
        f"{n_tokens_title}",
    )


st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

with st.sidebar:
    st.header("Structural & Temporal Inputs")
    publish_day = st.selectbox("Day of Publication", tuple(DAY_ORDER), index=0)

    st.subheader("Multimedia & Linking")
    num_images = st.slider("Number of Images (num_imgs)", min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)", min_value=0, max_value=10, value=1, step=1)

    st.subheader("Article Structure")
    num_hrefs = st.slider("Number of Links (num_hrefs)", min_value=0, max_value=30, value=5, step=1)

    st.markdown("---")
    st.subheader("📊 Data Freshness")
    if 'last_check' in st.session_state:
        st.caption(f"Last checked: {st.session_state.last_check.strftime('%Y-%m-%d %H:%M:%S')}")

    if st.button("🔄 Check for New Data"):
        with st.spinner("Checking for updates..."):
            data_updated = check_for_new_data(data_url)
        if data_updated:
            st.cache_data.clear()
            st.success("✅ New data detected and loaded!")
            st.rerun()
        else:
            st.info("✓ Data is up to date")

    auto_refresh = st.checkbox("Auto-refresh every hour")
    if auto_refresh and 'last_check' in st.session_state:
        # total_seconds() rather than .seconds: the latter wraps every 24 hours.
        elapsed = (datetime.now() - st.session_state.last_check).total_seconds()
        if elapsed > REFRESH_INTERVAL_SECONDS:
            if check_for_new_data(data_url):
                st.cache_data.clear()
            # Stamp the check time even when the probe failed; otherwise the
            # condition stays true after the rerun and the app loops forever.
            st.session_state.last_check = datetime.now()
            st.rerun()

st.header("Article Content")
headline_text = st.text_input("Headline Text", placeholder="E.g., Revolutionary AI Tool Boosts Productivity")
content_text = st.text_area(
    "Article Snippet (for Sentiment Analysis)",
    placeholder="Paste a few paragraphs of the article content here.",
)

if st.button("Analyze & Predict Shares"):
    if headline_text and content_text:
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text, content_text, num_images, publish_day, num_videos, num_hrefs
        )

        st.success(f"### Predicted Shares: {predicted_shares}")
        st.markdown("---")

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)

        st.header("Visual Analysis: How do you compare?")
        tab1, tab2, tab3, tab4 = st.tabs(["Sentiment Analysis", "Interactive Market", "Day of Week", "Model Logic"])

        with tab2:
            st.write("Explore how content length and topic channel affect shares.")
            # Cap the sample size so a small dataset does not make sample() raise.
            sample = df_display.sample(min(1000, len(df_display)))
            brush = alt.selection_interval()

            scatter = (
                alt.Chart(sample)
                .mark_circle(opacity=0.5)
                .encode(
                    x=alt.X('n_tokens_content', title='Number of Tokens in Content'),
                    y=alt.Y('shares', title='Shares'),
                    color=alt.condition(brush, 'channel:N', alt.value('lightgray')),
                    tooltip=['n_tokens_content', 'shares', 'channel'],
                )
                .add_params(brush)
                .properties(height=400, title='Content Length vs Shares (Drag to select)')
            )
            trendline = (
                alt.Chart(sample)
                .mark_line(color='firebrick', size=3)
                .transform_filter(brush)
                .transform_regression('n_tokens_content', 'shares')
                .encode(x='n_tokens_content', y='shares')
            )
            st.altair_chart(scatter + trendline, use_container_width=True)

        with tab1:
            st.write("Does being more positive lead to more shares?")
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.regplot(
                data=df_display.sample(min(2000, len(df_display))),
                x='global_sentiment_polarity',
                y='shares',
                scatter_kws={'alpha': 0.1, 'color': 'grey'},
                line_kws={'color': 'blue'},
                ax=ax,
            )
            # analyze_and_predict returns display strings; strip the formatting
            # to recover the numeric prediction for plotting.
            pred_val = float(predicted_shares.replace('~', '').replace(',', ''))
            ax.scatter(float(polarity), pred_val, color='red', s=200, marker='*', zorder=5, label='Your Article')
            ax.legend()
            ax.set_title(f"You are here: Sentiment {polarity}, Shares {int(pred_val)}")
            ax.set_xlabel("Global Sentiment Polarity")
            st.pyplot(fig)
            # pyplot keeps every figure alive until closed; without this they
            # accumulate across reruns.
            plt.close(fig)

        with tab3:
            st.write("Distribution of shares by day of week.")
            fig2, ax2 = plt.subplots(figsize=(10, 5))
            day_order = list(DAY_ORDER)
            sns.boxplot(
                data=df_display,
                x='day_of_week',
                y='shares',
                order=day_order,
                palette="vlag",
                ax=ax2,
            )
            if publish_day in day_order:
                day_idx = day_order.index(publish_day)
                ax2.axvline(day_idx, color='red', linestyle='--', alpha=0.7)
                ax2.text(day_idx, df_display['shares'].max()*0.9, "Your Publish Day", color='red', ha='center')
            ax2.set_yscale('log')
            ax2.set_title("Share Distribution by Day (Log Scale)")
            st.pyplot(fig2)
            plt.close(fig2)

        with tab4:
            st.write("### What drives the prediction?")
            st.write("This chart shows which features increase (positive) or decrease (negative) the predicted shares.")

            coef_df = pd.DataFrame({
                'Feature': feature_columns,
                # ravel() keeps this working if coef_ is stored as a 2-D array.
                'Coefficient': np.ravel(model.coef_),
            })
            coef_df['Impact'] = coef_df['Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

            importance_chart = (
                alt.Chart(coef_df)
                .mark_bar()
                .encode(
                    x=alt.X('Coefficient', title='Impact on Log Shares'),
                    y=alt.Y('Feature', sort='-x', title='Feature Name'),
                    color=alt.Color(
                        'Impact',
                        scale=alt.Scale(domain=['Positive', 'Negative'], range=['#2ecc71', '#e74c3c']),
                    ),
                    tooltip=['Feature', 'Coefficient'],
                )
                .properties(title='Feature Importance (Ridge Model Coefficients)')
            )
            st.altair_chart(importance_chart, use_container_width=True)

            st.info(
                "**How to read this:** \n"
                "- **Green bars (Positive):** Increasing this feature generally increases shares.\n"
                "- **Red bars (Negative):** Increasing this feature generally decreases shares.\n"
                "- **Bar Length:** The longer the bar, the stronger the influence."
            )
    else:
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")