Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import altair as alt | |
| import joblib | |
| from textblob import TextBlob | |
| from sklearn.linear_model import Ridge | |
| from sklearn.model_selection import train_test_split | |
| from datetime import datetime | |
| import hashlib | |
| data_url = "https://raw.githubusercontent.com/KeeganBarbee/KeeganBarbee.github.io/refs/heads/main/OnlineNewsPopularity.csv" | |
@st.cache_data
def load_data():
    """Download the Online News Popularity CSV and derive readable columns.

    Returns a DataFrame with:
      * cleaned column names (surrounding whitespace removed),
      * a 'channel' column collapsing the one-hot data_channel_is_* flags,
      * a 'day_of_week' column collapsing the one-hot weekday_is_* flags,
      * viral outliers (shares >= 20000) dropped so plots stay readable.

    Decorated with @st.cache_data so reruns reuse the downloaded frame and so
    the sidebar's "Check for New Data" button can actually invalidate it via
    st.cache_data.clear() (without the decorator that clear() was a no-op).
    """
    df = pd.read_csv(data_url)
    # Raw headers in this dataset carry leading spaces (e.g. " shares"); normalize.
    df.columns = df.columns.str.strip().str.replace(' ', '')

    channel_cols = [
        'data_channel_is_lifestyle', 'data_channel_is_entertainment',
        'data_channel_is_tech', 'data_channel_is_world',
        'data_channel_is_bus', 'data_channel_is_socmed',
    ]

    def get_channel(row):
        # Recover a single label from the one-hot topic-channel flags.
        for ch in channel_cols:
            if row[ch] == 1:
                return ch.replace('data_channel_is_', '')
        return 'other'

    df['channel'] = df.apply(get_channel, axis=1)

    days = ['monday', 'tuesday', 'wednesday', 'thursday',
            'friday', 'saturday', 'sunday']

    def get_day(row):
        # Recover a single weekday name from the one-hot weekday flags.
        for day in days:
            if row[f'weekday_is_{day}'] == 1:
                return day.title()
        return 'Unknown'

    df['day_of_week'] = df.apply(get_day, axis=1)
    return df[df['shares'] < 20000]
def check_for_new_data(url):
    """Probe the remote CSV and report whether its first data row changed.

    Reads only the first row (nrows=1) to keep the network check cheap,
    hashes its full CSV serialization, and compares against the hash stored
    in st.session_state. Updates session-state bookkeeping on every call.

    Returns True when the remote data appears to have changed; False on a
    match, on the first check of the session, or on any fetch/parse failure
    (this is a deliberate best-effort check, not a hard dependency).
    """
    try:
        head = pd.read_csv(url, nrows=1)
        # to_csv serializes every column; the old str(df) approach truncated
        # the repr of this wide frame, so changes in the hidden middle
        # columns never altered the hash.
        current_hash = hashlib.md5(head.to_csv(index=False).encode()).hexdigest()
        now = datetime.now()
        if 'data_hash' not in st.session_state:
            # First check this session: remember the baseline, report "no change".
            st.session_state.data_hash = current_hash
            st.session_state.last_check = now
            return False
        changed = current_hash != st.session_state.data_hash
        st.session_state.data_hash = current_hash
        st.session_state.last_check = now
        return changed
    except Exception:
        # Narrowed from a bare except: network/parse errors mean "no new
        # data" rather than crashing the UI; never swallow SystemExit etc.
        return False
# --- Load the dataset and the pre-trained model artifacts -------------------
df = load_data()
df_display = df.copy()  # plotting copy; keeps df itself untouched by charts

# Ridge model trained offline (the app inverts its predictions with np.expm1,
# so it presumably predicts log1p(shares)), plus the exact feature order it
# expects. Feature names are normalized the same way as the dataset columns.
model = joblib.load('popularity_model.pkl')
feature_columns = [c.strip().replace(' ', '') for c in joblib.load('model_features.pkl')]
def analyze_and_predict(headline_text, content_text, num_images, publish_day, num_videos, num_hrefs):
    """Derive model features from the raw inputs and predict the share count.

    Parameters
    ----------
    headline_text : str
        Article headline; supplies the title token count and TextBlob polarity.
    content_text : str
        Article body snippet; supplies content token count, polarity, subjectivity.
    num_images, num_videos, num_hrefs : int
        Structural counts taken from the sidebar sliders.
    publish_day : str
        Weekday name (e.g. 'Monday'); sets the matching one-hot weekday flag.

    Returns
    -------
    tuple of 5 display-ready strings:
        (predicted shares "~1,234", polarity ".3f", subjectivity ".3f",
         content word count, title word count).
    """
    title_blob = TextBlob(headline_text)
    content_blob = TextBlob(content_text)

    # Whitespace tokenization approximates the dataset's n_tokens_* features.
    n_tokens_title = len(headline_text.split())
    n_tokens_content = len(content_text.split())

    global_sentiment_polarity = content_blob.sentiment.polarity
    global_subjectivity = content_blob.sentiment.subjectivity
    title_sentiment_polarity = title_blob.sentiment.polarity

    # One-hot weekday flags; at most one is set when publish_day is valid.
    days = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday')
    weekday_flags = {f'weekday_is_{d}': 0 for d in days}
    if publish_day:
        weekday_key = f'weekday_is_{publish_day.lower()}'
        if weekday_key in weekday_flags:
            weekday_flags[weekday_key] = 1

    input_data = {
        'n_tokens_title': n_tokens_title,
        'n_tokens_content': n_tokens_content,
        'global_sentiment_polarity': global_sentiment_polarity,
        'global_subjectivity': global_subjectivity,
        'title_sentiment_polarity': title_sentiment_polarity,
        'num_imgs': num_images,
        'num_videos': num_videos,
        'num_hrefs': num_hrefs,
        **weekday_flags,
    }

    # Order features exactly as the saved model expects. Default any feature
    # this function doesn't compute to 0 instead of raising KeyError when the
    # model artifact lists extra columns; force float dtype for the estimator.
    X_pred = np.array(
        [input_data.get(col, 0.0) for col in feature_columns], dtype=float
    ).reshape(1, -1)

    # The model predicts on a log scale; expm1 inverts log1p into raw shares.
    log_pred_shares = model.predict(X_pred)[0]
    predicted_shares = np.expm1(log_pred_shares)

    return (
        f"~{int(predicted_shares):,}",
        f"{global_sentiment_polarity:.3f}",
        f"{global_subjectivity:.3f}",
        f"{n_tokens_content}",
        f"{n_tokens_title}",
    )
st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

# --- Sidebar: structural inputs + data-freshness controls -------------------
with st.sidebar:
    st.header("Structural & Temporal Inputs")
    publish_day = st.selectbox("Day of Publication", ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), index=0)
    st.subheader("Multimedia & Linking")
    num_images = st.slider("Number of Images (num_imgs)", min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)", min_value=0, max_value=10, value=1, step=1)
    st.subheader("Article Structure")
    num_hrefs = st.slider("Number of Links (num_hrefs)", min_value=0, max_value=30, value=5, step=1)
    st.markdown("---")
    st.subheader("📊 Data Freshness")
    if 'last_check' in st.session_state:
        st.caption(f"Last checked: {st.session_state.last_check.strftime('%Y-%m-%d %H:%M:%S')}")
    if st.button("🔄 Check for New Data"):
        with st.spinner("Checking for updates..."):
            data_updated = check_for_new_data(data_url)
            if data_updated:
                # Invalidate cached data so the rerun re-downloads the CSV.
                st.cache_data.clear()
                st.success("✅ New data detected and loaded!")
                st.rerun()
            else:
                st.info("✓ Data is up to date")
    auto_refresh = st.checkbox("Auto-refresh every hour")
    if auto_refresh and 'last_check' in st.session_state:
        # BUG FIX: timedelta.seconds wraps at 24 h (it ignores the .days
        # component), so the old check went quiet after a day elapsed;
        # total_seconds() measures the true elapsed time.
        time_diff = (datetime.now() - st.session_state.last_check).total_seconds()
        if time_diff > 3600:
            st.rerun()
# --- Main panel: article inputs, prediction, and four analysis tabs ---------
st.header("Article Content")
headline_text = st.text_input("Headline Text", placeholder="E.g., Revolutionary AI Tool Boosts Productivity")
content_text = st.text_area("Article Snippet (for Sentiment Analysis)", placeholder="Paste a few paragraphs of the article content here.")

if st.button("Analyze & Predict Shares"):
    if headline_text and content_text:
        # All five return values are pre-formatted display strings
        # (see analyze_and_predict).
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text,
            content_text,
            num_images,
            publish_day,
            num_videos,
            num_hrefs
        )
        st.success(f"### Predicted Shares: {predicted_shares}")
        st.markdown("---")
        # Headline metrics row summarizing the derived features.
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)

        st.header("Visual Analysis: How do you compare?")
        tab1, tab2, tab3, tab4 = st.tabs(["Sentiment Analysis", "Interactive Market", "Day of Week", "Model Logic"])

        with tab2:
            # Interactive Altair scatter: brushing a region restricts both the
            # point highlighting and the regression trendline to the selection.
            st.write("Explore how content length and topic channel affect shares.")
            # NOTE(review): unseeded sample — the plotted subset differs per rerun.
            sample = df_display.sample(1000)
            brush = alt.selection_interval()
            scatter = alt.Chart(sample).mark_circle(opacity=0.5).encode(
                x=alt.X('n_tokens_content', title='Number of Tokens in Content'),
                y=alt.Y('shares', title='Shares'),
                color=alt.condition(brush, 'channel:N', alt.value('lightgray')),
                tooltip=['n_tokens_content', 'shares', 'channel']).add_params(brush).properties(height=400, title='Content Length vs Shares (Drag to select)')
            trendline = alt.Chart(sample).mark_line(color='firebrick', size=3).transform_filter(brush).transform_regression('n_tokens_content', 'shares').encode(
                x='n_tokens_content', y='shares')
            st.altair_chart(scatter + trendline, use_container_width=True)

        with tab1:
            # Seaborn regression of shares on content sentiment, with the
            # user's own article overlaid as a red star.
            st.write("Does being more positive lead to more shares?")
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.regplot(
                data=df_display.sample(2000),
                x='global_sentiment_polarity',
                y='shares',
                scatter_kws={'alpha': 0.1, 'color': 'grey'},
                line_kws={'color': 'blue'},
                ax=ax
            )
            # Parse the formatted "~1,234" display string back into a number.
            pred_val = float(predicted_shares.replace('~', '').replace(',', ''))
            ax.scatter(float(polarity), pred_val, color='red', s=200, marker='*', zorder=5, label='Your Article')
            ax.legend()
            ax.set_title(f"You are here: Sentiment {polarity}, Shares {int(pred_val)}")
            ax.set_xlabel("Global Sentiment Polarity")
            st.pyplot(fig)

        with tab3:
            # Per-weekday box plots on a log scale, with the chosen publish
            # day marked by a dashed vertical line.
            st.write("Distribution of shares by day of week.")
            fig2, ax2 = plt.subplots(figsize=(10, 5))
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sns.boxplot(
                data=df_display,
                x='day_of_week',
                y='shares',
                order=day_order,
                palette="vlag",
                ax=ax2
            )
            if publish_day in day_order:
                day_idx = day_order.index(publish_day)
                ax2.axvline(day_idx, color='red', linestyle='--', alpha=0.7)
                # Place the label near the top of the data range.
                ax2.text(day_idx, df_display['shares'].max()*0.9, "Your Publish Day", color='red', ha='center')
            ax2.set_yscale('log')
            ax2.set_title("Share Distribution by Day (Log Scale)")
            st.pyplot(fig2)

        with tab4:
            # Bar chart of the model's coefficients (sorted by magnitude) as a
            # rough feature-importance view; sign is encoded by color.
            st.write("### What drives the prediction?")
            st.write("This chart shows which features increase (positive) or decrease (negative) the predicted shares.")
            coef_df = pd.DataFrame({
                'Feature': feature_columns,
                'Coefficient': model.coef_
            })
            coef_df['Impact'] = coef_df['Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)
            importance_chart = alt.Chart(coef_df).mark_bar().encode(
                x=alt.X('Coefficient', title='Impact on Log Shares'),
                y=alt.Y('Feature', sort='-x', title='Feature Name'),
                color=alt.Color('Impact', scale=alt.Scale(domain=['Positive', 'Negative'], range=['#2ecc71', '#e74c3c'])),
                tooltip=['Feature', 'Coefficient']).properties(title='Feature Importance (Ridge Model Coefficients)')
            st.altair_chart(importance_chart, use_container_width=True)
            st.info(
                "**How to read this:** \n"
                "- **Green bars (Positive):** Increasing this feature generally increases shares.\n"
                "- **Red bars (Negative):** Increasing this feature generally decreases shares.\n"
                "- **Bar Length:** The longer the bar, the stronger the influence."
            )
    else:
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")