# IS_445_Observatory / observatory_app.py
# (HuggingFace Space file; last commit e53100c by KeegBarb)
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import joblib
from textblob import TextBlob
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from datetime import datetime
import hashlib
# Raw dataset: UCI "Online News Popularity" CSV, mirrored on the author's GitHub Pages repo.
data_url = "https://raw.githubusercontent.com/KeeganBarbee/KeeganBarbee.github.io/refs/heads/main/OnlineNewsPopularity.csv"
@st.cache_data(ttl=3600)
def load_data():
    """Download the Online News Popularity dataset and add derived columns.

    Returns a DataFrame with whitespace-normalized column names plus two
    derived categorical columns ('channel', 'day_of_week'), filtered to
    shares < 20000 to drop extreme viral outliers. Cached by Streamlit
    for one hour (ttl=3600).
    """
    df = pd.read_csv(data_url)
    # Raw CSV headers contain stray spaces (e.g. ' n_tokens_title').
    df.columns = df.columns.str.strip().str.replace(' ', '')

    # Collapse the one-hot channel flags into a single categorical column.
    # np.select picks the first matching condition per row — same
    # first-match semantics as the original per-row apply, but vectorized.
    channel_cols = ['data_channel_is_lifestyle', 'data_channel_is_entertainment',
                    'data_channel_is_tech', 'data_channel_is_world',
                    'data_channel_is_bus', 'data_channel_is_socmed']
    channel_names = [c.replace('data_channel_is_', '') for c in channel_cols]
    df['channel'] = np.select(
        [df[c] == 1 for c in channel_cols], channel_names, default='other')

    # Same treatment for the one-hot weekday flags.
    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    df['day_of_week'] = np.select(
        [df[f'weekday_is_{d}'] == 1 for d in days],
        [d.title() for d in days], default='Unknown')

    # Drop extreme outliers so charts and the model view stay readable.
    return df[df['shares'] < 20000]
def check_for_new_data(url):
    """Return True when the remote CSV appears to have changed.

    Fetches only the first row of the CSV, hashes its string representation,
    and compares against the hash stored in Streamlit session state.
    Updates `last_check` on every successful probe. Any failure (network,
    parse) is treated as "no new data" so the app keeps running.

    NOTE(review): hashing only the header + first row detects a replaced
    file but not rows appended further down — confirm that is acceptable.
    """
    try:
        # nrows=1 keeps the probe cheap relative to the full download.
        sample = pd.read_csv(url, nrows=1)
        current_hash = hashlib.md5(str(sample).encode()).hexdigest()
        now = datetime.now()
        if 'data_hash' not in st.session_state:
            # First probe: record the baseline and report "no change".
            st.session_state.data_hash = current_hash
            st.session_state.last_check = now
            return False
        changed = current_hash != st.session_state.data_hash
        st.session_state.data_hash = current_hash
        st.session_state.last_check = now
        return changed
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any fetch/parse error means "assume unchanged".
        return False
# --- Load the dataset and the pre-trained model artifacts ---
df = load_data()
df_display = df.copy()

# Pre-trained regression model plus the exact feature order it was fit on.
model = joblib.load('popularity_model.pkl')
# Normalize the saved feature names the same way the dataset columns were.
feature_columns = [name.strip().replace(' ', '')
                   for name in joblib.load('model_features.pkl')]
def analyze_and_predict(headline_text, content_text, num_images, publish_day, num_videos, num_hrefs):
    """Score an article draft and return display-ready strings.

    Runs TextBlob sentiment on the headline and body, assembles the model's
    feature vector (including one-hot weekday flags), and converts the
    model's log-scale prediction back into a share count.

    Returns a 5-tuple of formatted strings:
    (predicted shares, content polarity, content subjectivity,
     content word count, title word count).
    """
    headline_sentiment = TextBlob(headline_text).sentiment
    body_sentiment = TextBlob(content_text).sentiment

    # Simple whitespace tokenization mirrors the dataset's token counts.
    title_len = len(headline_text.split())
    body_len = len(content_text.split())

    # One-hot encode the publication day; all zeros if the day is unknown.
    day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    weekday_flags = {f'weekday_is_{d}': 0 for d in day_names}
    if publish_day:
        key = f'weekday_is_{publish_day.lower()}'
        if key in weekday_flags:
            weekday_flags[key] = 1

    input_data = {
        'n_tokens_title': title_len,
        'n_tokens_content': body_len,
        'global_sentiment_polarity': body_sentiment.polarity,
        'global_subjectivity': body_sentiment.subjectivity,
        'title_sentiment_polarity': headline_sentiment.polarity,
        'num_imgs': num_images,
        'num_videos': num_videos,
        'num_hrefs': num_hrefs,
        **weekday_flags
    }

    # Order the features exactly as the model was trained on them.
    X_pred = np.array([input_data[col] for col in feature_columns]).reshape(1, -1)
    # The model predicts log1p(shares); expm1 inverts that transform.
    predicted_shares = np.expm1(model.predict(X_pred)[0])

    return (
        f"~{int(predicted_shares):,}",
        f"{body_sentiment.polarity:.3f}",
        f"{body_sentiment.subjectivity:.3f}",
        f"{body_len}",
        f"{title_len}"
    )
st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

# --- Sidebar: structural inputs + data-freshness controls ---
with st.sidebar:
    st.header("Structural & Temporal Inputs")
    publish_day = st.selectbox("Day of Publication", ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), index=0)
    st.subheader("Multimedia & Linking")
    num_images = st.slider("Number of Images (num_imgs)", min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)", min_value=0, max_value=10, value=1, step=1)
    st.subheader("Article Structure")
    num_hrefs = st.slider("Number of Links (num_hrefs)", min_value=0, max_value=30, value=5, step=1)
    st.markdown("---")
    st.subheader("📊 Data Freshness")
    if 'last_check' in st.session_state:
        st.caption(f"Last checked: {st.session_state.last_check.strftime('%Y-%m-%d %H:%M:%S')}")
    if st.button("🔄 Check for New Data"):
        with st.spinner("Checking for updates..."):
            data_updated = check_for_new_data(data_url)
            if data_updated:
                # Drop the cached dataset so load_data() refetches on rerun.
                st.cache_data.clear()
                st.success("✅ New data detected and loaded!")
                st.rerun()
            else:
                st.info("✓ Data is up to date")
    auto_refresh = st.checkbox("Auto-refresh every hour")
    if auto_refresh and 'last_check' in st.session_state:
        # BUG FIX: timedelta.seconds is only the sub-day component (it wraps
        # every 24 h), so the old check could miss refreshes after a day.
        # total_seconds() gives the true elapsed time.
        time_diff = (datetime.now() - st.session_state.last_check).total_seconds()
        if time_diff > 3600:
            st.rerun()

# --- Main panel: article text inputs feeding analyze_and_predict ---
st.header("Article Content")
headline_text = st.text_input("Headline Text", placeholder="E.g., Revolutionary AI Tool Boosts Productivity")
content_text = st.text_area("Article Snippet (for Sentiment Analysis)", placeholder="Paste a few paragraphs of the article content here.")
# Main interaction: run the prediction, then render four comparison tabs.
# NOTE: statement order matters here — Streamlit renders in execution order.
if st.button("Analyze & Predict Shares"):
    if headline_text and content_text:
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text,
            content_text,
            num_images,
            publish_day,
            num_videos,
            num_hrefs
        )
        # Headline metric plus the four supporting text statistics.
        st.success(f"### Predicted Shares: {predicted_shares}")
        st.markdown("---")
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)
        st.header("Visual Analysis: How do you compare?")
        tab1, tab2, tab3, tab4 = st.tabs(["Sentiment Analysis", "Interactive Market", "Day of Week", "Model Logic"])
        with tab2:
            # Interactive altair scatter: brush-select points to fit a
            # regression trendline on just the selection.
            st.write("Explore how content length and topic channel affect shares.")
            # Random 1000-row sample (unseeded, so it changes each rerun).
            sample = df_display.sample(1000)
            brush = alt.selection_interval()
            scatter = alt.Chart(sample).mark_circle(opacity=0.5).encode(
                x=alt.X('n_tokens_content', title='Number of Tokens in Content'),
                y=alt.Y('shares', title='Shares'),
                color=alt.condition(brush, 'channel:N', alt.value('lightgray')),
                tooltip=['n_tokens_content', 'shares', 'channel']).add_params(brush).properties(height=400, title='Content Length vs Shares (Drag to select)')
            # Regression line computed only over the brushed selection.
            trendline = alt.Chart(sample).mark_line(color='firebrick', size=3).transform_filter(brush).transform_regression('n_tokens_content', 'shares').encode(
                x='n_tokens_content', y='shares')
            st.altair_chart(scatter + trendline, use_container_width=True)
        with tab1:
            # Seaborn regplot of sentiment vs shares, with the user's own
            # article overlaid as a red star.
            st.write("Does being more positive lead to more shares?")
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.regplot(
                data=df_display.sample(2000),
                x='global_sentiment_polarity',
                y='shares',
                scatter_kws={'alpha': 0.1, 'color': 'grey'},
                line_kws={'color': 'blue'},
                ax=ax
            )
            # Parse the formatted "~1,234" string back into a number for plotting.
            pred_val = float(predicted_shares.replace('~', '').replace(',', ''))
            ax.scatter(float(polarity), pred_val, color='red', s=200, marker='*', zorder=5, label='Your Article')
            ax.legend()
            ax.set_title(f"You are here: Sentiment {polarity}, Shares {int(pred_val)}")
            ax.set_xlabel("Global Sentiment Polarity")
            st.pyplot(fig)
        with tab3:
            # Boxplots of shares per weekday, with the chosen publish day marked.
            st.write("Distribution of shares by day of week.")
            fig2, ax2 = plt.subplots(figsize=(10, 5))
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sns.boxplot(
                data=df_display,
                x='day_of_week',
                y='shares',
                order=day_order,
                palette="vlag",
                ax=ax2
            )
            if publish_day in day_order:
                # Vertical marker at the user's selected publication day.
                day_idx = day_order.index(publish_day)
                ax2.axvline(day_idx, color='red', linestyle='--', alpha=0.7)
                ax2.text(day_idx, df_display['shares'].max()*0.9, "Your Publish Day", color='red', ha='center')
            # Log scale keeps the long right tail of shares readable.
            ax2.set_yscale('log')
            ax2.set_title("Share Distribution by Day (Log Scale)")
            st.pyplot(fig2)
        with tab4:
            # Bar chart of the linear model's coefficients, sorted by magnitude.
            st.write("### What drives the prediction?")
            st.write("This chart shows which features increase (positive) or decrease (negative) the predicted shares.")
            coef_df = pd.DataFrame({
                'Feature': feature_columns,
                'Coefficient': model.coef_
            })
            # Sign → color label; absolute value → sort order (strongest first).
            coef_df['Impact'] = coef_df['Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)
            importance_chart = alt.Chart(coef_df).mark_bar().encode(
                x=alt.X('Coefficient', title='Impact on Log Shares'),
                y=alt.Y('Feature', sort='-x', title='Feature Name'),
                color=alt.Color('Impact', scale=alt.Scale(domain=['Positive', 'Negative'], range=['#2ecc71', '#e74c3c'])),
                tooltip=['Feature', 'Coefficient']).properties(title='Feature Importance (Ridge Model Coefficients)')
            st.altair_chart(importance_chart, use_container_width=True)
            st.info(
                "**How to read this:** \n"
                "- **Green bars (Positive):** Increasing this feature generally increases shares.\n"
                "- **Red bars (Negative):** Increasing this feature generally decreases shares.\n"
                "- **Bar Length:** The longer the bar, the stronger the influence."
            )
    else:
        # Both text inputs are required before running the model.
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")