# IS_445_Observatory / observatory_app.py
# (HuggingFace Space file; last commit e53100c by KeegBarb)
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import joblib
from textblob import TextBlob
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from datetime import datetime
import hashlib
# Raw dataset: UCI "Online News Popularity" CSV, mirrored on the author's GitHub Pages repo.
data_url = "https://raw.githubusercontent.com/KeeganBarbee/KeeganBarbee.github.io/refs/heads/main/OnlineNewsPopularity.csv"
@st.cache_data(ttl=3600)
def load_data():
    """Download the Online News Popularity dataset and add derived columns.

    Returns a DataFrame with whitespace-normalized column names plus two
    derived categorical columns ('channel', 'day_of_week'), filtered to
    shares < 20000 to drop extreme viral outliers. Cached by Streamlit
    for one hour (ttl=3600).
    """
    df = pd.read_csv(data_url)
    # Raw CSV headers contain stray spaces (e.g. ' n_tokens_title').
    df.columns = df.columns.str.strip().str.replace(' ', '')

    # Collapse the one-hot channel flags into a single categorical column.
    # np.select picks the first matching condition per row — same
    # first-match semantics as the original per-row apply, but vectorized.
    channel_cols = ['data_channel_is_lifestyle', 'data_channel_is_entertainment',
                    'data_channel_is_tech', 'data_channel_is_world',
                    'data_channel_is_bus', 'data_channel_is_socmed']
    channel_names = [c.replace('data_channel_is_', '') for c in channel_cols]
    df['channel'] = np.select(
        [df[c] == 1 for c in channel_cols], channel_names, default='other')

    # Same treatment for the one-hot weekday flags.
    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    df['day_of_week'] = np.select(
        [df[f'weekday_is_{d}'] == 1 for d in days],
        [d.title() for d in days], default='Unknown')

    # Drop extreme outliers so charts and the model view stay readable.
    return df[df['shares'] < 20000]
def check_for_new_data(url):
    """Return True when the remote CSV appears to have changed.

    Fetches only the first row of the CSV, hashes its string representation,
    and compares against the hash stored in Streamlit session state.
    Updates `last_check` on every successful probe. Any failure (network,
    parse) is treated as "no new data" so the app keeps running.

    NOTE(review): hashing only the header + first row detects a replaced
    file but not rows appended further down — confirm that is acceptable.
    """
    try:
        # nrows=1 keeps the probe cheap relative to the full download.
        sample = pd.read_csv(url, nrows=1)
        current_hash = hashlib.md5(str(sample).encode()).hexdigest()
        now = datetime.now()
        if 'data_hash' not in st.session_state:
            # First probe: record the baseline and report "no change".
            st.session_state.data_hash = current_hash
            st.session_state.last_check = now
            return False
        changed = current_hash != st.session_state.data_hash
        st.session_state.data_hash = current_hash
        st.session_state.last_check = now
        return changed
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any fetch/parse error means "assume unchanged".
        return False
# --- Load the dataset and the pre-trained model artifacts ---
df = load_data()
df_display = df.copy()

# Pre-trained regression model plus the exact feature order it was fit on.
model = joblib.load('popularity_model.pkl')
# Normalize the saved feature names the same way the dataset columns were.
feature_columns = [name.strip().replace(' ', '')
                   for name in joblib.load('model_features.pkl')]
def analyze_and_predict(headline_text, content_text, num_images, publish_day, num_videos, num_hrefs):
    """Score an article draft and return display-ready strings.

    Runs TextBlob sentiment on the headline and body, assembles the model's
    feature vector (including one-hot weekday flags), and converts the
    model's log-scale prediction back into a share count.

    Returns a 5-tuple of formatted strings:
    (predicted shares, content polarity, content subjectivity,
     content word count, title word count).
    """
    headline_sentiment = TextBlob(headline_text).sentiment
    body_sentiment = TextBlob(content_text).sentiment

    # Simple whitespace tokenization mirrors the dataset's token counts.
    title_len = len(headline_text.split())
    body_len = len(content_text.split())

    # One-hot encode the publication day; all zeros if the day is unknown.
    day_names = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    weekday_flags = {f'weekday_is_{d}': 0 for d in day_names}
    if publish_day:
        key = f'weekday_is_{publish_day.lower()}'
        if key in weekday_flags:
            weekday_flags[key] = 1

    input_data = {
        'n_tokens_title': title_len,
        'n_tokens_content': body_len,
        'global_sentiment_polarity': body_sentiment.polarity,
        'global_subjectivity': body_sentiment.subjectivity,
        'title_sentiment_polarity': headline_sentiment.polarity,
        'num_imgs': num_images,
        'num_videos': num_videos,
        'num_hrefs': num_hrefs,
        **weekday_flags
    }

    # Order the features exactly as the model was trained on them.
    X_pred = np.array([input_data[col] for col in feature_columns]).reshape(1, -1)
    # The model predicts log1p(shares); expm1 inverts that transform.
    predicted_shares = np.expm1(model.predict(X_pred)[0])

    return (
        f"~{int(predicted_shares):,}",
        f"{body_sentiment.polarity:.3f}",
        f"{body_sentiment.subjectivity:.3f}",
        f"{body_len}",
        f"{title_len}"
    )
st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

# --- Sidebar: structural inputs + data-freshness controls ---
with st.sidebar:
    st.header("Structural & Temporal Inputs")
    publish_day = st.selectbox("Day of Publication", ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), index=0)
    st.subheader("Multimedia & Linking")
    num_images = st.slider("Number of Images (num_imgs)", min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)", min_value=0, max_value=10, value=1, step=1)
    st.subheader("Article Structure")
    num_hrefs = st.slider("Number of Links (num_hrefs)", min_value=0, max_value=30, value=5, step=1)
    st.markdown("---")
    st.subheader("📊 Data Freshness")
    if 'last_check' in st.session_state:
        st.caption(f"Last checked: {st.session_state.last_check.strftime('%Y-%m-%d %H:%M:%S')}")
    if st.button("🔄 Check for New Data"):
        with st.spinner("Checking for updates..."):
            data_updated = check_for_new_data(data_url)
            if data_updated:
                # Drop the cached dataset so load_data() refetches on rerun.
                st.cache_data.clear()
                st.success("✅ New data detected and loaded!")
                st.rerun()
            else:
                st.info("✓ Data is up to date")
    auto_refresh = st.checkbox("Auto-refresh every hour")
    if auto_refresh and 'last_check' in st.session_state:
        # BUG FIX: timedelta.seconds is only the sub-day component (it wraps
        # every 24 h), so the old check could miss refreshes after a day.
        # total_seconds() gives the true elapsed time.
        time_diff = (datetime.now() - st.session_state.last_check).total_seconds()
        if time_diff > 3600:
            st.rerun()

# --- Main panel: article text inputs feeding analyze_and_predict ---
st.header("Article Content")
headline_text = st.text_input("Headline Text", placeholder="E.g., Revolutionary AI Tool Boosts Productivity")
content_text = st.text_area("Article Snippet (for Sentiment Analysis)", placeholder="Paste a few paragraphs of the article content here.")
# Main interaction: run the prediction, then render four comparison tabs.
# NOTE: statement order matters here — Streamlit renders in execution order.
if st.button("Analyze & Predict Shares"):
    if headline_text and content_text:
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text,
            content_text,
            num_images,
            publish_day,
            num_videos,
            num_hrefs
        )
        # Headline metric plus the four supporting text statistics.
        st.success(f"### Predicted Shares: {predicted_shares}")
        st.markdown("---")
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)
        st.header("Visual Analysis: How do you compare?")
        tab1, tab2, tab3, tab4 = st.tabs(["Sentiment Analysis", "Interactive Market", "Day of Week", "Model Logic"])
        with tab2:
            # Interactive altair scatter: brush-select points to fit a
            # regression trendline on just the selection.
            st.write("Explore how content length and topic channel affect shares.")
            # Random 1000-row sample (unseeded, so it changes each rerun).
            sample = df_display.sample(1000)
            brush = alt.selection_interval()
            scatter = alt.Chart(sample).mark_circle(opacity=0.5).encode(
                x=alt.X('n_tokens_content', title='Number of Tokens in Content'),
                y=alt.Y('shares', title='Shares'),
                color=alt.condition(brush, 'channel:N', alt.value('lightgray')),
                tooltip=['n_tokens_content', 'shares', 'channel']).add_params(brush).properties(height=400, title='Content Length vs Shares (Drag to select)')
            # Regression line computed only over the brushed selection.
            trendline = alt.Chart(sample).mark_line(color='firebrick', size=3).transform_filter(brush).transform_regression('n_tokens_content', 'shares').encode(
                x='n_tokens_content', y='shares')
            st.altair_chart(scatter + trendline, use_container_width=True)
        with tab1:
            # Seaborn regplot of sentiment vs shares, with the user's own
            # article overlaid as a red star.
            st.write("Does being more positive lead to more shares?")
            fig, ax = plt.subplots(figsize=(10, 5))
            sns.regplot(
                data=df_display.sample(2000),
                x='global_sentiment_polarity',
                y='shares',
                scatter_kws={'alpha': 0.1, 'color': 'grey'},
                line_kws={'color': 'blue'},
                ax=ax
            )
            # Parse the formatted "~1,234" string back into a number for plotting.
            pred_val = float(predicted_shares.replace('~', '').replace(',', ''))
            ax.scatter(float(polarity), pred_val, color='red', s=200, marker='*', zorder=5, label='Your Article')
            ax.legend()
            ax.set_title(f"You are here: Sentiment {polarity}, Shares {int(pred_val)}")
            ax.set_xlabel("Global Sentiment Polarity")
            st.pyplot(fig)
        with tab3:
            # Boxplots of shares per weekday, with the chosen publish day marked.
            st.write("Distribution of shares by day of week.")
            fig2, ax2 = plt.subplots(figsize=(10, 5))
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sns.boxplot(
                data=df_display,
                x='day_of_week',
                y='shares',
                order=day_order,
                palette="vlag",
                ax=ax2
            )
            if publish_day in day_order:
                # Vertical marker at the user's selected publication day.
                day_idx = day_order.index(publish_day)
                ax2.axvline(day_idx, color='red', linestyle='--', alpha=0.7)
                ax2.text(day_idx, df_display['shares'].max()*0.9, "Your Publish Day", color='red', ha='center')
            # Log scale keeps the long right tail of shares readable.
            ax2.set_yscale('log')
            ax2.set_title("Share Distribution by Day (Log Scale)")
            st.pyplot(fig2)
        with tab4:
            # Bar chart of the linear model's coefficients, sorted by magnitude.
            st.write("### What drives the prediction?")
            st.write("This chart shows which features increase (positive) or decrease (negative) the predicted shares.")
            coef_df = pd.DataFrame({
                'Feature': feature_columns,
                'Coefficient': model.coef_
            })
            # Sign → color label; absolute value → sort order (strongest first).
            coef_df['Impact'] = coef_df['Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)
            importance_chart = alt.Chart(coef_df).mark_bar().encode(
                x=alt.X('Coefficient', title='Impact on Log Shares'),
                y=alt.Y('Feature', sort='-x', title='Feature Name'),
                color=alt.Color('Impact', scale=alt.Scale(domain=['Positive', 'Negative'], range=['#2ecc71', '#e74c3c'])),
                tooltip=['Feature', 'Coefficient']).properties(title='Feature Importance (Ridge Model Coefficients)')
            st.altair_chart(importance_chart, use_container_width=True)
            st.info(
                "**How to read this:** \n"
                "- **Green bars (Positive):** Increasing this feature generally increases shares.\n"
                "- **Red bars (Negative):** Increasing this feature generally decreases shares.\n"
                "- **Bar Length:** The longer the bar, the stronger the influence."
            )
    else:
        # Both text inputs are required before running the model.
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")