import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import joblib
from textblob import TextBlob  
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from datetime import datetime
import hashlib
# Source CSV: read in full by load_data() and passed to check_for_new_data()
# from the sidebar to probe for changes.
data_url = "https://raw.githubusercontent.com/KeeganBarbee/KeeganBarbee.github.io/refs/heads/main/OnlineNewsPopularity.csv"
@st.cache_data(ttl=3600)
def load_data():
    """Download the dataset at ``data_url`` and add readable category columns.

    The one-hot ``data_channel_is_*`` and ``weekday_is_*`` columns are
    collapsed into two label columns:

    * ``channel``     -- channel name without the ``data_channel_is_`` prefix,
      or ``'other'`` when no channel flag equals 1.
    * ``day_of_week`` -- title-cased weekday name, or ``'Unknown'`` when no
      weekday flag equals 1.

    Returns:
        The rows of the frame whose ``shares`` value is below 20000.
        The result is cached by Streamlit for 3600 seconds.
    """
    df = pd.read_csv(data_url)
    # Strip padding and remove embedded spaces from the header names so the
    # column lookups below use clean identifiers.
    df.columns = df.columns.str.strip().str.replace(' ', '')

    channel_cols = ['data_channel_is_lifestyle', 'data_channel_is_entertainment', 'data_channel_is_tech', 'data_channel_is_world',
                    'data_channel_is_bus', 'data_channel_is_socmed']
    # np.select takes the first condition that holds for each row, so the list
    # order is the tie-break if several flags are set. This replaces a
    # row-by-row DataFrame.apply with a single vectorized pass.
    df['channel'] = np.select(
        [df[col] == 1 for col in channel_cols],
        [col.replace('data_channel_is_', '') for col in channel_cols],
        default='other',
    )

    days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    df['day_of_week'] = np.select(
        [df[f'weekday_is_{day}'] == 1 for day in days],
        [day.title() for day in days],
        default='Unknown',
    )

    # NOTE(review): the 20000 cut-off presumably trims extreme outliers so the
    # charts stay readable -- confirm the intended threshold.
    return df[df['shares'] < 20000]
def check_for_new_data(url):
    """Report whether the CSV at ``url`` changed since the last probe in this session.

    Only the header and the first data row are read (``nrows=1``) and hashed;
    the fingerprint is compared with the one kept in ``st.session_state``.

    NOTE(review): because only the first row is fingerprinted, rows appended
    or edited further down the file are not detected -- confirm this is
    acceptable for the data source.

    Side effects:
        Writes ``data_hash`` and ``last_check`` to ``st.session_state`` whenever
        the probe succeeds.

    Args:
        url: Path or URL accepted by ``pandas.read_csv``.

    Returns:
        True when a previously stored fingerprint exists and differs from the
        current one. False on the first successful probe (the baseline is
        recorded), when nothing changed, or when the probe itself fails.
    """
    try:
        probe = pd.read_csv(url, nrows=1)
        # Hash a full CSV serialisation rather than str(probe): the DataFrame
        # repr can elide columns of a wide frame and depends on pandas display
        # options, so it is not a faithful fingerprint of the row.
        current_hash = hashlib.md5(probe.to_csv(index=False).encode()).hexdigest()
    except Exception:
        # Best-effort probe: an unreachable or unparsable source is reported
        # as "no new data". Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return False

    is_first_probe = 'data_hash' not in st.session_state
    changed = (not is_first_probe) and current_hash != st.session_state.data_hash
    if is_first_probe or changed:
        st.session_state.data_hash = current_hash
    st.session_state.last_check = datetime.now()
    return changed
@st.cache_resource
def _load_model_artifacts():
    """Load the fitted model and its ordered feature-name list from disk once.

    Streamlit re-executes the whole script on every interaction; caching the
    loaded objects as a resource avoids unpickling both files on each rerun.
    The returned objects are shared between reruns and must not be mutated.

    NOTE: joblib.load unpickles arbitrary objects -- only ship trusted files.

    Returns:
        ``(model, feature_columns)`` where the feature names have surrounding
        whitespace stripped and embedded spaces removed, matching the column
        clean-up applied in ``load_data``.
    """
    fitted_model = joblib.load('popularity_model.pkl')
    raw_feature_names = joblib.load('model_features.pkl')
    return fitted_model, [col.strip().replace(' ', '') for col in raw_feature_names]


df = load_data()
# Separate copy used by the charts below.
df_display = df.copy()
model, feature_columns = _load_model_artifacts()

def analyze_and_predict(headline_text, content_text, num_images, publish_day, num_videos, num_hrefs):
    """Turn the article inputs into a feature row and predict its share count.

    Word counts come from whitespace splitting, sentiment scores from TextBlob,
    and the publication day is one-hot encoded into ``weekday_is_*`` flags.
    The values are arranged in the order given by the module-level
    ``feature_columns`` and passed to the module-level ``model``; the model
    output is run through ``np.expm1`` before being reported.

    Returns:
        A 5-tuple of display strings: predicted shares (``~1,234``), content
        polarity and content subjectivity (three decimals each), content word
        count and title word count.

    Raises:
        KeyError: if ``feature_columns`` names a feature not assembled here.
    """
    headline_blob = TextBlob(headline_text)
    body_blob = TextBlob(content_text)

    title_word_count = len(headline_text.split())
    body_word_count = len(content_text.split())

    body_sentiment = body_blob.sentiment
    body_polarity = body_sentiment.polarity
    body_subjectivity = body_sentiment.subjectivity
    headline_polarity = headline_blob.sentiment.polarity

    # One-hot weekday encoding: every flag starts at 0 and only a recognised
    # day name flips its own flag to 1.
    day_names = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday')
    day_flags = {f'weekday_is_{name}': 0 for name in day_names}
    selected_flag = f'weekday_is_{publish_day.lower()}' if publish_day else None
    if selected_flag in day_flags:
        day_flags[selected_flag] = 1

    features = {
        'n_tokens_title': title_word_count,
        'n_tokens_content': body_word_count,
        'global_sentiment_polarity': body_polarity,
        'global_subjectivity': body_subjectivity,
        'title_sentiment_polarity': headline_polarity,
        'num_imgs': num_images,
        'num_videos': num_videos,
        'num_hrefs': num_hrefs,
    }
    features.update(day_flags)

    # The model expects a single row whose columns follow feature_columns.
    ordered_values = [features[name] for name in feature_columns]
    X_pred = np.array(ordered_values).reshape(1, -1)
    share_estimate = np.expm1(model.predict(X_pred)[0])

    return (
        f"~{int(share_estimate):,}",
        f"{body_polarity:.3f}",
        f"{body_subjectivity:.3f}",
        f"{body_word_count}",
        f"{title_word_count}",
    )
# Page header: app title followed by a one-line description of the tool.
st.title("Headline Impact: Live Popularity Predictor")
st.markdown("Use this tool to test how your article's features affect its predicted share count.")

# Sidebar: structural inputs for the prediction plus data-freshness controls.
# The widget values (publish_day, num_images, num_videos, num_hrefs) are read
# later by the main panel when the prediction is requested.
with st.sidebar:
    st.header("Structural & Temporal Inputs")
    publish_day = st.selectbox("Day of Publication", ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'), index=0)
    st.subheader("Multimedia & Linking")
    num_images = st.slider("Number of Images (num_imgs)", min_value=0, max_value=20, value=5, step=1)
    num_videos = st.slider("Number of Videos (num_videos)", min_value=0, max_value=10, value=1, step=1)

    st.subheader("Article Structure")
    num_hrefs = st.slider("Number of Links (num_hrefs)", min_value=0, max_value=30, value=5, step=1)

    st.markdown("---")
    st.subheader("📊 Data Freshness")

    # last_check only exists once check_for_new_data() has completed a probe.
    if 'last_check' in st.session_state:
        st.caption(f"Last checked: {st.session_state.last_check.strftime('%Y-%m-%d %H:%M:%S')}")

    if st.button("🔄 Check for New Data"):
        with st.spinner("Checking for updates..."):
            data_updated = check_for_new_data(data_url)
            if data_updated:
                # Drop the cached frame so the rerun downloads the new file.
                st.cache_data.clear()
                st.success("✅ New data detected and loaded!")
                st.rerun()
            else:
                st.info("✓ Data is up to date")

    auto_refresh = st.checkbox("Auto-refresh every hour")
    if auto_refresh and 'last_check' in st.session_state:
        # total_seconds() is the whole elapsed time; timedelta.seconds is only
        # the sub-day component and wraps around after 24 hours.
        seconds_since_check = (datetime.now() - st.session_state.last_check).total_seconds()
        if seconds_since_check > 3600:
            # Re-probe instead of rerunning unconditionally: a bare st.rerun()
            # leaves last_check untouched, so the rerun would land in this
            # branch again and loop forever. A completed probe stamps
            # last_check, and a rerun happens only when the data changed.
            if check_for_new_data(data_url):
                st.cache_data.clear()
                st.rerun()

st.header("Article Content")

# Free-text inputs for the article; the headline is a single-line field and
# the snippet a multi-line area.
headline_text = st.text_input(
    "Headline Text",
    placeholder="E.g., Revolutionary AI Tool Boosts Productivity",
)
content_text = st.text_area(
    "Article Snippet (for Sentiment Analysis)",
    placeholder="Paste a few paragraphs of the article content here.",
)

# Main action: run the prediction and render the result, summary metrics and
# four comparison tabs. Nothing is rendered unless both text fields are filled.
if st.button("Analyze & Predict Shares"):
    if headline_text and content_text:
        predicted_shares, polarity, subjectivity, content_length, title_length = analyze_and_predict(
            headline_text,
            content_text,
            num_images,
            publish_day,
            num_videos,
            num_hrefs
        )
        st.success(f"### Predicted Shares: {predicted_shares}")
        st.markdown("---")

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Content Polarity", polarity)
        col2.metric("Content Subjectivity", subjectivity)
        col3.metric("Content Word Count", content_length)
        col4.metric("Title Word Count", title_length)

        st.header("Visual Analysis: How do you compare?")

        tab1, tab2, tab3, tab4 = st.tabs(["Sentiment Analysis", "Interactive Market", "Day of Week", "Model Logic"])

        with tab2:
            st.write("Explore how content length and topic channel affect shares.")

            # DataFrame.sample raises ValueError when asked for more rows than
            # the frame holds, so clamp the sample size to the frame length.
            sample = df_display.sample(min(1000, len(df_display)))
            brush = alt.selection_interval()

            scatter = alt.Chart(sample).mark_circle(opacity=0.5).encode(
                x=alt.X('n_tokens_content', title='Number of Tokens in Content'),
                y=alt.Y('shares', title='Shares'),
                color=alt.condition(brush, 'channel:N', alt.value('lightgray')),
                tooltip=['n_tokens_content', 'shares', 'channel']).add_params(brush).properties(height=400, title='Content Length vs Shares (Drag to select)')

            # Regression line fitted only to the points inside the brush.
            trendline = alt.Chart(sample).mark_line(color='firebrick', size=3).transform_filter(brush).transform_regression('n_tokens_content', 'shares').encode(
                x='n_tokens_content', y='shares')

            st.altair_chart(scatter + trendline, use_container_width=True)

        with tab1:
            st.write("Does being more positive lead to more shares?")
            fig, ax = plt.subplots(figsize=(10, 5))

            sns.regplot(
                data=df_display.sample(min(2000, len(df_display))),
                x='global_sentiment_polarity',
                y='shares',
                scatter_kws={'alpha': 0.1, 'color': 'grey'},
                line_kws={'color': 'blue'},
                ax=ax
            )

            # analyze_and_predict returns display strings such as "~1,234";
            # strip the decoration to recover a number for plotting.
            pred_val = float(predicted_shares.replace('~', '').replace(',', ''))
            ax.scatter(float(polarity), pred_val, color='red', s=200, marker='*', zorder=5, label='Your Article')
            ax.legend()
            ax.set_title(f"You are here: Sentiment {polarity}, Shares {int(pred_val)}")
            ax.set_xlabel("Global Sentiment Polarity")
            st.pyplot(fig)
            # Figures created through pyplot stay registered until closed;
            # without this they pile up across Streamlit reruns.
            plt.close(fig)
        with tab3:
            st.write("Distribution of shares by day of week.")
            fig2, ax2 = plt.subplots(figsize=(10, 5))
            day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
            sns.boxplot(
                data=df_display,
                x='day_of_week',
                y='shares',
                order=day_order,
                palette="vlag",
                ax=ax2
            )
            # Mark the day picked in the sidebar with a dashed line and label.
            if publish_day in day_order:
                day_idx = day_order.index(publish_day)
                ax2.axvline(day_idx, color='red', linestyle='--', alpha=0.7)
                ax2.text(day_idx, df_display['shares'].max()*0.9, "Your Publish Day", color='red', ha='center')

            ax2.set_yscale('log')
            ax2.set_title("Share Distribution by Day (Log Scale)")
            st.pyplot(fig2)
            plt.close(fig2)
        with tab4:
            st.write("### What drives the prediction?")
            st.write("This chart shows which features increase (positive) or decrease (negative) the predicted shares.")

            # NOTE(review): assumes model.coef_ is one-dimensional and aligned
            # with feature_columns -- confirm for the shipped model.
            coef_df = pd.DataFrame({
                'Feature': feature_columns,
                'Coefficient': model.coef_
            })

            # A coefficient of exactly zero is labelled 'Negative' by this rule.
            coef_df['Impact'] = coef_df['Coefficient'].apply(lambda x: 'Positive' if x > 0 else 'Negative')
            coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
            coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

            importance_chart = alt.Chart(coef_df).mark_bar().encode(
                x=alt.X('Coefficient', title='Impact on Log Shares'),
                y=alt.Y('Feature', sort='-x', title='Feature Name'),
                color=alt.Color('Impact', scale=alt.Scale(domain=['Positive', 'Negative'], range=['#2ecc71', '#e74c3c'])),
                tooltip=['Feature', 'Coefficient']).properties(title='Feature Importance (Ridge Model Coefficients)')
            st.altair_chart(importance_chart, use_container_width=True)
            st.info(
                "**How to read this:** \n"
                "- **Green bars (Positive):** Increasing this feature generally increases shares.\n"
                "- **Red bars (Negative):** Increasing this feature generally decreases shares.\n"
                "- **Bar Length:** The longer the bar, the stronger the influence."
            )
    else:
        st.warning("Please enter both a Headline and an Article Snippet to run the analysis.")