import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset.
# This runs at module top level, so the CSV is read every time the script is
# executed. The path is relative to the current working directory.
# The resulting frame is shared by all page functions below; the clustering
# and classification pages add 'Cluster' and 'Sentiment' columns to it.
# The pages read the 'reviewText' and 'overall' columns from this frame.
df = pd.read_csv("amazon_reviews.csv")

# --- Page 1: Introduction ---
def page_intro():
    """Render the introduction page.

    Shows the app title, a Markdown overview of what the app does and how
    to navigate it, and a preview of the first rows of the module-level
    ``df``.
    """
    overview_md = """
    ## Dataset Overview
    This dataset contains Amazon product reviews, including the text and corresponding star ratings.

    ### What This App Does:
    1. **Unsupervised Learning**: We'll use **TF-IDF** + **K-Means** clustering to discover hidden topics in the reviews.
    2. **Supervised Learning**: We'll apply **Naive Bayes** to classify sentiment based on the review text.

    ### How to Use:
    - Navigate using the sidebar to each section.
    - Interact with charts and view model outputs.
    - Explore results and gain insights into customer sentiment and topics.
    """

    st.title("📘 Amazon Reviews ML App")
    st.markdown(overview_md)

    # Preview section: heading followed by the head of the shared frame.
    st.markdown("### Dataset Preview")
    preview_rows = df.head()
    st.dataframe(preview_rows)

# --- Page 2: Unsupervised Learning ---
def page_unsupervised():
    """Render the topic-clustering page.

    Vectorizes ``df['reviewText']`` with TF-IDF, clusters the vectors with
    K-Means, stores the assigned cluster id in ``df['Cluster']``, then shows
    a bar chart of cluster sizes and a few sample reviews per cluster.

    The number of clusters, the TF-IDF vocabulary size and the number of
    samples shown per cluster are all chosen through sliders.
    """
    st.title("🔍 Unsupervised Learning: Topic Clustering")

    num_clusters = st.slider("Select number of clusters", 2, 10, 5)
    max_features = st.slider("Max TF-IDF features", 100, 3000, 1000, step=100)

    # Cast to str so non-string entries (e.g. NaN for a missing review)
    # do not break the vectorizer.
    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf.fit_transform(df['reviewText'].astype(str))

    # Fixed random_state keeps cluster assignments reproducible across reruns.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    df['Cluster'] = kmeans.fit_predict(tfidf_matrix)

    st.markdown("### Cluster Distribution")
    cluster_counts = df['Cluster'].value_counts().sort_index()
    st.bar_chart(cluster_counts)

    st.markdown("### Sample Reviews Per Cluster")
    num_samples = st.slider("Number of sample reviews per cluster", 1, 5, 2)
    for i in range(num_clusters):
        st.subheader(f"Cluster {i}")
        cluster_reviews = df.loc[df['Cluster'] == i, 'reviewText']

        if cluster_reviews.empty:
            st.markdown("_No reviews were assigned to this cluster._")
            continue

        # Series.sample raises ValueError when asked for more rows than the
        # cluster holds, so cap the request at the cluster size. When the
        # cluster is large enough the sample is the same as before.
        sample_size = min(num_samples, len(cluster_reviews))
        samples = cluster_reviews.sample(sample_size, random_state=42).tolist()
        for s in samples:
            st.markdown(f"- {s}")

# --- Page 3: Supervised Learning ---
def page_supervised():
    """Render the sentiment-classification page.

    Derives a three-way sentiment label from ``df['overall']`` (<= 2 is
    negative, 3 is neutral, anything higher is positive) and stores it in
    ``df['Sentiment']``. Rows with a missing rating get no label and are
    left out of training and evaluation.

    The labelled reviews are vectorized with TF-IDF and split into train
    and test sets. A Multinomial Naive Bayes model is fitted, and the page
    shows its classification report and confusion matrix on the test set.
    The vocabulary size and the test-set fraction are chosen through sliders.
    """
    st.title("🧠 Supervised Learning: Sentiment Classification")

    def convert_sentiment(star):
        # NaN fails both comparisons below and would otherwise fall through
        # to 'positive', silently mislabelling rows with no rating.
        if pd.isna(star):
            return None
        if star <= 2:
            return 'negative'
        if star == 3:
            return 'neutral'
        return 'positive'

    df['Sentiment'] = df['overall'].apply(convert_sentiment)
    labeled = df.dropna(subset=['Sentiment'])

    max_features = st.slider("Max TF-IDF features (for classification)", 100, 3000, 1000, step=100)
    test_size = st.slider("Test set size (%)", 10, 50, 20, step=5) / 100

    if labeled.empty:
        st.warning("No reviews with a star rating are available to train on.")
        return

    tfidf = TfidfVectorizer(stop_words='english', max_features=max_features)
    X = tfidf.fit_transform(labeled['reviewText'].astype(str))
    y = labeled['Sentiment']

    # A stratified split needs at least two rows of every class; fall back
    # to a plain random split instead of raising when a class is that rare.
    stratify = y if y.value_counts().min() >= 2 else None
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=stratify, test_size=test_size, random_state=42
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    st.markdown("### Classification Report")
    # zero_division=0 yields the same numbers as the default but without a
    # warning when a class is never predicted.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    st.dataframe(pd.DataFrame(report).transpose())

    st.markdown("### Confusion Matrix")
    labels = ['positive', 'neutral', 'negative']
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    fig, ax = plt.subplots()
    # Draw on our own axes explicitly rather than on pyplot's implicit
    # "current axes".
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this each
    # rerun of the page leaks one figure.
    plt.close(fig)

# --- Page 4: Results & Conclusion ---
def page_results():
    """Render the closing page with a static summary.

    The page shows a title followed by fixed Markdown text covering the
    summary, insights and suggested next steps. It reads no data.
    """
    conclusion_md = """
    ### Summary
    - **Unsupervised Learning** helped group reviews into coherent topics using text similarity.
    - **Supervised Learning** effectively predicted sentiment with Naive Bayes and TF-IDF.

    ### Insights
    - Clustering allows business teams to explore latent structures without labels.
    - Sentiment prediction can automate review analysis at scale.

    ### Next Steps
    - Try BERT for more accurate sentiment classification.
    - Explore topic labeling using BERTopic.
    - Add interactivity to select specific reviews or visualize clusters in 2D.
    """

    st.title("📊 Results & Conclusion")
    st.markdown(conclusion_md)

# --- Sidebar Navigation ---
# Sidebar label -> function that renders the corresponding page.
# Dict insertion order is the order the options appear in the radio group.
pages = {
    "1 - Introduction": page_intro,
    "2 - Unsupervised Learning": page_unsupervised,
    "3 - Supervised Learning": page_supervised,
    "4 - Results & Conclusion": page_results,
}

st.sidebar.title("Navigation")

# Iterating a dict yields its keys, so this lists the page labels.
page = st.sidebar.radio("Go to", list(pages))

# Look up the renderer for the selected label and draw that page.
render_page = pages[page]
render_page()