Spaces:

pauli1234
/

BigDataHandsonActivity5

Runtime error

File size: 19,271 Bytes

145e674

# AI-Assisted Code — Academic Integrity Notice
# Generated with The App Builder. ESCP coursework.
# Student must be able to explain all code when asked.

from pathlib import Path
import json
import traceback
import random
import warnings

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA


BASE_DIR = Path(__file__).resolve().parent
ART_DIR = BASE_DIR / "artifacts"
PY_FIG_DIR = ART_DIR / "py" / "figures"
PY_TAB_DIR = ART_DIR / "py" / "tables"
REVIEWS_CSV = BASE_DIR / "synthetic_book_reviews.csv"
SALES_CSV = BASE_DIR / "synthetic_sales_data.csv"


def ensure_dirs():
    """Create artifact folders used by the dashboard."""
    for path in [PY_FIG_DIR, PY_TAB_DIR]:
        path.mkdir(parents=True, exist_ok=True)


def artifacts_index():
    """Return the current list of generated files."""
    def list_ext(folder: Path, exts: tuple[str, ...]):
        if not folder.exists():
            return []
        return sorted([p.name for p in folder.iterdir() if p.is_file() and p.suffix.lower() in exts])

    return {
        "python": {
            "figures": list_ext(PY_FIG_DIR, (".png", ".jpg", ".jpeg")),
            "tables": list_ext(PY_TAB_DIR, (".csv", ".json")),
        }
    }


def load_inputs():
    """Load the uploaded input datasets."""
    if not REVIEWS_CSV.exists():
        raise FileNotFoundError("synthetic_book_reviews.csv not found.")
    if not SALES_CSV.exists():
        raise FileNotFoundError("synthetic_sales_data.csv not found.")

    df_reviews = pd.read_csv(REVIEWS_CSV)
    df_sales = pd.read_csv(SALES_CSV)

    if "title" not in df_reviews.columns:
        raise ValueError("Reviews CSV must contain a 'title' column.")
    if "title" not in df_sales.columns or "month" not in df_sales.columns or "units_sold" not in df_sales.columns:
        raise ValueError("Sales CSV must contain 'title', 'month', and 'units_sold' columns.")

    return df_reviews, df_sales


def add_review_text(df_reviews: pd.DataFrame) -> pd.DataFrame:
    """Recreate simple review text because the provided CSV has no text field."""
    if "review_text" in df_reviews.columns:
        return df_reviews.copy()

    random.seed(42)
    templates = {
        "One": [
            "Terrible quality and very disappointing.",
            "I would not recommend this book.",
            "Weak writing and poor value for money.",
        ],
        "Two": [
            "Some good ideas, but overall underwhelming.",
            "Average read with a few frustrating parts.",
            "Not the worst, but I expected better.",
        ],
        "Three": [
            "An okay book with mixed strengths and weaknesses.",
            "Decent overall and worth considering.",
            "Balanced experience, neither great nor bad.",
        ],
        "Four": [
            "Enjoyable and well written.",
            "A strong book with a lot to like.",
            "Very good read and easy to recommend.",
        ],
        "Five": [
            "Excellent book and highly recommended.",
            "Outstanding quality from start to finish.",
            "One of the best books in this sample.",
        ],
    }

    def build_text(row):
        rating = str(row.get("rating", "Three")).strip()
        choices = templates.get(rating, templates["Three"])
        return random.choice(choices)

    out = df_reviews.copy()
    out["review_text"] = out.apply(build_text, axis=1)
    return out


def classify_sentiment(df_reviews: pd.DataFrame) -> pd.DataFrame:
    """Score review text with VADER and map to labels."""
    analyzer = SentimentIntensityAnalyzer()
    out = df_reviews.copy()

    def label_text(text: str):
        score = analyzer.polarity_scores(str(text))["compound"]
        if score >= 0.05:
            return "positive"
        if score <= -0.05:
            return "negative"
        return "neutral"

    out["sentiment_label"] = out["review_text"].apply(label_text)
    return out


def build_dashboard_table(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame:
    """Create the monthly dashboard table."""
    sales = df_sales.copy()
    sales["month"] = pd.to_datetime(sales["month"], errors="coerce")

    revenue_df = sales.merge(df_reviews[["title", "price"]].drop_duplicates(), on="title", how="left")
    revenue_df["estimated_revenue"] = revenue_df["units_sold"] * revenue_df["price"]

    dashboard = (
        revenue_df.groupby("month", as_index=False)
        .agg(
            total_units_sold=("units_sold", "sum"),
            avg_units_sold=("units_sold", "mean"),
            total_revenue=("estimated_revenue", "sum"),
        )
        .sort_values("month")
    )
    return dashboard


def build_top_titles(df_sales: pd.DataFrame) -> pd.DataFrame:
    """Aggregate total units sold by title."""
    return (
        df_sales.groupby("title", as_index=False)["units_sold"]
        .sum()
        .sort_values("units_sold", ascending=False)
        .rename(columns={"units_sold": "total_units_sold"})
    )


def build_pricing_decisions(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame:
    """Create a simple pricing recommendation table."""
    merged = df_sales.merge(df_reviews[["title", "price", "rating"]].drop_duplicates(), on="title", how="left")
    agg = (
        merged.groupby(["title", "price", "rating"], as_index=False)["units_sold"]
        .sum()
        .rename(columns={"units_sold": "total_units_sold"})
        .sort_values("total_units_sold", ascending=False)
    )

    def recommend(row):
        rating = str(row["rating"])
        units = float(row["total_units_sold"])
        if rating in {"Four", "Five"} and units > agg["total_units_sold"].median():
            return "Consider premium pricing"
        if rating in {"One", "Two"}:
            return "Consider discounting"
        return "Keep current price"

    agg["pricing_decision"] = agg.apply(recommend, axis=1)
    return agg


def save_sales_figure(df_dashboard: pd.DataFrame):
    """Save the monthly overview figure."""
    plt.figure(figsize=(10, 5))
    plt.plot(df_dashboard["month"], df_dashboard["total_units_sold"], marker="o")
    plt.title("Monthly Units Sold")
    plt.xlabel("Month")
    plt.ylabel("Units Sold")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(PY_FIG_DIR / "sales_trends_sampled_titles.png", dpi=150)
    plt.close()


def save_sentiment_figure(df_sentiment: pd.DataFrame):
    """Save the sentiment distribution figure."""
    counts = (
        df_sentiment.groupby(["title", "sentiment_label"])
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    for col in ["negative", "neutral", "positive"]:
        if col not in counts.columns:
            counts[col] = 0

    top = counts.head(20).copy()
    top = top.sort_values("positive", ascending=True)

    plt.figure(figsize=(11, 8))
    plt.barh(top["title"], top["negative"], label="negative")
    plt.barh(top["title"], top["neutral"], left=top["negative"], label="neutral")
    plt.barh(top["title"], top["positive"], left=top["negative"] + top["neutral"], label="positive")
    plt.title("Sentiment Distribution by Book")
    plt.xlabel("Number of Reviews")
    plt.ylabel("Book Title")
    plt.legend()
    plt.tight_layout()
    plt.savefig(PY_FIG_DIR / "sentiment_distribution_sampled_titles.png", dpi=150)
    plt.close()

    counts.rename(columns={"title": "grouped_title"}).to_csv(
        PY_TAB_DIR / "sentiment_counts_sampled.csv", index=False
    )


def save_arima_figure(df_sales: pd.DataFrame):
    """Fit a lightweight ARIMA forecast for the top title and save a figure."""
    top_title = (
        df_sales.groupby("title")["units_sold"].sum().sort_values(ascending=False).index[0]
    )
    title_sales = (
        df_sales[df_sales["title"] == top_title]
        .copy()
        .sort_values("month")
    )
    title_sales["month"] = pd.to_datetime(title_sales["month"], errors="coerce")
    title_sales = title_sales.dropna(subset=["month"]).set_index("month")

    if len(title_sales) < 6:
        return

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = ARIMA(title_sales["units_sold"], order=(1, 1, 1))
        fitted = model.fit()
        forecast = fitted.forecast(steps=6)

    forecast_index = pd.date_range(
        start=title_sales.index.max() + pd.offsets.MonthBegin(1),
        periods=6,
        freq="MS",
    )

    plt.figure(figsize=(10, 5))
    plt.plot(title_sales.index, title_sales["units_sold"], marker="o", label="Observed")
    plt.plot(forecast_index, forecast.values, linestyle="--", marker="o", label="Forecast")
    plt.title(f"ARIMA Forecast for Top Title: {top_title}")
    plt.xlabel("Month")
    plt.ylabel("Units Sold")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(PY_FIG_DIR / "arima_forecast_sampled_titles.png", dpi=150)
    plt.close()


def save_kpis(df_sales: pd.DataFrame, df_dashboard: pd.DataFrame):
    """Save KPI summary JSON."""
    payload = {
        "n_titles": int(df_sales["title"].nunique()),
        "n_months": int(df_dashboard["month"].nunique()),
        "total_units_sold": float(df_sales["units_sold"].sum()),
    }
    if "total_revenue" in df_dashboard.columns:
        payload["total_revenue"] = float(df_dashboard["total_revenue"].sum())

    with open(PY_FIG_DIR / "kpis.json", "w", encoding="utf-8") as file:
        json.dump(payload, file, indent=2)


def run_pipeline():
    """Run the notebook logic as normal Python functions."""
    try:
        ensure_dirs()
        df_reviews_raw, df_sales = load_inputs()
        df_reviews = add_review_text(df_reviews_raw)
        df_sentiment = classify_sentiment(df_reviews)
        df_dashboard = build_dashboard_table(df_sales, df_reviews)
        df_top = build_top_titles(df_sales)
        df_pricing = build_pricing_decisions(df_sales, df_reviews)

        df_dashboard.to_csv(PY_TAB_DIR / "df_dashboard.csv", index=False)
        df_top.to_csv(PY_TAB_DIR / "top_titles_by_units_sold.csv", index=False)
        df_pricing.to_csv(PY_TAB_DIR / "pricing_decisions.csv", index=False)

        save_sales_figure(df_dashboard)
        save_sentiment_figure(df_sentiment)
        save_arima_figure(df_sales)
        save_kpis(df_sales, df_dashboard)

        return (
            "Pipeline completed successfully.\n\n"
            f"Rows in reviews: {len(df_reviews):,}\n"
            f"Rows in sales: {len(df_sales):,}\n\n"
            f"Generated figures: {', '.join(artifacts_index()['python']['figures'])}\n"
            f"Generated tables: {', '.join(artifacts_index()['python']['tables'])}"
        )
    except Exception as error:
        return f"Pipeline failed:\n{error}\n\n{traceback.format_exc()}"


def load_kpis():
    """Load KPI JSON if available."""
    path = PY_FIG_DIR / "kpis.json"
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {}


def render_kpi_cards():
    """Render simple HTML KPI cards."""
    kpis = load_kpis()
    if not kpis:
        return "<div style='padding:18px;border:1px solid #ddd;border-radius:12px;'>Run the pipeline to generate KPIs.</div>"

    html = "<div style='display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;'>"
    for key, value in kpis.items():
        label = key.replace("_", " ").title()
        if isinstance(value, (int, float)):
            value = f"{value:,.0f}"
        html += (
            "<div style='padding:16px;border-radius:14px;border:1px solid #ddd;background:white;'>"
            f"<div style='font-size:12px;color:#666;margin-bottom:6px;'>{label}</div>"
            f"<div style='font-size:24px;font-weight:700;'>{value}</div>"
            "</div>"
        )
    html += "</div>"
    return html


def _load_table(path: Path):
    """Load CSV or JSON into a DataFrame preview."""
    if not path.exists():
        return pd.DataFrame([{"error": f"Missing file: {path.name}"}])
    if path.suffix == ".json":
        obj = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(obj, dict):
            return pd.DataFrame([obj])
        return pd.DataFrame(obj)
    return pd.read_csv(path)


def build_sales_chart():
    """Interactive plotly version of the monthly overview."""
    path = PY_TAB_DIR / "df_dashboard.csv"
    if not path.exists():
        return go.Figure().update_layout(title="Run the pipeline first")
    df = pd.read_csv(path)
    df["month"] = pd.to_datetime(df["month"], errors="coerce")
    fig = go.Figure()
    for col in [c for c in df.columns if c != "month"]:
        fig.add_trace(go.Scatter(x=df["month"], y=df[col], mode="lines+markers", name=col))
    fig.update_layout(title="Monthly Overview", template="plotly_white", height=420)
    return fig


def build_sentiment_chart():
    """Interactive stacked chart for sentiment counts."""
    path = PY_TAB_DIR / "sentiment_counts_sampled.csv"
    if not path.exists():
        return go.Figure().update_layout(title="Run the pipeline first")
    df = pd.read_csv(path).head(20)
    fig = go.Figure()
    for col in ["negative", "neutral", "positive"]:
        if col in df.columns:
            fig.add_trace(go.Bar(y=df["grouped_title"], x=df[col], orientation="h", name=col))
    fig.update_layout(title="Sentiment Distribution", barmode="stack", template="plotly_white", height=500)
    return fig


def build_top_chart():
    """Interactive top sellers chart."""
    path = PY_TAB_DIR / "top_titles_by_units_sold.csv"
    if not path.exists():
        return go.Figure().update_layout(title="Run the pipeline first")
    df = pd.read_csv(path).head(15)
    title_col = "title" if "title" in df.columns else df.columns[0]
    val_col = "total_units_sold" if "total_units_sold" in df.columns else df.columns[-1]
    fig = go.Figure(go.Bar(y=df[title_col], x=df[val_col], orientation="h"))
    fig.update_layout(title="Top Sellers", template="plotly_white", height=500)
    return fig


def refresh_gallery():
    """Refresh static figures and the first available table."""
    figures = []
    for path in sorted(PY_FIG_DIR.glob("*.png")):
        figures.append((str(path), path.stem.replace("_", " ").title()))

    table_choices = artifacts_index()["python"]["tables"]
    first_df = pd.DataFrame()
    first_choice = None
    if table_choices:
        first_choice = table_choices[0]
        first_df = _load_table(PY_TAB_DIR / first_choice)

    return figures, gr.update(choices=table_choices, value=first_choice), first_df


def on_table_select(choice):
    """Load the selected table preview."""
    if not choice:
        return pd.DataFrame([{"hint": "Select a table above."}])
    return _load_table(PY_TAB_DIR / choice)


def refresh_dashboard():
    """Refresh KPI cards and all interactive charts."""
    return render_kpi_cards(), build_sales_chart(), build_sentiment_chart(), build_top_chart()


def ai_reply(question: str):
    """Simple keyword routing for the AI tab."""
    msg = (question or "").lower()
    kpis = load_kpis()
    summary = ""
    if kpis:
        summary = (
            f"We currently have {int(kpis.get('n_titles', 0))} titles and "
            f"{int(kpis.get('total_units_sold', 0)):,} total units sold."
        )

    if any(word in msg for word in ["sentiment", "review", "positive", "negative"]):
        return f"Here is the sentiment view. {summary}", build_sentiment_chart(), None
    if any(word in msg for word in ["top", "seller", "best", "popular"]):
        return f"Here are the top sellers. {summary}", None, _load_table(PY_TAB_DIR / "top_titles_by_units_sold.csv")
    if any(word in msg for word in ["price", "pricing"]):
        return f"Here are the pricing decisions. {summary}", None, _load_table(PY_TAB_DIR / "pricing_decisions.csv")
    return f"Here is the monthly overview. {summary}", build_sales_chart(), None


def ai_chat(user_msg, history):
    """Chat wrapper compatible with Gradio Chatbot."""
    if not user_msg:
        return history, "", None, None
    reply, chart, table = ai_reply(user_msg)
    history = (history or []) + [{"role": "user", "content": user_msg}, {"role": "assistant", "content": reply}]
    return history, "", chart, table


def load_css():
    """Load optional CSS file."""
    path = BASE_DIR / "style.css"
    return path.read_text(encoding="utf-8") if path.exists() else ""


ensure_dirs()

with gr.Blocks(title="AIBDM 2026 Workshop App") as demo:
    gr.Markdown(
        "# SE21 App Template\n"
        "*Notebook converted into a Hugging Face Space*",
        elem_id="escp_title",
    )

    with gr.Tab("Pipeline Runner"):
        gr.Markdown("Run the full notebook logic with the uploaded CSV files.")
        run_btn = gr.Button("Run Full Pipeline", variant="primary")
        run_log = gr.Textbox(label="Execution Log", lines=18, max_lines=30, interactive=False)
        run_btn.click(run_pipeline, outputs=[run_log])

    with gr.Tab("Dashboard"):
        kpi_html = gr.HTML(value=render_kpi_cards())
        refresh_btn = gr.Button("Refresh Dashboard", variant="primary")

        gr.Markdown("#### Interactive Charts")
        chart_sales = gr.Plot(label="Monthly Overview")
        chart_sentiment = gr.Plot(label="Sentiment Distribution")
        chart_top = gr.Plot(label="Top Sellers")

        gr.Markdown("#### Static Figures")
        gallery = gr.Gallery(label="Generated Figures", columns=2, height=480, object_fit="contain")

        gr.Markdown("#### Data Tables")
        table_dropdown = gr.Dropdown(label="Select a table to view", choices=[], interactive=True)
        table_display = gr.Dataframe(label="Table Preview", interactive=False)

        def _on_refresh():
            kpi, c1, c2, c3 = refresh_dashboard()
            figs, dd, df = refresh_gallery()
            return kpi, c1, c2, c3, figs, dd, df

        refresh_btn.click(
            _on_refresh,
            outputs=[kpi_html, chart_sales, chart_sentiment, chart_top, gallery, table_dropdown, table_display],
        )
        table_dropdown.change(on_table_select, inputs=[table_dropdown], outputs=[table_display])

    with gr.Tab('"AI" Dashboard'):
        gr.Markdown(
            "### Ask questions about your data\n"
            "This version uses built-in keyword routing, so it works without API keys."
        )
        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                chatbot = gr.Chatbot(label="Conversation", type="messages", height=380)
                user_input = gr.Textbox(
                    label="Ask about your data",
                    placeholder="e.g. Show me sales trends / top sellers / sentiment",
                    lines=1,
                )
            with gr.Column(scale=1):
                ai_figure = gr.Plot(label="Interactive Chart")
                ai_table = gr.Dataframe(label="Data Table", interactive=False)

        user_input.submit(
            ai_chat,
            inputs=[user_input, chatbot],
            outputs=[chatbot, user_input, ai_figure, ai_table],
        )

demo.launch(css=load_css())