Spaces:

ktara
/

Session5

Build error

File size: 7,103 Bytes

# AI-Assisted Code — Academic Integrity Notice
# Generated with The App Builder. ESCP coursework.
# Student must be able to explain all code when asked.

"""Gradio Space that runs the fixed notebook workflow on bundled CSV files."""

from pathlib import Path
import shutil
import warnings
import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from itertools import product
from zipfile import ZipFile

# Input CSV files, resolved relative to the working directory.
DATA_REVIEWS = "synthetic_book_reviews.csv"
DATA_SALES = "synthetic_sales_data.csv"
# Output layout: artifacts/figures for PNG charts, artifacts/tables for CSVs.
ART_DIR = Path("artifacts")
FIG_DIR = ART_DIR / "figures"
TAB_DIR = ART_DIR / "tables"


def ensure_dirs():
    """Make sure the figure and table output directories exist."""
    for folder in (FIG_DIR, TAB_DIR):
        # parents=True also creates any missing parent directory;
        # exist_ok=True makes repeated calls harmless.
        folder.mkdir(parents=True, exist_ok=True)


def load_data(reviews_path=None, sales_path=None):
    """Load the review and sales datasets and validate their columns.

    Args:
        reviews_path: Path of the reviews CSV. When omitted, the
            ``DATA_REVIEWS`` file is used.
        sales_path: Path of the sales CSV. When omitted, the
            ``DATA_SALES`` file is used.

    Returns:
        A ``(reviews, sales)`` tuple of DataFrames.

    Raises:
        ValueError: If either file lacks one of its required columns.
    """
    reviews = pd.read_csv(DATA_REVIEWS if reviews_path is None else reviews_path)
    sales = pd.read_csv(DATA_SALES if sales_path is None else sales_path)

    # "sentiment_label" is grouped on by save_sentiment_chart and
    # save_decision_table, so check for it here instead of failing later
    # with a bare KeyError in the middle of the pipeline.
    required_reviews = {"title", "review_text", "rating", "popularity_score", "sentiment_label"}
    required_sales = {"title", "month", "units_sold"}

    missing_reviews = required_reviews.difference(reviews.columns)
    if missing_reviews:
        # Sorted so the message is stable from run to run.
        raise ValueError(f"Missing review columns: {sorted(missing_reviews)}")

    missing_sales = required_sales.difference(sales.columns)
    if missing_sales:
        raise ValueError(f"Missing sales columns: {sorted(missing_sales)}")

    return reviews, sales


def build_sample_titles(reviews):
    """Return up to five distinct titles for every popularity score.

    Scores are visited in ascending order; within a score, titles keep
    their order of first appearance. Rows with a missing score or a
    missing title are ignored.
    """
    scores = reviews["popularity_score"]
    picked = []
    for level in sorted(scores.dropna().unique()):
        titles_at_level = reviews.loc[scores == level, "title"]
        distinct = titles_at_level.dropna().unique().tolist()
        picked += distinct[:5]
    return picked


def save_sales_trend_chart(sampled_sales, sampled_books, sampled_titles):
    """Plot units sold over time for each sampled title and save the figure.

    One line is drawn per title, coloured by the popularity score found in
    that title's first row of ``sampled_books``. Scores outside 1-5, and
    titles with no row in ``sampled_books``, are drawn in gray.

    Args:
        sampled_sales: DataFrame with ``title``, ``month`` and
            ``units_sold`` columns.
        sampled_books: DataFrame with ``title`` and ``popularity_score``
            columns.
        sampled_titles: Iterable of titles to plot, in legend order.

    Returns:
        Path of the saved PNG as a string.
    """
    popularity_colors = {1: "darkred", 2: "orangered", 3: "gold", 4: "mediumseagreen", 5: "royalblue"}
    out = FIG_DIR / "sales_trends_sampled_titles.png"
    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        for title in sampled_titles:
            scores = sampled_books.loc[sampled_books["title"] == title, "popularity_score"]
            # A title without a matching row falls back to the gray default
            # instead of raising IndexError.
            score = scores.iloc[0] if not scores.empty else None
            subset = sampled_sales[sampled_sales["title"] == title]
            # Lines connect points in row order, so sort chronologically to
            # avoid zig-zags when the CSV rows are not already ordered.
            subset = subset.sort_values("month", kind="stable")
            ax.plot(subset["month"], subset["units_sold"], label=title,
                    color=popularity_colors.get(score, "gray"))
        ax.set_title("Sales Trends Over Time")
        ax.set_xlabel("Month")
        ax.set_ylabel("Units Sold")
        ax.tick_params(axis="x", rotation=45)
        ax.grid(True, alpha=0.3)
        # Legend sits outside the axes on the right so it does not cover lines.
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), fontsize="small")
        fig.tight_layout()
        fig.savefig(out, dpi=150, bbox_inches="tight")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)
    return str(out)


def save_sentiment_chart(sampled_reviews):
    """Save the stacked sentiment chart and its underlying counts table.

    Reviews are grouped by a "<rating>★ | <title>" label and by
    ``sentiment_label``. The counts are written to
    ``sentiment_counts_sampled.csv`` and drawn as a horizontal stacked bar
    chart.

    Args:
        sampled_reviews: DataFrame with ``rating``, ``title`` and
            ``sentiment_label`` columns. The caller's frame is not modified.

    Returns:
        Path of the saved PNG as a string.
    """
    sampled_reviews = sampled_reviews.copy()
    sampled_reviews["grouped_title"] = sampled_reviews["rating"].astype(str) + "★ | " + sampled_reviews["title"]
    counts = sampled_reviews.groupby(["grouped_title", "sentiment_label"]).size().unstack(fill_value=0)
    # Force a fixed column order; labels absent from the data become zero columns.
    counts = counts.reindex(columns=["negative", "neutral", "positive"], fill_value=0)
    counts.reset_index().to_csv(TAB_DIR / "sentiment_counts_sampled.csv", index=False)

    out = FIG_DIR / "sentiment_distribution_sampled_titles.png"
    fig, ax = plt.subplots(figsize=(12, 12))
    try:
        counts.plot.barh(stacked=True, ax=ax, color={"negative": "royalblue", "neutral": "lightgray", "positive": "crimson"})
        ax.set_title("Sentiment Distribution in Reviews")
        ax.set_xlabel("Number of Reviews")
        ax.set_ylabel("Book Title")
        ax.grid(axis="x", linestyle="--", alpha=0.4)
        fig.tight_layout()
        fig.savefig(out, dpi=150, bbox_inches="tight")
    finally:
        # Close the figure even if plotting or saving raises, so failed runs
        # do not leave open figures behind.
        plt.close(fig)
    return str(out)


def pricing_action(row):
    """Map one title's sales and sentiment figures to a pricing action.

    Strong sellers (average units >= 120) with at least 60% positive reviews
    get a price increase; weak sellers (average units <= 60) with at least
    40% negative reviews get a decrease; everything else is left alone.
    Missing ratio fields count as 0.
    """
    units = row["avg_units_sold"]
    if units >= 120:
        if row.get("positive_ratio", 0) >= 0.6:
            return "increase price"
    elif units <= 60:
        if row.get("negative_ratio", 0) >= 0.4:
            return "decrease price"
    return "keep price"


def save_decision_table(reviews, sales):
    """Build the per-title pricing decision table, save it, and return it.

    Combines average units sold per title with the share of positive and
    negative reviews, then applies ``pricing_action`` row by row. The result
    is written to ``pricing_decisions.csv`` and returned sorted by title.
    """
    mean_units = sales.groupby("title", as_index=False)["units_sold"].mean()
    avg_sales = mean_units.rename(columns={"units_sold": "avg_units_sold"})

    label_counts = reviews.groupby(["title", "sentiment_label"]).size()
    sentiment = label_counts.unstack(fill_value=0)
    sentiment["total"] = sentiment.sum(axis=1)
    for label in ("positive", "negative"):
        # .get falls back to 0 when a label never occurs in the reviews.
        sentiment[f"{label}_ratio"] = sentiment.get(label, 0) / sentiment["total"]

    # Left join keeps titles that have sales but no reviews; their ratios become 0.
    decisions = avg_sales.merge(sentiment, on="title", how="left").fillna(0)
    decisions["pricing_action"] = decisions.apply(pricing_action, axis=1)

    final_cols = ["title", "avg_units_sold", "positive_ratio", "negative_ratio", "pricing_action"]
    final_df = decisions[final_cols].sort_values("title").reset_index(drop=True)
    final_df.to_csv(TAB_DIR / "pricing_decisions.csv", index=False)
    return final_df


def save_dashboard_export(sales):
    """Write total units sold per month to ``df_dashboard.csv``."""
    monthly_totals = sales.groupby("month", as_index=False).agg(
        total_units_sold=("units_sold", "sum"),
    )
    ordered = monthly_totals.sort_values("month")
    target = TAB_DIR / "df_dashboard.csv"
    ordered.to_csv(target, index=False)


def bundle_exports():
    """Pack every generated figure and table into one zip and return its path.

    Archive member names are relative to the artifacts directory, so the
    zip preserves the figures/tables folder split.
    """
    zip_path = ART_DIR / "exports.zip"
    with ZipFile(zip_path, "w") as archive:
        members = [*FIG_DIR.glob("*"), *TAB_DIR.glob("*")]
        for member in members:
            archive.write(member, arcname=member.relative_to(ART_DIR))
    return str(zip_path)


def run_analysis():
    """Execute the whole workflow and return the outputs shown in the UI.

    Returns:
        A tuple of (sales chart path, sentiment chart path, pricing decision
        DataFrame, path of the zip with all exports).
    """
    ensure_dirs()
    reviews, sales = load_data()
    sales["month"] = pd.to_datetime(sales["month"])

    sampled_titles = build_sample_titles(reviews)
    sales_in_sample = sales["title"].isin(sampled_titles)
    reviews_in_sample = reviews["title"].isin(sampled_titles)

    sampled_sales = sales[sales_in_sample].copy()
    # The review rows serve both as review data and as per-title metadata;
    # each consumer gets its own copy.
    sampled_reviews = reviews[reviews_in_sample].copy()
    sampled_books = reviews[reviews_in_sample].copy()

    sales_png = save_sales_trend_chart(sampled_sales, sampled_books, sampled_titles)
    sentiment_png = save_sentiment_chart(sampled_reviews)
    decision_df = save_decision_table(reviews, sales)
    save_dashboard_export(sales)
    # Zip last so the archive contains everything written above.
    archive_path = bundle_exports()
    return sales_png, sentiment_png, decision_df, archive_path


# Build the UI at import time so `demo` exists as a module-level object.
with gr.Blocks() as demo:
    gr.Markdown("# Book Analytics Dashboard")
    gr.Markdown("Runs the fixed notebook workflow on the bundled review and sales datasets.")
    run_btn = gr.Button("Run analysis")
    # Output components, declared in the order they appear on the page.
    sales_chart = gr.Image(label="Sales trends")
    sentiment_chart = gr.Image(label="Sentiment distribution")
    decision_table = gr.Dataframe(label="Pricing decisions")
    exports = gr.File(label="Download all exports")
    # run_analysis takes no inputs; its four return values map positionally
    # onto the four output components listed here.
    run_btn.click(fn=run_analysis, inputs=None, outputs=[sales_chart, sentiment_chart, decision_table, exports])

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()