# AI-Assisted Code — Academic Integrity Notice # Generated with The App Builder. ESCP coursework. # Student must be able to explain all code when asked. from pathlib import Path import json import traceback import random import warnings import gradio as gr import pandas as pd import plotly.graph_objects as go from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt from statsmodels.tsa.arima.model import ARIMA BASE_DIR = Path(__file__).resolve().parent ART_DIR = BASE_DIR / "artifacts" PY_FIG_DIR = ART_DIR / "py" / "figures" PY_TAB_DIR = ART_DIR / "py" / "tables" REVIEWS_CSV = BASE_DIR / "synthetic_book_reviews.csv" SALES_CSV = BASE_DIR / "synthetic_sales_data.csv" def ensure_dirs(): """Create artifact folders used by the dashboard.""" for path in [PY_FIG_DIR, PY_TAB_DIR]: path.mkdir(parents=True, exist_ok=True) def artifacts_index(): """Return the current list of generated files.""" def list_ext(folder: Path, exts: tuple[str, ...]): if not folder.exists(): return [] return sorted([p.name for p in folder.iterdir() if p.is_file() and p.suffix.lower() in exts]) return { "python": { "figures": list_ext(PY_FIG_DIR, (".png", ".jpg", ".jpeg")), "tables": list_ext(PY_TAB_DIR, (".csv", ".json")), } } def load_inputs(): """Load the uploaded input datasets.""" if not REVIEWS_CSV.exists(): raise FileNotFoundError("synthetic_book_reviews.csv not found.") if not SALES_CSV.exists(): raise FileNotFoundError("synthetic_sales_data.csv not found.") df_reviews = pd.read_csv(REVIEWS_CSV) df_sales = pd.read_csv(SALES_CSV) if "title" not in df_reviews.columns: raise ValueError("Reviews CSV must contain a 'title' column.") if "title" not in df_sales.columns or "month" not in df_sales.columns or "units_sold" not in df_sales.columns: raise ValueError("Sales CSV must contain 'title', 'month', and 'units_sold' columns.") return df_reviews, df_sales def add_review_text(df_reviews: pd.DataFrame) -> pd.DataFrame: """Recreate simple review text because the provided CSV has no text field.""" if "review_text" in df_reviews.columns: return df_reviews.copy() random.seed(42) templates = { "One": [ "Terrible quality and very disappointing.", "I would not recommend this book.", "Weak writing and poor value for money.", ], "Two": [ "Some good ideas, but overall underwhelming.", "Average read with a few frustrating parts.", "Not the worst, but I expected better.", ], "Three": [ "An okay book with mixed strengths and weaknesses.", "Decent overall and worth considering.", "Balanced experience, neither great nor bad.", ], "Four": [ "Enjoyable and well written.", "A strong book with a lot to like.", "Very good read and easy to recommend.", ], "Five": [ "Excellent book and highly recommended.", "Outstanding quality from start to finish.", "One of the best books in this sample.", ], } def build_text(row): rating = str(row.get("rating", "Three")).strip() choices = templates.get(rating, templates["Three"]) return random.choice(choices) out = df_reviews.copy() out["review_text"] = out.apply(build_text, axis=1) return out def classify_sentiment(df_reviews: pd.DataFrame) -> pd.DataFrame: """Score review text with VADER and map to labels.""" analyzer = SentimentIntensityAnalyzer() out = df_reviews.copy() def label_text(text: str): score = analyzer.polarity_scores(str(text))["compound"] if score >= 0.05: return "positive" if score <= -0.05: return "negative" return "neutral" out["sentiment_label"] = out["review_text"].apply(label_text) return out def build_dashboard_table(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame: """Create the monthly dashboard table.""" sales = df_sales.copy() sales["month"] = pd.to_datetime(sales["month"], errors="coerce") revenue_df = sales.merge(df_reviews[["title", "price"]].drop_duplicates(), on="title", how="left") revenue_df["estimated_revenue"] = revenue_df["units_sold"] * revenue_df["price"] dashboard = ( revenue_df.groupby("month", as_index=False) .agg( total_units_sold=("units_sold", "sum"), avg_units_sold=("units_sold", "mean"), total_revenue=("estimated_revenue", "sum"), ) .sort_values("month") ) return dashboard def build_top_titles(df_sales: pd.DataFrame) -> pd.DataFrame: """Aggregate total units sold by title.""" return ( df_sales.groupby("title", as_index=False)["units_sold"] .sum() .sort_values("units_sold", ascending=False) .rename(columns={"units_sold": "total_units_sold"}) ) def build_pricing_decisions(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame: """Create a simple pricing recommendation table.""" merged = df_sales.merge(df_reviews[["title", "price", "rating"]].drop_duplicates(), on="title", how="left") agg = ( merged.groupby(["title", "price", "rating"], as_index=False)["units_sold"] .sum() .rename(columns={"units_sold": "total_units_sold"}) .sort_values("total_units_sold", ascending=False) ) def recommend(row): rating = str(row["rating"]) units = float(row["total_units_sold"]) if rating in {"Four", "Five"} and units > agg["total_units_sold"].median(): return "Consider premium pricing" if rating in {"One", "Two"}: return "Consider discounting" return "Keep current price" agg["pricing_decision"] = agg.apply(recommend, axis=1) return agg def save_sales_figure(df_dashboard: pd.DataFrame): """Save the monthly overview figure.""" plt.figure(figsize=(10, 5)) plt.plot(df_dashboard["month"], df_dashboard["total_units_sold"], marker="o") plt.title("Monthly Units Sold") plt.xlabel("Month") plt.ylabel("Units Sold") plt.xticks(rotation=45) plt.tight_layout() plt.savefig(PY_FIG_DIR / "sales_trends_sampled_titles.png", dpi=150) plt.close() def save_sentiment_figure(df_sentiment: pd.DataFrame): """Save the sentiment distribution figure.""" counts = ( df_sentiment.groupby(["title", "sentiment_label"]) .size() .unstack(fill_value=0) .reset_index() ) for col in ["negative", "neutral", "positive"]: if col not in counts.columns: counts[col] = 0 top = counts.head(20).copy() top = top.sort_values("positive", ascending=True) plt.figure(figsize=(11, 8)) plt.barh(top["title"], top["negative"], label="negative") plt.barh(top["title"], top["neutral"], left=top["negative"], label="neutral") plt.barh(top["title"], top["positive"], left=top["negative"] + top["neutral"], label="positive") plt.title("Sentiment Distribution by Book") plt.xlabel("Number of Reviews") plt.ylabel("Book Title") plt.legend() plt.tight_layout() plt.savefig(PY_FIG_DIR / "sentiment_distribution_sampled_titles.png", dpi=150) plt.close() counts.rename(columns={"title": "grouped_title"}).to_csv( PY_TAB_DIR / "sentiment_counts_sampled.csv", index=False ) def save_arima_figure(df_sales: pd.DataFrame): """Fit a lightweight ARIMA forecast for the top title and save a figure.""" top_title = ( df_sales.groupby("title")["units_sold"].sum().sort_values(ascending=False).index[0] ) title_sales = ( df_sales[df_sales["title"] == top_title] .copy() .sort_values("month") ) title_sales["month"] = pd.to_datetime(title_sales["month"], errors="coerce") title_sales = title_sales.dropna(subset=["month"]).set_index("month") if len(title_sales) < 6: return with warnings.catch_warnings(): warnings.simplefilter("ignore") model = ARIMA(title_sales["units_sold"], order=(1, 1, 1)) fitted = model.fit() forecast = fitted.forecast(steps=6) forecast_index = pd.date_range( start=title_sales.index.max() + pd.offsets.MonthBegin(1), periods=6, freq="MS", ) plt.figure(figsize=(10, 5)) plt.plot(title_sales.index, title_sales["units_sold"], marker="o", label="Observed") plt.plot(forecast_index, forecast.values, linestyle="--", marker="o", label="Forecast") plt.title(f"ARIMA Forecast for Top Title: {top_title}") plt.xlabel("Month") plt.ylabel("Units Sold") plt.legend() plt.xticks(rotation=45) plt.tight_layout() plt.savefig(PY_FIG_DIR / "arima_forecast_sampled_titles.png", dpi=150) plt.close() def save_kpis(df_sales: pd.DataFrame, df_dashboard: pd.DataFrame): """Save KPI summary JSON.""" payload = { "n_titles": int(df_sales["title"].nunique()), "n_months": int(df_dashboard["month"].nunique()), "total_units_sold": float(df_sales["units_sold"].sum()), } if "total_revenue" in df_dashboard.columns: payload["total_revenue"] = float(df_dashboard["total_revenue"].sum()) with open(PY_FIG_DIR / "kpis.json", "w", encoding="utf-8") as file: json.dump(payload, file, indent=2) def run_pipeline(): """Run the notebook logic as normal Python functions.""" try: ensure_dirs() df_reviews_raw, df_sales = load_inputs() df_reviews = add_review_text(df_reviews_raw) df_sentiment = classify_sentiment(df_reviews) df_dashboard = build_dashboard_table(df_sales, df_reviews) df_top = build_top_titles(df_sales) df_pricing = build_pricing_decisions(df_sales, df_reviews) df_dashboard.to_csv(PY_TAB_DIR / "df_dashboard.csv", index=False) df_top.to_csv(PY_TAB_DIR / "top_titles_by_units_sold.csv", index=False) df_pricing.to_csv(PY_TAB_DIR / "pricing_decisions.csv", index=False) save_sales_figure(df_dashboard) save_sentiment_figure(df_sentiment) save_arima_figure(df_sales) save_kpis(df_sales, df_dashboard) return ( "Pipeline completed successfully.\n\n" f"Rows in reviews: {len(df_reviews):,}\n" f"Rows in sales: {len(df_sales):,}\n\n" f"Generated figures: {', '.join(artifacts_index()['python']['figures'])}\n" f"Generated tables: {', '.join(artifacts_index()['python']['tables'])}" ) except Exception as error: return f"Pipeline failed:\n{error}\n\n{traceback.format_exc()}" def load_kpis(): """Load KPI JSON if available.""" path = PY_FIG_DIR / "kpis.json" if path.exists(): return json.loads(path.read_text(encoding="utf-8")) return {} def render_kpi_cards(): """Render simple HTML KPI cards.""" kpis = load_kpis() if not kpis: return "