Spaces:
Runtime error
Runtime error
| # AI-Assisted Code — Academic Integrity Notice | |
| # Generated with The App Builder. ESCP coursework. | |
| # Student must be able to explain all code when asked. | |
| from pathlib import Path | |
| import json | |
| import traceback | |
| import random | |
| import warnings | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| from statsmodels.tsa.arima.model import ARIMA | |
| BASE_DIR = Path(__file__).resolve().parent | |
| ART_DIR = BASE_DIR / "artifacts" | |
| PY_FIG_DIR = ART_DIR / "py" / "figures" | |
| PY_TAB_DIR = ART_DIR / "py" / "tables" | |
| REVIEWS_CSV = BASE_DIR / "synthetic_book_reviews.csv" | |
| SALES_CSV = BASE_DIR / "synthetic_sales_data.csv" | |
| def ensure_dirs(): | |
| """Create artifact folders used by the dashboard.""" | |
| for path in [PY_FIG_DIR, PY_TAB_DIR]: | |
| path.mkdir(parents=True, exist_ok=True) | |
| def artifacts_index(): | |
| """Return the current list of generated files.""" | |
| def list_ext(folder: Path, exts: tuple[str, ...]): | |
| if not folder.exists(): | |
| return [] | |
| return sorted([p.name for p in folder.iterdir() if p.is_file() and p.suffix.lower() in exts]) | |
| return { | |
| "python": { | |
| "figures": list_ext(PY_FIG_DIR, (".png", ".jpg", ".jpeg")), | |
| "tables": list_ext(PY_TAB_DIR, (".csv", ".json")), | |
| } | |
| } | |
| def load_inputs(): | |
| """Load the uploaded input datasets.""" | |
| if not REVIEWS_CSV.exists(): | |
| raise FileNotFoundError("synthetic_book_reviews.csv not found.") | |
| if not SALES_CSV.exists(): | |
| raise FileNotFoundError("synthetic_sales_data.csv not found.") | |
| df_reviews = pd.read_csv(REVIEWS_CSV) | |
| df_sales = pd.read_csv(SALES_CSV) | |
| if "title" not in df_reviews.columns: | |
| raise ValueError("Reviews CSV must contain a 'title' column.") | |
| if "title" not in df_sales.columns or "month" not in df_sales.columns or "units_sold" not in df_sales.columns: | |
| raise ValueError("Sales CSV must contain 'title', 'month', and 'units_sold' columns.") | |
| return df_reviews, df_sales | |
| def add_review_text(df_reviews: pd.DataFrame) -> pd.DataFrame: | |
| """Recreate simple review text because the provided CSV has no text field.""" | |
| if "review_text" in df_reviews.columns: | |
| return df_reviews.copy() | |
| random.seed(42) | |
| templates = { | |
| "One": [ | |
| "Terrible quality and very disappointing.", | |
| "I would not recommend this book.", | |
| "Weak writing and poor value for money.", | |
| ], | |
| "Two": [ | |
| "Some good ideas, but overall underwhelming.", | |
| "Average read with a few frustrating parts.", | |
| "Not the worst, but I expected better.", | |
| ], | |
| "Three": [ | |
| "An okay book with mixed strengths and weaknesses.", | |
| "Decent overall and worth considering.", | |
| "Balanced experience, neither great nor bad.", | |
| ], | |
| "Four": [ | |
| "Enjoyable and well written.", | |
| "A strong book with a lot to like.", | |
| "Very good read and easy to recommend.", | |
| ], | |
| "Five": [ | |
| "Excellent book and highly recommended.", | |
| "Outstanding quality from start to finish.", | |
| "One of the best books in this sample.", | |
| ], | |
| } | |
| def build_text(row): | |
| rating = str(row.get("rating", "Three")).strip() | |
| choices = templates.get(rating, templates["Three"]) | |
| return random.choice(choices) | |
| out = df_reviews.copy() | |
| out["review_text"] = out.apply(build_text, axis=1) | |
| return out | |
| def classify_sentiment(df_reviews: pd.DataFrame) -> pd.DataFrame: | |
| """Score review text with VADER and map to labels.""" | |
| analyzer = SentimentIntensityAnalyzer() | |
| out = df_reviews.copy() | |
| def label_text(text: str): | |
| score = analyzer.polarity_scores(str(text))["compound"] | |
| if score >= 0.05: | |
| return "positive" | |
| if score <= -0.05: | |
| return "negative" | |
| return "neutral" | |
| out["sentiment_label"] = out["review_text"].apply(label_text) | |
| return out | |
| def build_dashboard_table(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame: | |
| """Create the monthly dashboard table.""" | |
| sales = df_sales.copy() | |
| sales["month"] = pd.to_datetime(sales["month"], errors="coerce") | |
| revenue_df = sales.merge(df_reviews[["title", "price"]].drop_duplicates(), on="title", how="left") | |
| revenue_df["estimated_revenue"] = revenue_df["units_sold"] * revenue_df["price"] | |
| dashboard = ( | |
| revenue_df.groupby("month", as_index=False) | |
| .agg( | |
| total_units_sold=("units_sold", "sum"), | |
| avg_units_sold=("units_sold", "mean"), | |
| total_revenue=("estimated_revenue", "sum"), | |
| ) | |
| .sort_values("month") | |
| ) | |
| return dashboard | |
| def build_top_titles(df_sales: pd.DataFrame) -> pd.DataFrame: | |
| """Aggregate total units sold by title.""" | |
| return ( | |
| df_sales.groupby("title", as_index=False)["units_sold"] | |
| .sum() | |
| .sort_values("units_sold", ascending=False) | |
| .rename(columns={"units_sold": "total_units_sold"}) | |
| ) | |
| def build_pricing_decisions(df_sales: pd.DataFrame, df_reviews: pd.DataFrame) -> pd.DataFrame: | |
| """Create a simple pricing recommendation table.""" | |
| merged = df_sales.merge(df_reviews[["title", "price", "rating"]].drop_duplicates(), on="title", how="left") | |
| agg = ( | |
| merged.groupby(["title", "price", "rating"], as_index=False)["units_sold"] | |
| .sum() | |
| .rename(columns={"units_sold": "total_units_sold"}) | |
| .sort_values("total_units_sold", ascending=False) | |
| ) | |
| def recommend(row): | |
| rating = str(row["rating"]) | |
| units = float(row["total_units_sold"]) | |
| if rating in {"Four", "Five"} and units > agg["total_units_sold"].median(): | |
| return "Consider premium pricing" | |
| if rating in {"One", "Two"}: | |
| return "Consider discounting" | |
| return "Keep current price" | |
| agg["pricing_decision"] = agg.apply(recommend, axis=1) | |
| return agg | |
| def save_sales_figure(df_dashboard: pd.DataFrame): | |
| """Save the monthly overview figure.""" | |
| plt.figure(figsize=(10, 5)) | |
| plt.plot(df_dashboard["month"], df_dashboard["total_units_sold"], marker="o") | |
| plt.title("Monthly Units Sold") | |
| plt.xlabel("Month") | |
| plt.ylabel("Units Sold") | |
| plt.xticks(rotation=45) | |
| plt.tight_layout() | |
| plt.savefig(PY_FIG_DIR / "sales_trends_sampled_titles.png", dpi=150) | |
| plt.close() | |
| def save_sentiment_figure(df_sentiment: pd.DataFrame): | |
| """Save the sentiment distribution figure.""" | |
| counts = ( | |
| df_sentiment.groupby(["title", "sentiment_label"]) | |
| .size() | |
| .unstack(fill_value=0) | |
| .reset_index() | |
| ) | |
| for col in ["negative", "neutral", "positive"]: | |
| if col not in counts.columns: | |
| counts[col] = 0 | |
| top = counts.head(20).copy() | |
| top = top.sort_values("positive", ascending=True) | |
| plt.figure(figsize=(11, 8)) | |
| plt.barh(top["title"], top["negative"], label="negative") | |
| plt.barh(top["title"], top["neutral"], left=top["negative"], label="neutral") | |
| plt.barh(top["title"], top["positive"], left=top["negative"] + top["neutral"], label="positive") | |
| plt.title("Sentiment Distribution by Book") | |
| plt.xlabel("Number of Reviews") | |
| plt.ylabel("Book Title") | |
| plt.legend() | |
| plt.tight_layout() | |
| plt.savefig(PY_FIG_DIR / "sentiment_distribution_sampled_titles.png", dpi=150) | |
| plt.close() | |
| counts.rename(columns={"title": "grouped_title"}).to_csv( | |
| PY_TAB_DIR / "sentiment_counts_sampled.csv", index=False | |
| ) | |
| def save_arima_figure(df_sales: pd.DataFrame): | |
| """Fit a lightweight ARIMA forecast for the top title and save a figure.""" | |
| top_title = ( | |
| df_sales.groupby("title")["units_sold"].sum().sort_values(ascending=False).index[0] | |
| ) | |
| title_sales = ( | |
| df_sales[df_sales["title"] == top_title] | |
| .copy() | |
| .sort_values("month") | |
| ) | |
| title_sales["month"] = pd.to_datetime(title_sales["month"], errors="coerce") | |
| title_sales = title_sales.dropna(subset=["month"]).set_index("month") | |
| if len(title_sales) < 6: | |
| return | |
| with warnings.catch_warnings(): | |
| warnings.simplefilter("ignore") | |
| model = ARIMA(title_sales["units_sold"], order=(1, 1, 1)) | |
| fitted = model.fit() | |
| forecast = fitted.forecast(steps=6) | |
| forecast_index = pd.date_range( | |
| start=title_sales.index.max() + pd.offsets.MonthBegin(1), | |
| periods=6, | |
| freq="MS", | |
| ) | |
| plt.figure(figsize=(10, 5)) | |
| plt.plot(title_sales.index, title_sales["units_sold"], marker="o", label="Observed") | |
| plt.plot(forecast_index, forecast.values, linestyle="--", marker="o", label="Forecast") | |
| plt.title(f"ARIMA Forecast for Top Title: {top_title}") | |
| plt.xlabel("Month") | |
| plt.ylabel("Units Sold") | |
| plt.legend() | |
| plt.xticks(rotation=45) | |
| plt.tight_layout() | |
| plt.savefig(PY_FIG_DIR / "arima_forecast_sampled_titles.png", dpi=150) | |
| plt.close() | |
| def save_kpis(df_sales: pd.DataFrame, df_dashboard: pd.DataFrame): | |
| """Save KPI summary JSON.""" | |
| payload = { | |
| "n_titles": int(df_sales["title"].nunique()), | |
| "n_months": int(df_dashboard["month"].nunique()), | |
| "total_units_sold": float(df_sales["units_sold"].sum()), | |
| } | |
| if "total_revenue" in df_dashboard.columns: | |
| payload["total_revenue"] = float(df_dashboard["total_revenue"].sum()) | |
| with open(PY_FIG_DIR / "kpis.json", "w", encoding="utf-8") as file: | |
| json.dump(payload, file, indent=2) | |
| def run_pipeline(): | |
| """Run the notebook logic as normal Python functions.""" | |
| try: | |
| ensure_dirs() | |
| df_reviews_raw, df_sales = load_inputs() | |
| df_reviews = add_review_text(df_reviews_raw) | |
| df_sentiment = classify_sentiment(df_reviews) | |
| df_dashboard = build_dashboard_table(df_sales, df_reviews) | |
| df_top = build_top_titles(df_sales) | |
| df_pricing = build_pricing_decisions(df_sales, df_reviews) | |
| df_dashboard.to_csv(PY_TAB_DIR / "df_dashboard.csv", index=False) | |
| df_top.to_csv(PY_TAB_DIR / "top_titles_by_units_sold.csv", index=False) | |
| df_pricing.to_csv(PY_TAB_DIR / "pricing_decisions.csv", index=False) | |
| save_sales_figure(df_dashboard) | |
| save_sentiment_figure(df_sentiment) | |
| save_arima_figure(df_sales) | |
| save_kpis(df_sales, df_dashboard) | |
| return ( | |
| "Pipeline completed successfully.\n\n" | |
| f"Rows in reviews: {len(df_reviews):,}\n" | |
| f"Rows in sales: {len(df_sales):,}\n\n" | |
| f"Generated figures: {', '.join(artifacts_index()['python']['figures'])}\n" | |
| f"Generated tables: {', '.join(artifacts_index()['python']['tables'])}" | |
| ) | |
| except Exception as error: | |
| return f"Pipeline failed:\n{error}\n\n{traceback.format_exc()}" | |
| def load_kpis(): | |
| """Load KPI JSON if available.""" | |
| path = PY_FIG_DIR / "kpis.json" | |
| if path.exists(): | |
| return json.loads(path.read_text(encoding="utf-8")) | |
| return {} | |
| def render_kpi_cards(): | |
| """Render simple HTML KPI cards.""" | |
| kpis = load_kpis() | |
| if not kpis: | |
| return "<div style='padding:18px;border:1px solid #ddd;border-radius:12px;'>Run the pipeline to generate KPIs.</div>" | |
| html = "<div style='display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;'>" | |
| for key, value in kpis.items(): | |
| label = key.replace("_", " ").title() | |
| if isinstance(value, (int, float)): | |
| value = f"{value:,.0f}" | |
| html += ( | |
| "<div style='padding:16px;border-radius:14px;border:1px solid #ddd;background:white;'>" | |
| f"<div style='font-size:12px;color:#666;margin-bottom:6px;'>{label}</div>" | |
| f"<div style='font-size:24px;font-weight:700;'>{value}</div>" | |
| "</div>" | |
| ) | |
| html += "</div>" | |
| return html | |
| def _load_table(path: Path): | |
| """Load CSV or JSON into a DataFrame preview.""" | |
| if not path.exists(): | |
| return pd.DataFrame([{"error": f"Missing file: {path.name}"}]) | |
| if path.suffix == ".json": | |
| obj = json.loads(path.read_text(encoding="utf-8")) | |
| if isinstance(obj, dict): | |
| return pd.DataFrame([obj]) | |
| return pd.DataFrame(obj) | |
| return pd.read_csv(path) | |
| def build_sales_chart(): | |
| """Interactive plotly version of the monthly overview.""" | |
| path = PY_TAB_DIR / "df_dashboard.csv" | |
| if not path.exists(): | |
| return go.Figure().update_layout(title="Run the pipeline first") | |
| df = pd.read_csv(path) | |
| df["month"] = pd.to_datetime(df["month"], errors="coerce") | |
| fig = go.Figure() | |
| for col in [c for c in df.columns if c != "month"]: | |
| fig.add_trace(go.Scatter(x=df["month"], y=df[col], mode="lines+markers", name=col)) | |
| fig.update_layout(title="Monthly Overview", template="plotly_white", height=420) | |
| return fig | |
| def build_sentiment_chart(): | |
| """Interactive stacked chart for sentiment counts.""" | |
| path = PY_TAB_DIR / "sentiment_counts_sampled.csv" | |
| if not path.exists(): | |
| return go.Figure().update_layout(title="Run the pipeline first") | |
| df = pd.read_csv(path).head(20) | |
| fig = go.Figure() | |
| for col in ["negative", "neutral", "positive"]: | |
| if col in df.columns: | |
| fig.add_trace(go.Bar(y=df["grouped_title"], x=df[col], orientation="h", name=col)) | |
| fig.update_layout(title="Sentiment Distribution", barmode="stack", template="plotly_white", height=500) | |
| return fig | |
| def build_top_chart(): | |
| """Interactive top sellers chart.""" | |
| path = PY_TAB_DIR / "top_titles_by_units_sold.csv" | |
| if not path.exists(): | |
| return go.Figure().update_layout(title="Run the pipeline first") | |
| df = pd.read_csv(path).head(15) | |
| title_col = "title" if "title" in df.columns else df.columns[0] | |
| val_col = "total_units_sold" if "total_units_sold" in df.columns else df.columns[-1] | |
| fig = go.Figure(go.Bar(y=df[title_col], x=df[val_col], orientation="h")) | |
| fig.update_layout(title="Top Sellers", template="plotly_white", height=500) | |
| return fig | |
| def refresh_gallery(): | |
| """Refresh static figures and the first available table.""" | |
| figures = [] | |
| for path in sorted(PY_FIG_DIR.glob("*.png")): | |
| figures.append((str(path), path.stem.replace("_", " ").title())) | |
| table_choices = artifacts_index()["python"]["tables"] | |
| first_df = pd.DataFrame() | |
| first_choice = None | |
| if table_choices: | |
| first_choice = table_choices[0] | |
| first_df = _load_table(PY_TAB_DIR / first_choice) | |
| return figures, gr.update(choices=table_choices, value=first_choice), first_df | |
| def on_table_select(choice): | |
| """Load the selected table preview.""" | |
| if not choice: | |
| return pd.DataFrame([{"hint": "Select a table above."}]) | |
| return _load_table(PY_TAB_DIR / choice) | |
| def refresh_dashboard(): | |
| """Refresh KPI cards and all interactive charts.""" | |
| return render_kpi_cards(), build_sales_chart(), build_sentiment_chart(), build_top_chart() | |
| def ai_reply(question: str): | |
| """Simple keyword routing for the AI tab.""" | |
| msg = (question or "").lower() | |
| kpis = load_kpis() | |
| summary = "" | |
| if kpis: | |
| summary = ( | |
| f"We currently have {int(kpis.get('n_titles', 0))} titles and " | |
| f"{int(kpis.get('total_units_sold', 0)):,} total units sold." | |
| ) | |
| if any(word in msg for word in ["sentiment", "review", "positive", "negative"]): | |
| return f"Here is the sentiment view. {summary}", build_sentiment_chart(), None | |
| if any(word in msg for word in ["top", "seller", "best", "popular"]): | |
| return f"Here are the top sellers. {summary}", None, _load_table(PY_TAB_DIR / "top_titles_by_units_sold.csv") | |
| if any(word in msg for word in ["price", "pricing"]): | |
| return f"Here are the pricing decisions. {summary}", None, _load_table(PY_TAB_DIR / "pricing_decisions.csv") | |
| return f"Here is the monthly overview. {summary}", build_sales_chart(), None | |
| def ai_chat(user_msg, history): | |
| """Chat wrapper compatible with Gradio Chatbot.""" | |
| if not user_msg: | |
| return history, "", None, None | |
| reply, chart, table = ai_reply(user_msg) | |
| history = (history or []) + [{"role": "user", "content": user_msg}, {"role": "assistant", "content": reply}] | |
| return history, "", chart, table | |
| def load_css(): | |
| """Load optional CSS file.""" | |
| path = BASE_DIR / "style.css" | |
| return path.read_text(encoding="utf-8") if path.exists() else "" | |
| ensure_dirs() | |
| with gr.Blocks(title="AIBDM 2026 Workshop App") as demo: | |
| gr.Markdown( | |
| "# SE21 App Template\n" | |
| "*Notebook converted into a Hugging Face Space*", | |
| elem_id="escp_title", | |
| ) | |
| with gr.Tab("Pipeline Runner"): | |
| gr.Markdown("Run the full notebook logic with the uploaded CSV files.") | |
| run_btn = gr.Button("Run Full Pipeline", variant="primary") | |
| run_log = gr.Textbox(label="Execution Log", lines=18, max_lines=30, interactive=False) | |
| run_btn.click(run_pipeline, outputs=[run_log]) | |
| with gr.Tab("Dashboard"): | |
| kpi_html = gr.HTML(value=render_kpi_cards()) | |
| refresh_btn = gr.Button("Refresh Dashboard", variant="primary") | |
| gr.Markdown("#### Interactive Charts") | |
| chart_sales = gr.Plot(label="Monthly Overview") | |
| chart_sentiment = gr.Plot(label="Sentiment Distribution") | |
| chart_top = gr.Plot(label="Top Sellers") | |
| gr.Markdown("#### Static Figures") | |
| gallery = gr.Gallery(label="Generated Figures", columns=2, height=480, object_fit="contain") | |
| gr.Markdown("#### Data Tables") | |
| table_dropdown = gr.Dropdown(label="Select a table to view", choices=[], interactive=True) | |
| table_display = gr.Dataframe(label="Table Preview", interactive=False) | |
| def _on_refresh(): | |
| kpi, c1, c2, c3 = refresh_dashboard() | |
| figs, dd, df = refresh_gallery() | |
| return kpi, c1, c2, c3, figs, dd, df | |
| refresh_btn.click( | |
| _on_refresh, | |
| outputs=[kpi_html, chart_sales, chart_sentiment, chart_top, gallery, table_dropdown, table_display], | |
| ) | |
| table_dropdown.change(on_table_select, inputs=[table_dropdown], outputs=[table_display]) | |
| with gr.Tab('"AI" Dashboard'): | |
| gr.Markdown( | |
| "### Ask questions about your data\n" | |
| "This version uses built-in keyword routing, so it works without API keys." | |
| ) | |
| with gr.Row(equal_height=True): | |
| with gr.Column(scale=1): | |
| chatbot = gr.Chatbot(label="Conversation", type="messages", height=380) | |
| user_input = gr.Textbox( | |
| label="Ask about your data", | |
| placeholder="e.g. Show me sales trends / top sellers / sentiment", | |
| lines=1, | |
| ) | |
| with gr.Column(scale=1): | |
| ai_figure = gr.Plot(label="Interactive Chart") | |
| ai_table = gr.Dataframe(label="Data Table", interactive=False) | |
| user_input.submit( | |
| ai_chat, | |
| inputs=[user_input, chatbot], | |
| outputs=[chatbot, user_input, ai_figure, ai_table], | |
| ) | |
| demo.launch(css=load_css()) | |