Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- app.py +280 -0
- datacreation.ipynb +1086 -0
- pythonanalysis.ipynb +0 -0
- scraped_motivational_quotes.csv +20 -0
- scraped_quotes.csv +51 -0
- student_dataset.csv +0 -0
- student_dataset_enriched.csv +0 -0
- synthetic_student_reviews.csv +0 -0
app.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import matplotlib
|
| 6 |
+
matplotlib.use("Agg")
|
| 7 |
+
import seaborn as sns
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 10 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 11 |
+
from sklearn.model_selection import train_test_split
|
| 12 |
+
from sklearn.metrics import accuracy_score
|
| 13 |
+
import warnings
|
| 14 |
+
warnings.filterwarnings("ignore")
|
| 15 |
+
|
| 16 |
+
# ─── Load data ───────────────────────────────────────────────
|
| 17 |
+
# Both CSVs are optional at startup: if either cannot be read, the app still
# launches and each tab reports that the data is unavailable.
df = None
df_reviews = None
try:
    df = pd.read_csv("student_dataset_enriched.csv")
    df_reviews = pd.read_csv("synthetic_student_reviews.csv")
    DATA_LOADED = True
except Exception as exc:  # startup boundary: report and degrade, do not crash
    # ``Exception`` (unlike a bare ``except:``) lets KeyboardInterrupt and
    # SystemExit propagate, and the message records why loading failed.
    print(f"Data not loaded: {type(exc).__name__}: {exc}")
    # Drop a half-loaded state so both frames are consistently unavailable.
    df = None
    df_reviews = None
    DATA_LOADED = False
|
| 23 |
+
|
| 24 |
+
analyzer = SentimentIntensityAnalyzer()
|
| 25 |
+
|
| 26 |
+
# ─── Train RF model once at startup ──────────────────────────
|
| 27 |
+
def train_model():
    """Fit the high-risk classifier on the enriched student dataset.

    Returns:
        ``None`` when the dataset was not loaded. Otherwise a tuple
        ``(clf, features, acc)``: the fitted ``RandomForestClassifier``, the
        list of feature column names it was trained on, and its accuracy on
        the held-out 20% test split.
    """
    if not DATA_LOADED:
        return None

    features = ["hours_studied", "attendance", "sleep_hours",
                "previous_scores", "motivation_level",
                "tutoring_sessions", "stress_score", "focus_level"]

    # Build the binary target as a standalone Series instead of writing a
    # ``risk_label`` column into the shared module-level frame, so training
    # does not change what other readers of ``df`` see.
    y = (df["risk_level"] == "high").astype(int).rename("risk_label")
    X = df[features]

    # Fixed seeds keep the split and the forest reproducible across restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2025)
    clf = RandomForestClassifier(n_estimators=100, random_state=2025)
    clf.fit(X_train, y_train)
    acc = accuracy_score(y_test, clf.predict(X_test))
    return clf, features, acc
|
| 41 |
+
|
| 42 |
+
MODEL_DATA = train_model()
|
| 43 |
+
|
| 44 |
+
# ══════════════════════════════════════════════════════════════
|
| 45 |
+
# TAB 1 — Student Risk Analyser
|
| 46 |
+
# ══════════════════════════════════════════════════════════════
|
| 47 |
+
def analyse_student(hours, attendance, sleep, prev_score,
                    motivation, tutoring, comment, days_exam, difficulty):
    """Build the Markdown report shown in the Student Analyser tab.

    Args:
        hours: Hours studied per week.
        attendance: Attendance percentage (accepted but not used here).
        sleep: Sleep hours per night.
        prev_score: Previous score.
        motivation: Motivation level (0, 1 or 2 in the UI).
        tutoring: Tutoring sessions per week (accepted but not used here).
        comment: Free-text student comment, scored with VADER.
        days_exam: Days until the exam.
        difficulty: Exam difficulty.

    Returns:
        A Markdown string with the derived indicators, risk level, study
        priority, comment sentiment and a bullet list of study tips.
    """
    # Derived indicators
    stress = (10 - sleep) + (2 - motivation)
    focus = motivation * (sleep / 10)
    overloaded = stress > 7 and hours < 15
    burnout = int(overloaded)

    # Risk level: a weak previous score or the burnout pattern flags the student.
    risk = "🔴 HIGH RISK" if (prev_score < 60 or overloaded) else "🟢 LOW RISK"

    # Priority: sum of exam proximity, difficulty, stress and sleep points.
    urgency = (
        (3 if days_exam <= 1 else 2 if days_exam <= 3 else 1 if days_exam <= 7 else 0)
        + (2 if difficulty >= 4 else 1 if difficulty >= 3 else 0)
        + (2 if stress > 8 else 1 if stress > 5 else 0)
        + (1 if sleep < 5 else 0)
    )
    for floor, tag in ((6, "🚨 CRITICAL"), (4, "🔶 HIGH"), (2, "🔷 MEDIUM")):
        if urgency >= floor:
            priority = tag
            break
    else:
        priority = "🟩 LOW"

    # Sentiment of the free-text comment
    score = analyzer.polarity_scores(comment)["compound"]
    if score <= -0.05:
        sentiment = "😟 Negative"
    elif score >= 0.05:
        sentiment = "😊 Positive"
    else:
        sentiment = "😐 Neutral"

    # Study tips: keep every tip whose trigger holds, in this fixed order.
    candidate_tips = (
        (sleep < 6,
         "💤 Prioritise sleep — aim for at least 7h to improve focus and memory consolidation."),
        (hours < 15,
         "📚 Increase daily study time gradually — start with +1h per day using the Pomodoro technique."),
        (stress > 7,
         "🧘 Practice stress management: 5-min breathing exercises before each study session."),
        (burnout,
         "⚠️ Burnout risk detected — take short breaks every 45 min and avoid all-nighters."),
        (motivation < 1,
         "🎯 Set small daily goals and reward yourself when you achieve them."),
    )
    tips = [text for applies, text in candidate_tips if applies]
    if not tips:
        tips.append("✅ You're on track! Keep your current routine and review key concepts regularly.")

    burnout_text = "⚠️ Yes" if burnout else "✅ No"
    tip_lines = "\n".join(f"- {t}" for t in tips)
    report = f"""
## 📊 Student Analysis Report

| Indicator | Value |
|-----------|-------|
| Stress Score | {stress:.1f} / 12 |
| Focus Level | {focus:.2f} |
| Burnout Risk | {burnout_text} |

## 🎯 Risk Level: {risk}
## ⏱️ Study Priority: {priority}
## 💬 Comment Sentiment: {sentiment} (score: {score:.2f})

## 💡 Study Tips:
"""
    return report + tip_lines
|
| 113 |
+
|
| 114 |
+
# ══════════════════════════════════════════════════════════════
|
| 115 |
+
# TAB 2 — Data Visualisation
|
| 116 |
+
# ══════════════════════════════════════════════════════════════
|
| 117 |
+
def _vader_label(text):
    """Map one comment to "positive", "negative" or "neutral".

    The VADER compound score is computed once per comment and compared
    against the +/-0.05 cut-offs.
    """
    compound = analyzer.polarity_scores(str(text))["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"


def plot_chart(chart_type):
    """Render the chart selected in the Data Visualisation tab.

    Args:
        chart_type: One of the dropdown labels ("Study Hours vs Final Score",
            "Stress Score by Risk Level", "Correlation Heatmap",
            "Sentiment Distribution", "Feature Importance (Random Forest)").
            Any other value produces empty axes.

    Returns:
        A matplotlib ``Figure``. It is removed from pyplot's figure registry
        before being returned, so repeated clicks do not accumulate open
        figures in the server process; the object itself can still be
        rendered or saved.
    """
    if not DATA_LOADED:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "Data not loaded", ha="center")
        plt.close(fig)
        return fig

    # Apply the theme BEFORE creating the axes: style settings are read when
    # the axes are built, so setting it afterwards leaves the first chart
    # unthemed.
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(9, 5))

    try:
        # Work on ``fig``/``ax`` directly rather than on pyplot's implicit
        # "current figure", which is shared process-wide state.
        if chart_type == "Study Hours vs Final Score":
            sc = ax.scatter(df["hours_studied"], df["final_exam_score"],
                            c=df["stress_score"], cmap="RdYlGn_r", alpha=0.6)
            fig.colorbar(sc, ax=ax, label="Stress Score")
            ax.set_xlabel("Hours Studied")
            ax.set_ylabel("Final Exam Score")
            ax.set_title("Study Hours vs Final Score")

        elif chart_type == "Stress Score by Risk Level":
            df.boxplot(column="stress_score", by="risk_level", ax=ax,
                       patch_artist=True,
                       boxprops=dict(facecolor="steelblue", color="navy"),
                       medianprops=dict(color="white", linewidth=2))
            ax.set_title("Stress Score by Risk Level")
            # Clear the automatic "grouped by" super-title pandas adds.
            fig.suptitle("")

        elif chart_type == "Correlation Heatmap":
            cols = ["hours_studied", "attendance", "sleep_hours",
                    "previous_scores", "stress_score", "focus_level", "final_exam_score"]
            corr = df[cols].corr()
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
            ax.set_title("Correlation Heatmap")

        elif chart_type == "Sentiment Distribution":
            if "vader_label" not in df_reviews.columns:
                # Label once and keep the column on the frame so later
                # requests reuse it instead of re-scoring every comment.
                df_reviews["vader_label"] = df_reviews["student_comment"].apply(_vader_label)
            counts = df_reviews["vader_label"].value_counts()
            palette = {"positive": "mediumseagreen", "neutral": "gold",
                       "negative": "tomato"}
            colours = [palette.get(label, "gray") for label in counts.index]
            counts.plot(kind="bar", ax=ax, color=colours, edgecolor="white")
            ax.set_title("Sentiment Distribution of Student Comments")
            ax.tick_params(axis="x", rotation=0)

        elif chart_type == "Feature Importance (Random Forest)":
            if MODEL_DATA:
                clf, features, acc = MODEL_DATA
                imp = pd.Series(clf.feature_importances_, index=features).sort_values()
                imp.plot(kind="barh", ax=ax, color="steelblue")
                ax.set_title(f"Random Forest Feature Importance (Accuracy: {acc:.0%})")

        fig.tight_layout()
    finally:
        # Always deregister the figure from pyplot, even if plotting raised.
        plt.close(fig)
    return fig
|
| 172 |
+
|
| 173 |
+
# ══════════════════════════════════════════════════════════════
|
| 174 |
+
# TAB 3 — Sentiment Analyser
|
| 175 |
+
# ══════════════════════════════════════════════════════════════
|
| 176 |
+
def analyse_sentiment(text):
    """Score a single comment with VADER and format the result as Markdown.

    Args:
        text: The comment to analyse. ``None``, an empty string or a
            whitespace-only string is treated as "nothing entered".

    Returns:
        ``"Please enter a comment."`` when there is nothing to analyse,
        otherwise a Markdown snippet with the sentiment label, the compound
        score, the pos/neu/neg scores and a one-line interpretation.
    """
    # ``not text`` also covers None, which would otherwise raise on .strip().
    if not text or not text.strip():
        return "Please enter a comment."

    scores = analyzer.polarity_scores(text)
    compound = scores["compound"]

    # Decide label and interpretation together so the two can never disagree
    # about which side of the +/-0.05 thresholds the score falls on.
    if compound >= 0.05:
        label = "😊 Positive"
        interpretation = 'This student appears confident and motivated.'
    elif compound <= -0.05:
        label = "😟 Negative"
        interpretation = 'This student may be struggling — consider flagging for support.'
    else:
        label = "😐 Neutral"
        interpretation = 'This student has a neutral outlook — monitor progress.'

    return f"""
**Sentiment:** {label}
**Compound score:** {compound:.3f}
**Positive:** {scores['pos']:.3f} | **Neutral:** {scores['neu']:.3f} | **Negative:** {scores['neg']:.3f}

*Interpretation:* {interpretation}
"""
|
| 191 |
+
|
| 192 |
+
# ══════════════════════════════════════════════════════════════
|
| 193 |
+
# TAB 4 — Dataset Explorer
|
| 194 |
+
# ══════════════════════════════════════════════════════════════
|
| 195 |
+
def explore_data():
    """Produce the two Markdown panels of the Dataset Explorer tab.

    Returns:
        A pair ``(summary, sample)``: summary statistics of the dataset
        rounded to two decimals, and its first ten rows, each as a Markdown
        table under a heading. When the dataset was not loaded the pair is
        ``("Dataset not loaded.", "")``.
    """
    if DATA_LOADED:
        stats_table = df.describe().round(2).to_markdown()
        head_table = df.head(10).to_markdown()
        return ("### Summary Statistics\n" + stats_table,
                "### First 10 rows\n" + head_table)
    return "Dataset not loaded.", ""
|
| 201 |
+
|
| 202 |
+
# ══════════════════════════════════════════════════════════════
|
| 203 |
+
# BUILD THE APP
|
| 204 |
+
# ══════════════════════════════════════════════════════════════
|
| 205 |
+
with gr.Blocks(title="Smart Study Planner", theme=gr.themes.Soft()) as demo:
|
| 206 |
+
|
| 207 |
+
gr.Markdown("""
|
| 208 |
+
# 🎓 Smart Study Planner
|
| 209 |
+
**AI for Big Data Management — Group C2**
|
| 210 |
+
*Rosamaria Guadalupi · Sasha Nouaille · Alexia Beraud Valero · Aurora Vimercati · Luna Mariah Gallina*
|
| 211 |
+
---
|
| 212 |
+
""")
|
| 213 |
+
|
| 214 |
+
with gr.Tabs():
|
| 215 |
+
|
| 216 |
+
# ── TAB 1 ─────────────────────────────────────────────
|
| 217 |
+
with gr.TabItem("🔍 Student Analyser"):
|
| 218 |
+
gr.Markdown("### Enter a student profile to get a risk assessment and study recommendations.")
|
| 219 |
+
with gr.Row():
|
| 220 |
+
with gr.Column():
|
| 221 |
+
hours = gr.Slider(0, 40, value=20, label="Hours Studied per Week")
|
| 222 |
+
attendance = gr.Slider(0, 100, value=80, label="Attendance (%)")
|
| 223 |
+
sleep = gr.Slider(3, 12, value=7, step=0.5, label="Sleep Hours per Night")
|
| 224 |
+
prev_score = gr.Slider(0, 100, value=65, label="Previous Scores")
|
| 225 |
+
motivation = gr.Slider(0, 2, value=1, step=1,
|
| 226 |
+
label="Motivation Level (0=Low, 1=Medium, 2=High)")
|
| 227 |
+
tutoring = gr.Slider(0, 5, value=1, step=1, label="Tutoring Sessions per Week")
|
| 228 |
+
with gr.Column():
|
| 229 |
+
comment = gr.Textbox(label="Student Comment",
|
| 230 |
+
placeholder="e.g. I feel overwhelmed this week...",
|
| 231 |
+
lines=3)
|
| 232 |
+
days_exam = gr.Slider(1, 30, value=5, step=1, label="Days Until Exam")
|
| 233 |
+
difficulty = gr.Slider(1, 5, value=3, step=1, label="Exam Difficulty (1-5)")
|
| 234 |
+
btn1 = gr.Button("🔍 Analyse Student", variant="primary")
|
| 235 |
+
|
| 236 |
+
output1 = gr.Markdown()
|
| 237 |
+
btn1.click(analyse_student,
|
| 238 |
+
inputs=[hours, attendance, sleep, prev_score,
|
| 239 |
+
motivation, tutoring, comment, days_exam, difficulty],
|
| 240 |
+
outputs=output1)
|
| 241 |
+
|
| 242 |
+
# ── TAB 2 ─────────────────────────────────────────────
|
| 243 |
+
with gr.TabItem("📊 Data Visualisation"):
|
| 244 |
+
gr.Markdown("### Explore the student dataset through interactive charts.")
|
| 245 |
+
chart_choice = gr.Dropdown(
|
| 246 |
+
choices=["Study Hours vs Final Score",
|
| 247 |
+
"Stress Score by Risk Level",
|
| 248 |
+
"Correlation Heatmap",
|
| 249 |
+
"Sentiment Distribution",
|
| 250 |
+
"Feature Importance (Random Forest)"],
|
| 251 |
+
value="Study Hours vs Final Score",
|
| 252 |
+
label="Select Chart")
|
| 253 |
+
btn2 = gr.Button("📊 Generate Chart", variant="primary")
|
| 254 |
+
plot = gr.Plot()
|
| 255 |
+
btn2.click(plot_chart, inputs=chart_choice, outputs=plot)
|
| 256 |
+
|
| 257 |
+
# ── TAB 3 ─────────────────────────────────────────────
|
| 258 |
+
with gr.TabItem("💬 Sentiment Analyser"):
|
| 259 |
+
gr.Markdown("### Analyse the sentiment of any student comment using VADER.")
|
| 260 |
+
comment_input = gr.Textbox(label="Enter a student comment",
|
| 261 |
+
placeholder="e.g. I can't focus and I'm exhausted...",
|
| 262 |
+
lines=4)
|
| 263 |
+
btn3 = gr.Button("💬 Analyse Sentiment", variant="primary")
|
| 264 |
+
output3 = gr.Markdown()
|
| 265 |
+
btn3.click(analyse_sentiment, inputs=comment_input, outputs=output3)
|
| 266 |
+
|
| 267 |
+
# ── TAB 4 ─────────────────────────────────────────────
|
| 268 |
+
with gr.TabItem("📂 Dataset Explorer"):
|
| 269 |
+
gr.Markdown("### Explore the enriched student dataset.")
|
| 270 |
+
btn4 = gr.Button("📂 Load Dataset", variant="primary")
|
| 271 |
+
stats = gr.Markdown()
|
| 272 |
+
sample = gr.Markdown()
|
| 273 |
+
btn4.click(explore_data, outputs=[stats, sample])
|
| 274 |
+
|
| 275 |
+
gr.Markdown("""
|
| 276 |
+
---
|
| 277 |
+
*Smart Study Planner · AI for Big Data Management · ESCP Business School*
|
| 278 |
+
""")
|
| 279 |
+
|
| 280 |
+
# Start the server only when this file is executed as a script, so that
# importing the module (tests, reload tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()
|
datacreation.ipynb
ADDED
|
@@ -0,0 +1,1086 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "4ba6aba8"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"# 🤖 **Data Collection, Creation, Storage, and Processing**\n"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "markdown",
|
| 14 |
+
"metadata": {
|
| 15 |
+
"id": "jpASMyIQMaAq"
|
| 16 |
+
},
|
| 17 |
+
"source": [
|
| 18 |
+
"## **1.** 📦 Install required packages"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": null,
|
| 24 |
+
"metadata": {
|
| 25 |
+
"colab": {
|
| 26 |
+
"base_uri": "https://localhost:8080/"
|
| 27 |
+
},
|
| 28 |
+
"id": "f48c8f8c",
|
| 29 |
+
"outputId": "97ebbe50-6c81-43ef-e739-9bf54ef48d1f"
|
| 30 |
+
},
|
| 31 |
+
"outputs": [
|
| 32 |
+
{
|
| 33 |
+
"output_type": "stream",
|
| 34 |
+
"name": "stdout",
|
| 35 |
+
"text": [
|
| 36 |
+
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.12/dist-packages (4.13.5)\n",
|
| 37 |
+
"Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.2.2)\n",
|
| 38 |
+
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)\n",
|
| 39 |
+
"Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)\n",
|
| 40 |
+
"Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.0.2)\n",
|
| 41 |
+
"Requirement already satisfied: textblob in /usr/local/lib/python3.12/dist-packages (0.19.0)\n",
|
| 42 |
+
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (2.8.3)\n",
|
| 43 |
+
"Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.12/dist-packages (from beautifulsoup4) (4.15.0)\n",
|
| 44 |
+
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)\n",
|
| 45 |
+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)\n",
|
| 46 |
+
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)\n",
|
| 47 |
+
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)\n",
|
| 48 |
+
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)\n",
|
| 49 |
+
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)\n",
|
| 50 |
+
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)\n",
|
| 51 |
+
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.1)\n",
|
| 52 |
+
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)\n",
|
| 53 |
+
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)\n",
|
| 54 |
+
"Requirement already satisfied: nltk>=3.9 in /usr/local/lib/python3.12/dist-packages (from textblob) (3.9.1)\n",
|
| 55 |
+
"Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (8.3.3)\n",
|
| 56 |
+
"Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (1.5.3)\n",
|
| 57 |
+
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (2025.11.3)\n",
|
| 58 |
+
"Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk>=3.9->textblob) (4.67.3)\n",
|
| 59 |
+
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
|
| 60 |
+
]
|
| 61 |
+
}
|
| 62 |
+
],
|
| 63 |
+
"source": [
|
| 64 |
+
"!pip install beautifulsoup4 pandas matplotlib seaborn numpy textblob"
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"cell_type": "markdown",
|
| 69 |
+
"metadata": {
|
| 70 |
+
"id": "lquNYCbfL9IM"
|
| 71 |
+
},
|
| 72 |
+
"source": [
|
| 73 |
+
"## **2.** ⛏ Web-scrape all book titles, prices, and ratings from books.toscrape.com"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "markdown",
|
| 78 |
+
"metadata": {
|
| 79 |
+
"id": "0IWuNpxxYDJF"
|
| 80 |
+
},
|
| 81 |
+
"source": [
|
| 82 |
+
"### *a. Initial setup*\n",
|
| 83 |
+
"Define the base url of the website you will scrape as well as how and what you will scrape"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"cell_type": "code",
|
| 88 |
+
"execution_count": null,
|
| 89 |
+
"metadata": {
|
| 90 |
+
"id": "91d52125"
|
| 91 |
+
},
|
| 92 |
+
"outputs": [],
|
| 93 |
+
"source": [
|
| 94 |
+
"import requests\n",
|
| 95 |
+
"from bs4 import BeautifulSoup\n",
|
| 96 |
+
"import pandas as pd\n",
|
| 97 |
+
"import time\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"base_url = \"https://books.toscrape.com/catalogue/page-{}.html\"\n",
|
| 100 |
+
"headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
|
| 101 |
+
"\n",
|
| 102 |
+
"titles, prices, ratings = [], [], []"
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"cell_type": "markdown",
|
| 107 |
+
"metadata": {
|
| 108 |
+
"id": "oCdTsin2Yfp3"
|
| 109 |
+
},
|
| 110 |
+
"source": [
|
| 111 |
+
"### *b. Fill titles, prices, and ratings from the web pages*"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "code",
|
| 116 |
+
"execution_count": null,
|
| 117 |
+
"metadata": {
|
| 118 |
+
"id": "xqO5Y3dnYhxt"
|
| 119 |
+
},
|
| 120 |
+
"outputs": [],
|
| 121 |
+
"source": [
|
| 122 |
+
"# Loop through all 50 pages\n",
|
| 123 |
+
"for page in range(1, 51):\n",
|
| 124 |
+
" url = base_url.format(page)\n",
|
| 125 |
+
" response = requests.get(url, headers=headers)\n",
|
| 126 |
+
" soup = BeautifulSoup(response.content, \"html.parser\")\n",
|
| 127 |
+
" books = soup.find_all(\"article\", class_=\"product_pod\")\n",
|
| 128 |
+
"\n",
|
| 129 |
+
" for book in books:\n",
|
| 130 |
+
" titles.append(book.h3.a[\"title\"])\n",
|
| 131 |
+
" prices.append(float(book.find(\"p\", class_=\"price_color\").text[1:]))\n",
|
| 132 |
+
" ratings.append(book.p.get(\"class\")[1])\n",
|
| 133 |
+
"\n",
|
| 134 |
+
" time.sleep(0.5) # polite scraping delay"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "markdown",
|
| 139 |
+
"metadata": {
|
| 140 |
+
"id": "T0TOeRC4Yrnn"
|
| 141 |
+
},
|
| 142 |
+
"source": [
|
| 143 |
+
"### *c. ✋🏻🛑⛔️ Create a dataframe df_books that contains the now complete \"title\", \"price\", and \"rating\" objects*"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "code",
|
| 148 |
+
"execution_count": null,
|
| 149 |
+
"metadata": {
|
| 150 |
+
"id": "l5FkkNhUYTHh"
|
| 151 |
+
},
|
| 152 |
+
"outputs": [],
|
| 153 |
+
"source": [
|
| 154 |
+
"# 🗂️ Create DataFrame\n",
|
| 155 |
+
"df_books = pd.DataFrame({\n",
|
| 156 |
+
" \"title\": titles,\n",
|
| 157 |
+
" \"price\": prices,\n",
|
| 158 |
+
" \"rating\": ratings\n",
|
| 159 |
+
"})"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"cell_type": "markdown",
|
| 164 |
+
"metadata": {
|
| 165 |
+
"id": "duI5dv3CZYvF"
|
| 166 |
+
},
|
| 167 |
+
"source": [
|
| 168 |
+
"### *d. Save web-scraped dataframe either as a CSV or Excel file*"
|
| 169 |
+
]
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"cell_type": "code",
|
| 173 |
+
"execution_count": null,
|
| 174 |
+
"metadata": {
|
| 175 |
+
"id": "lC1U_YHtZifh"
|
| 176 |
+
},
|
| 177 |
+
"outputs": [],
|
| 178 |
+
"source": [
|
| 179 |
+
"# 💾 Save to CSV\n",
|
| 180 |
+
"df_books.to_csv(\"books_data.csv\", index=False)\n",
|
| 181 |
+
"\n",
|
| 182 |
+
"# 💾 Or save to Excel\n",
|
| 183 |
+
"# df_books.to_excel(\"books_data.xlsx\", index=False)"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"cell_type": "markdown",
|
| 188 |
+
"metadata": {
|
| 189 |
+
"id": "qMjRKMBQZlJi"
|
| 190 |
+
},
|
| 191 |
+
"source": [
|
| 192 |
+
"### *e. ✋🏻🛑⛔️ View first few lines*"
|
| 193 |
+
]
|
| 194 |
+
},
|
| 195 |
+
{
|
| 196 |
+
"cell_type": "code",
|
| 197 |
+
"execution_count": null,
|
| 198 |
+
"metadata": {
|
| 199 |
+
"colab": {
|
| 200 |
+
"base_uri": "https://localhost:8080/"
|
| 201 |
+
},
|
| 202 |
+
"id": "O_wIvTxYZqCK",
|
| 203 |
+
"outputId": "df1bdc12-9805-4689-d5f3-528e991a48f1"
|
| 204 |
+
},
|
| 205 |
+
"outputs": [
|
| 206 |
+
{
|
| 207 |
+
"output_type": "execute_result",
|
| 208 |
+
"data": {
|
| 209 |
+
"text/plain": [
|
| 210 |
+
" title price rating\n",
|
| 211 |
+
"0 A Light in the Attic 51.77 Three\n",
|
| 212 |
+
"1 Tipping the Velvet 53.74 One\n",
|
| 213 |
+
"2 Soumission 50.10 One\n",
|
| 214 |
+
"3 Sharp Objects 47.82 Four\n",
|
| 215 |
+
"4 Sapiens: A Brief History of Humankind 54.23 Five"
|
| 216 |
+
],
|
| 217 |
+
"text/html": [
|
| 218 |
+
"\n",
|
| 219 |
+
" <div id=\"df-2ebb90b2-8344-410e-9d8b-174fef06d37d\" class=\"colab-df-container\">\n",
|
| 220 |
+
" <div>\n",
|
| 221 |
+
"<style scoped>\n",
|
| 222 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 223 |
+
" vertical-align: middle;\n",
|
| 224 |
+
" }\n",
|
| 225 |
+
"\n",
|
| 226 |
+
" .dataframe tbody tr th {\n",
|
| 227 |
+
" vertical-align: top;\n",
|
| 228 |
+
" }\n",
|
| 229 |
+
"\n",
|
| 230 |
+
" .dataframe thead th {\n",
|
| 231 |
+
" text-align: right;\n",
|
| 232 |
+
" }\n",
|
| 233 |
+
"</style>\n",
|
| 234 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 235 |
+
" <thead>\n",
|
| 236 |
+
" <tr style=\"text-align: right;\">\n",
|
| 237 |
+
" <th></th>\n",
|
| 238 |
+
" <th>title</th>\n",
|
| 239 |
+
" <th>price</th>\n",
|
| 240 |
+
" <th>rating</th>\n",
|
| 241 |
+
" </tr>\n",
|
| 242 |
+
" </thead>\n",
|
| 243 |
+
" <tbody>\n",
|
| 244 |
+
" <tr>\n",
|
| 245 |
+
" <th>0</th>\n",
|
| 246 |
+
" <td>A Light in the Attic</td>\n",
|
| 247 |
+
" <td>51.77</td>\n",
|
| 248 |
+
" <td>Three</td>\n",
|
| 249 |
+
" </tr>\n",
|
| 250 |
+
" <tr>\n",
|
| 251 |
+
" <th>1</th>\n",
|
| 252 |
+
" <td>Tipping the Velvet</td>\n",
|
| 253 |
+
" <td>53.74</td>\n",
|
| 254 |
+
" <td>One</td>\n",
|
| 255 |
+
" </tr>\n",
|
| 256 |
+
" <tr>\n",
|
| 257 |
+
" <th>2</th>\n",
|
| 258 |
+
" <td>Soumission</td>\n",
|
| 259 |
+
" <td>50.10</td>\n",
|
| 260 |
+
" <td>One</td>\n",
|
| 261 |
+
" </tr>\n",
|
| 262 |
+
" <tr>\n",
|
| 263 |
+
" <th>3</th>\n",
|
| 264 |
+
" <td>Sharp Objects</td>\n",
|
| 265 |
+
" <td>47.82</td>\n",
|
| 266 |
+
" <td>Four</td>\n",
|
| 267 |
+
" </tr>\n",
|
| 268 |
+
" <tr>\n",
|
| 269 |
+
" <th>4</th>\n",
|
| 270 |
+
" <td>Sapiens: A Brief History of Humankind</td>\n",
|
| 271 |
+
" <td>54.23</td>\n",
|
| 272 |
+
" <td>Five</td>\n",
|
| 273 |
+
" </tr>\n",
|
| 274 |
+
" </tbody>\n",
|
| 275 |
+
"</table>\n",
|
| 276 |
+
"</div>\n",
|
| 277 |
+
" <div class=\"colab-df-buttons\">\n",
|
| 278 |
+
"\n",
|
| 279 |
+
" <div class=\"colab-df-container\">\n",
|
| 280 |
+
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2ebb90b2-8344-410e-9d8b-174fef06d37d')\"\n",
|
| 281 |
+
" title=\"Convert this dataframe to an interactive table.\"\n",
|
| 282 |
+
" style=\"display:none;\">\n",
|
| 283 |
+
"\n",
|
| 284 |
+
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
|
| 285 |
+
" <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
|
| 286 |
+
" </svg>\n",
|
| 287 |
+
" </button>\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" <style>\n",
|
| 290 |
+
" .colab-df-container {\n",
|
| 291 |
+
" display:flex;\n",
|
| 292 |
+
" gap: 12px;\n",
|
| 293 |
+
" }\n",
|
| 294 |
+
"\n",
|
| 295 |
+
" .colab-df-convert {\n",
|
| 296 |
+
" background-color: #E8F0FE;\n",
|
| 297 |
+
" border: none;\n",
|
| 298 |
+
" border-radius: 50%;\n",
|
| 299 |
+
" cursor: pointer;\n",
|
| 300 |
+
" display: none;\n",
|
| 301 |
+
" fill: #1967D2;\n",
|
| 302 |
+
" height: 32px;\n",
|
| 303 |
+
" padding: 0 0 0 0;\n",
|
| 304 |
+
" width: 32px;\n",
|
| 305 |
+
" }\n",
|
| 306 |
+
"\n",
|
| 307 |
+
" .colab-df-convert:hover {\n",
|
| 308 |
+
" background-color: #E2EBFA;\n",
|
| 309 |
+
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
|
| 310 |
+
" fill: #174EA6;\n",
|
| 311 |
+
" }\n",
|
| 312 |
+
"\n",
|
| 313 |
+
" .colab-df-buttons div {\n",
|
| 314 |
+
" margin-bottom: 4px;\n",
|
| 315 |
+
" }\n",
|
| 316 |
+
"\n",
|
| 317 |
+
" [theme=dark] .colab-df-convert {\n",
|
| 318 |
+
" background-color: #3B4455;\n",
|
| 319 |
+
" fill: #D2E3FC;\n",
|
| 320 |
+
" }\n",
|
| 321 |
+
"\n",
|
| 322 |
+
" [theme=dark] .colab-df-convert:hover {\n",
|
| 323 |
+
" background-color: #434B5C;\n",
|
| 324 |
+
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
|
| 325 |
+
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
|
| 326 |
+
" fill: #FFFFFF;\n",
|
| 327 |
+
" }\n",
|
| 328 |
+
" </style>\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" <script>\n",
|
| 331 |
+
" const buttonEl =\n",
|
| 332 |
+
" document.querySelector('#df-2ebb90b2-8344-410e-9d8b-174fef06d37d button.colab-df-convert');\n",
|
| 333 |
+
" buttonEl.style.display =\n",
|
| 334 |
+
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
|
| 335 |
+
"\n",
|
| 336 |
+
" async function convertToInteractive(key) {\n",
|
| 337 |
+
" const element = document.querySelector('#df-2ebb90b2-8344-410e-9d8b-174fef06d37d');\n",
|
| 338 |
+
" const dataTable =\n",
|
| 339 |
+
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
|
| 340 |
+
" [key], {});\n",
|
| 341 |
+
" if (!dataTable) return;\n",
|
| 342 |
+
"\n",
|
| 343 |
+
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
|
| 344 |
+
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
|
| 345 |
+
" + ' to learn more about interactive tables.';\n",
|
| 346 |
+
" element.innerHTML = '';\n",
|
| 347 |
+
" dataTable['output_type'] = 'display_data';\n",
|
| 348 |
+
" await google.colab.output.renderOutput(dataTable, element);\n",
|
| 349 |
+
" const docLink = document.createElement('div');\n",
|
| 350 |
+
" docLink.innerHTML = docLinkHtml;\n",
|
| 351 |
+
" element.appendChild(docLink);\n",
|
| 352 |
+
" }\n",
|
| 353 |
+
" </script>\n",
|
| 354 |
+
" </div>\n",
|
| 355 |
+
"\n",
|
| 356 |
+
"\n",
|
| 357 |
+
" </div>\n",
|
| 358 |
+
" </div>\n"
|
| 359 |
+
],
|
| 360 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
| 361 |
+
"type": "dataframe",
|
| 362 |
+
"variable_name": "df_books",
|
| 363 |
+
"summary": "{\n \"name\": \"df_books\",\n \"rows\": 1000,\n \"fields\": [\n {\n \"column\": \"title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 999,\n \"samples\": [\n \"The Grownup\",\n \"Persepolis: The Story of a Childhood (Persepolis #1-2)\",\n \"Ayumi's Violin\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.446689669952772,\n \"min\": 10.0,\n \"max\": 59.99,\n \"num_unique_values\": 903,\n \"samples\": [\n 19.73,\n 55.65,\n 46.31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rating\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"One\",\n \"Two\",\n \"Four\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
|
| 364 |
+
}
|
| 365 |
+
},
|
| 366 |
+
"metadata": {},
|
| 367 |
+
"execution_count": 6
|
| 368 |
+
}
|
| 369 |
+
],
|
| 370 |
+
"source": [
|
| 371 |
+
"df_books.head()"
|
| 372 |
+
]
|
| 373 |
+
},
|
| 374 |
+
{
|
| 375 |
+
"cell_type": "markdown",
|
| 376 |
+
"metadata": {
|
| 377 |
+
"id": "p-1Pr2szaqLk"
|
| 378 |
+
},
|
| 379 |
+
"source": [
|
| 380 |
+
"## **3.** 🧩 Create a meaningful connection between real & synthetic datasets"
|
| 381 |
+
]
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"cell_type": "markdown",
|
| 385 |
+
"metadata": {
|
| 386 |
+
"id": "SIaJUGIpaH4V"
|
| 387 |
+
},
|
| 388 |
+
"source": [
|
| 389 |
+
"### *a. Initial setup*"
|
| 390 |
+
]
|
| 391 |
+
},
|
| 392 |
+
{
|
| 393 |
+
"cell_type": "code",
|
| 394 |
+
"execution_count": null,
|
| 395 |
+
"metadata": {
|
| 396 |
+
"id": "-gPXGcRPuV_9"
|
| 397 |
+
},
|
| 398 |
+
"outputs": [],
|
| 399 |
+
"source": [
|
| 400 |
+
"import numpy as np\n",
|
| 401 |
+
"import random\n",
|
| 402 |
+
"from datetime import datetime\n",
|
| 403 |
+
"import warnings\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"warnings.filterwarnings(\"ignore\")\n",
|
| 406 |
+
"random.seed(2025)\n",
|
| 407 |
+
"np.random.seed(2025)"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"cell_type": "markdown",
|
| 412 |
+
"metadata": {
|
| 413 |
+
"id": "pY4yCoIuaQqp"
|
| 414 |
+
},
|
| 415 |
+
"source": [
|
| 416 |
+
"### *b. Generate popularity scores based on rating (with some randomness) with a generate_popularity_score function*"
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"cell_type": "code",
|
| 421 |
+
"execution_count": null,
|
| 422 |
+
"metadata": {
|
| 423 |
+
"id": "mnd5hdAbaNjz"
|
| 424 |
+
},
|
| 425 |
+
"outputs": [],
|
| 426 |
+
"source": [
|
| 427 |
+
"def generate_popularity_score(rating):\n",
|
| 428 |
+
" base = {\"One\": 2, \"Two\": 3, \"Three\": 3, \"Four\": 4, \"Five\": 4}.get(rating, 3)\n",
|
| 429 |
+
" trend_factor = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]\n",
|
| 430 |
+
" return int(np.clip(base + trend_factor, 1, 5))"
|
| 431 |
+
]
|
| 432 |
+
},
|
| 433 |
+
{
|
| 434 |
+
"cell_type": "markdown",
|
| 435 |
+
"metadata": {
|
| 436 |
+
"id": "n4-TaNTFgPak"
|
| 437 |
+
},
|
| 438 |
+
"source": [
|
| 439 |
+
"### *c. ✋🏻🛑⛔️ Run the function to create a \"popularity_score\" column from \"rating\"*"
|
| 440 |
+
]
|
| 441 |
+
},
|
| 442 |
+
{
|
| 443 |
+
"cell_type": "code",
|
| 444 |
+
"execution_count": null,
|
| 445 |
+
"metadata": {
|
| 446 |
+
"id": "V-G3OCUCgR07"
|
| 447 |
+
},
|
| 448 |
+
"outputs": [],
|
| 449 |
+
"source": [
|
| 450 |
+
"df_books[\"popularity_score\"] = df_books[\"rating\"].apply(generate_popularity_score)"
|
| 451 |
+
]
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"cell_type": "markdown",
|
| 455 |
+
"metadata": {
|
| 456 |
+
"id": "HnngRNTgacYt"
|
| 457 |
+
},
|
| 458 |
+
"source": [
|
| 459 |
+
"### *d. Decide on the sentiment_label based on the popularity score with a get_sentiment function*"
|
| 460 |
+
]
|
| 461 |
+
},
|
| 462 |
+
{
|
| 463 |
+
"cell_type": "code",
|
| 464 |
+
"execution_count": null,
|
| 465 |
+
"metadata": {
|
| 466 |
+
"id": "kUtWmr8maZLZ"
|
| 467 |
+
},
|
| 468 |
+
"outputs": [],
|
| 469 |
+
"source": [
|
| 470 |
+
"def get_sentiment(popularity_score):\n",
|
| 471 |
+
" if popularity_score <= 2:\n",
|
| 472 |
+
" return \"negative\"\n",
|
| 473 |
+
" elif popularity_score == 3:\n",
|
| 474 |
+
" return \"neutral\"\n",
|
| 475 |
+
" else:\n",
|
| 476 |
+
" return \"positive\""
|
| 477 |
+
]
|
| 478 |
+
},
|
| 479 |
+
{
|
| 480 |
+
"cell_type": "markdown",
|
| 481 |
+
"metadata": {
|
| 482 |
+
"id": "HF9F9HIzgT7Z"
|
| 483 |
+
},
|
| 484 |
+
"source": [
|
| 485 |
+
"### *e. ✋🏻🛑⛔️ Run the function to create a \"sentiment_label\" column from \"popularity_score\"*"
|
| 486 |
+
]
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"cell_type": "code",
|
| 490 |
+
"execution_count": null,
|
| 491 |
+
"metadata": {
|
| 492 |
+
"id": "tafQj8_7gYCG"
|
| 493 |
+
},
|
| 494 |
+
"outputs": [],
|
| 495 |
+
"source": [
|
| 496 |
+
"df_books[\"sentiment_label\"] = df_books[\"popularity_score\"].apply(get_sentiment)"
|
| 497 |
+
]
|
| 498 |
+
},
|
| 499 |
+
{
|
| 500 |
+
"cell_type": "markdown",
|
| 501 |
+
"metadata": {
|
| 502 |
+
"id": "T8AdKkmASq9a"
|
| 503 |
+
},
|
| 504 |
+
"source": [
|
| 505 |
+
"## **4.** 📈 Generate synthetic book sales data of 18 months"
|
| 506 |
+
]
|
| 507 |
+
},
|
| 508 |
+
{
|
| 509 |
+
"cell_type": "markdown",
|
| 510 |
+
"metadata": {
|
| 511 |
+
"id": "OhXbdGD5fH0c"
|
| 512 |
+
},
|
| 513 |
+
"source": [
|
| 514 |
+
"### *a. Create a generate_sales_profit function that would generate sales patterns based on sentiment_label (with some randomness)*"
|
| 515 |
+
]
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"cell_type": "code",
|
| 519 |
+
"execution_count": null,
|
| 520 |
+
"metadata": {
|
| 521 |
+
"id": "qkVhYPXGbgEn"
|
| 522 |
+
},
|
| 523 |
+
"outputs": [],
|
| 524 |
+
"source": [
|
| 525 |
+
"def generate_sales_profile(sentiment):\n",
|
| 526 |
+
" months = pd.date_range(end=datetime.today(), periods=18, freq=\"M\")\n",
|
| 527 |
+
"\n",
|
| 528 |
+
" if sentiment == \"positive\":\n",
|
| 529 |
+
" base = random.randint(200, 300)\n",
|
| 530 |
+
" trend = np.linspace(base, base + random.randint(20, 60), len(months))\n",
|
| 531 |
+
" elif sentiment == \"negative\":\n",
|
| 532 |
+
" base = random.randint(20, 80)\n",
|
| 533 |
+
" trend = np.linspace(base, base - random.randint(10, 30), len(months))\n",
|
| 534 |
+
" else: # neutral\n",
|
| 535 |
+
" base = random.randint(80, 160)\n",
|
| 536 |
+
" trend = np.full(len(months), base + random.randint(-10, 10))\n",
|
| 537 |
+
"\n",
|
| 538 |
+
" seasonality = 10 * np.sin(np.linspace(0, 3 * np.pi, len(months)))\n",
|
| 539 |
+
" noise = np.random.normal(0, 5, len(months))\n",
|
| 540 |
+
" monthly_sales = np.clip(trend + seasonality + noise, a_min=0, a_max=None).astype(int)\n",
|
| 541 |
+
"\n",
|
| 542 |
+
" return list(zip(months.strftime(\"%Y-%m\"), monthly_sales))"
|
| 543 |
+
]
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"cell_type": "markdown",
|
| 547 |
+
"metadata": {
|
| 548 |
+
"id": "L2ak1HlcgoTe"
|
| 549 |
+
},
|
| 550 |
+
"source": [
|
| 551 |
+
"### *b. Run the function as part of building sales_data*"
|
| 552 |
+
]
|
| 553 |
+
},
|
| 554 |
+
{
|
| 555 |
+
"cell_type": "code",
|
| 556 |
+
"execution_count": null,
|
| 557 |
+
"metadata": {
|
| 558 |
+
"id": "SlJ24AUafoDB"
|
| 559 |
+
},
|
| 560 |
+
"outputs": [],
|
| 561 |
+
"source": [
|
| 562 |
+
"sales_data = []\n",
|
| 563 |
+
"for _, row in df_books.iterrows():\n",
|
| 564 |
+
" records = generate_sales_profile(row[\"sentiment_label\"])\n",
|
| 565 |
+
" for month, units in records:\n",
|
| 566 |
+
" sales_data.append({\n",
|
| 567 |
+
" \"title\": row[\"title\"],\n",
|
| 568 |
+
" \"month\": month,\n",
|
| 569 |
+
" \"units_sold\": units,\n",
|
| 570 |
+
" \"sentiment_label\": row[\"sentiment_label\"]\n",
|
| 571 |
+
" })"
|
| 572 |
+
]
|
| 573 |
+
},
|
| 574 |
+
{
|
| 575 |
+
"cell_type": "markdown",
|
| 576 |
+
"metadata": {
|
| 577 |
+
"id": "4IXZKcCSgxnq"
|
| 578 |
+
},
|
| 579 |
+
"source": [
|
| 580 |
+
"### *c. ✋🏻🛑⛔️ Create a df_sales DataFrame from sales_data*"
|
| 581 |
+
]
|
| 582 |
+
},
|
| 583 |
+
{
|
| 584 |
+
"cell_type": "code",
|
| 585 |
+
"execution_count": null,
|
| 586 |
+
"metadata": {
|
| 587 |
+
"id": "wcN6gtiZg-ws"
|
| 588 |
+
},
|
| 589 |
+
"outputs": [],
|
| 590 |
+
"source": [
|
| 591 |
+
"df_sales = pd.DataFrame(sales_data)"
|
| 592 |
+
]
|
| 593 |
+
},
|
| 594 |
+
{
|
| 595 |
+
"cell_type": "markdown",
|
| 596 |
+
"metadata": {
|
| 597 |
+
"id": "EhIjz9WohAmZ"
|
| 598 |
+
},
|
| 599 |
+
"source": [
|
| 600 |
+
"### *d. Save df_sales as synthetic_sales_data.csv & view first few lines*"
|
| 601 |
+
]
|
| 602 |
+
},
|
| 603 |
+
{
|
| 604 |
+
"cell_type": "code",
|
| 605 |
+
"execution_count": null,
|
| 606 |
+
"metadata": {
|
| 607 |
+
"colab": {
|
| 608 |
+
"base_uri": "https://localhost:8080/"
|
| 609 |
+
},
|
| 610 |
+
"id": "MzbZvLcAhGaH",
|
| 611 |
+
"outputId": "646e0702-73dc-4224-8027-e5a787b405b2"
|
| 612 |
+
},
|
| 613 |
+
"outputs": [
|
| 614 |
+
{
|
| 615 |
+
"output_type": "stream",
|
| 616 |
+
"name": "stdout",
|
| 617 |
+
"text": [
|
| 618 |
+
" title month units_sold sentiment_label\n",
|
| 619 |
+
"0 A Light in the Attic 2024-10 100 neutral\n",
|
| 620 |
+
"1 A Light in the Attic 2024-11 109 neutral\n",
|
| 621 |
+
"2 A Light in the Attic 2024-12 102 neutral\n",
|
| 622 |
+
"3 A Light in the Attic 2025-01 107 neutral\n",
|
| 623 |
+
"4 A Light in the Attic 2025-02 108 neutral\n"
|
| 624 |
+
]
|
| 625 |
+
}
|
| 626 |
+
],
|
| 627 |
+
"source": [
|
| 628 |
+
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
|
| 629 |
+
"\n",
|
| 630 |
+
"print(df_sales.head())"
|
| 631 |
+
]
|
| 632 |
+
},
|
| 633 |
+
{
|
| 634 |
+
"cell_type": "markdown",
|
| 635 |
+
"metadata": {
|
| 636 |
+
"id": "7g9gqBgQMtJn"
|
| 637 |
+
},
|
| 638 |
+
"source": [
|
| 639 |
+
"## **5.** 🎯 Generate synthetic customer reviews"
|
| 640 |
+
]
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"cell_type": "markdown",
|
| 644 |
+
"metadata": {
|
| 645 |
+
"id": "Gi4y9M9KuDWx"
|
| 646 |
+
},
|
| 647 |
+
"source": [
|
| 648 |
+
"### *a. ✋🏻🛑⛔️ Ask ChatGPT to create a list of 50 distinct generic book review texts for the sentiment labels \"positive\", \"neutral\", and \"negative\" called synthetic_reviews_by_sentiment*"
|
| 649 |
+
]
|
| 650 |
+
},
|
| 651 |
+
{
|
| 652 |
+
"cell_type": "code",
|
| 653 |
+
"execution_count": null,
|
| 654 |
+
"metadata": {
|
| 655 |
+
"id": "b3cd2a50"
|
| 656 |
+
},
|
| 657 |
+
"outputs": [],
|
| 658 |
+
"source": [
|
| 659 |
+
"synthetic_reviews_by_sentiment = {\n",
|
| 660 |
+
" \"positive\": [\n",
|
| 661 |
+
" \"A compelling and heartwarming read that stayed with me long after I finished.\",\n",
|
| 662 |
+
" \"Brilliantly written! The characters were unforgettable and the plot was engaging.\",\n",
|
| 663 |
+
" \"One of the best books I've read this year — inspiring and emotionally rich.\",\n",
|
| 664 |
+
" \"The author's storytelling was vivid and powerful. Highly recommended!\",\n",
|
| 665 |
+
" \"An absolute masterpiece. I couldn't put it down from start to finish.\",\n",
|
| 666 |
+
" \"Gripping, intelligent, and beautifully crafted — I loved every page.\",\n",
|
| 667 |
+
" \"The emotional depth and layered narrative were just perfect.\",\n",
|
| 668 |
+
" \"A thought-provoking journey with stunning character development.\",\n",
|
| 669 |
+
" \"Everything about this book just clicked. A top-tier read!\",\n",
|
| 670 |
+
" \"A flawless blend of emotion, intrigue, and style. Truly impressive.\",\n",
|
| 671 |
+
" \"Absolutely stunning work of fiction. Five stars from me.\",\n",
|
| 672 |
+
" \"Remarkably executed with breathtaking prose.\",\n",
|
| 673 |
+
" \"The pacing was perfect and I was hooked from page one.\",\n",
|
| 674 |
+
" \"Heartfelt and hopeful — a story well worth telling.\",\n",
|
| 675 |
+
" \"A vivid journey through complex emotions and stunning imagery.\",\n",
|
| 676 |
+
" \"This book had soul. Every word felt like it mattered.\",\n",
|
| 677 |
+
" \"It delivered more than I ever expected. Powerful and wise.\",\n",
|
| 678 |
+
" \"The characters leapt off the page and into my heart.\",\n",
|
| 679 |
+
" \"I could see every scene clearly in my mind — beautifully descriptive.\",\n",
|
| 680 |
+
" \"Refreshing, original, and impossible to forget.\",\n",
|
| 681 |
+
" \"A radiant celebration of resilience and love.\",\n",
|
| 682 |
+
" \"Powerful themes handled with grace and insight.\",\n",
|
| 683 |
+
" \"An unforgettable literary experience.\",\n",
|
| 684 |
+
" \"The best book club pick we’ve had all year.\",\n",
|
| 685 |
+
" \"A layered, lyrical narrative that resonates deeply.\",\n",
|
| 686 |
+
" \"Surprising, profound, and deeply humane.\",\n",
|
| 687 |
+
" \"One of those rare books I wish I could read again for the first time.\",\n",
|
| 688 |
+
" \"Both epic and intimate — a perfect balance.\",\n",
|
| 689 |
+
" \"It reads like a love letter to the human spirit.\",\n",
|
| 690 |
+
" \"Satisfying and uplifting with a memorable ending.\",\n",
|
| 691 |
+
" \"This novel deserves every bit of praise it gets.\",\n",
|
| 692 |
+
" \"Introspective, emotional, and elegantly composed.\",\n",
|
| 693 |
+
" \"A tour de force in contemporary fiction.\",\n",
|
| 694 |
+
" \"Left me smiling, teary-eyed, and completely fulfilled.\",\n",
|
| 695 |
+
" \"A novel with the rare ability to entertain and enlighten.\",\n",
|
| 696 |
+
" \"Incredibly moving. I highlighted so many lines.\",\n",
|
| 697 |
+
" \"A smart, sensitive take on relationships and identity.\",\n",
|
| 698 |
+
" \"You feel wiser by the end of it.\",\n",
|
| 699 |
+
" \"A gorgeously crafted tale about hope and second chances.\",\n",
|
| 700 |
+
" \"Poignant and real — a beautiful escape.\",\n",
|
| 701 |
+
" \"Brims with insight and authenticity.\",\n",
|
| 702 |
+
" \"Compelling characters and a satisfying plot.\",\n",
|
| 703 |
+
" \"An empowering and important read.\",\n",
|
| 704 |
+
" \"Elegantly crafted and deeply humane.\",\n",
|
| 705 |
+
" \"Taut storytelling that never lets go.\",\n",
|
| 706 |
+
" \"Each chapter offered a new treasure.\",\n",
|
| 707 |
+
" \"Lyrical writing that stays with you.\",\n",
|
| 708 |
+
" \"A wonderful blend of passion and thoughtfulness.\",\n",
|
| 709 |
+
" \"Uplifting, honest, and completely engrossing.\",\n",
|
| 710 |
+
" \"This one made me believe in storytelling again.\"\n",
|
| 711 |
+
" ],\n",
|
| 712 |
+
" \"neutral\": [\n",
|
| 713 |
+
" \"An average book — not great, but not bad either.\",\n",
|
| 714 |
+
" \"Some parts really stood out, others felt a bit flat.\",\n",
|
| 715 |
+
" \"It was okay overall. A decent way to pass the time.\",\n",
|
| 716 |
+
" \"The writing was fine, though I didn’t fully connect with the story.\",\n",
|
| 717 |
+
" \"Had a few memorable moments but lacked depth in some areas.\",\n",
|
| 718 |
+
" \"A mixed experience — neither fully engaging nor forgettable.\",\n",
|
| 719 |
+
" \"There was potential, but it didn't quite come together for me.\",\n",
|
| 720 |
+
" \"A reasonable effort that just didn’t leave a lasting impression.\",\n",
|
| 721 |
+
" \"Serviceable but not something I'd go out of my way to recommend.\",\n",
|
| 722 |
+
" \"Not much to dislike, but not much to rave about either.\",\n",
|
| 723 |
+
" \"It had its strengths, though they didn’t shine consistently.\",\n",
|
| 724 |
+
" \"I’m on the fence — parts were enjoyable, others not so much.\",\n",
|
| 725 |
+
" \"The book had a unique concept but lacked execution.\",\n",
|
| 726 |
+
" \"A middle-of-the-road read.\",\n",
|
| 727 |
+
" \"Engaging at times, but it lost momentum.\",\n",
|
| 728 |
+
" \"Would have benefited from stronger character development.\",\n",
|
| 729 |
+
" \"It passed the time, but I wouldn't reread it.\",\n",
|
| 730 |
+
" \"The plot had some holes that affected immersion.\",\n",
|
| 731 |
+
" \"Mediocre pacing made it hard to stay invested.\",\n",
|
| 732 |
+
" \"Satisfying in parts, underwhelming in others.\",\n",
|
| 733 |
+
" \"Neutral on this one — didn’t love it or hate it.\",\n",
|
| 734 |
+
" \"Fairly forgettable but with glimpses of promise.\",\n",
|
| 735 |
+
" \"The themes were solid, but not well explored.\",\n",
|
| 736 |
+
" \"Competent, just not compelling.\",\n",
|
| 737 |
+
" \"Had moments of clarity and moments of confusion.\",\n",
|
| 738 |
+
" \"I didn’t regret reading it, but I wouldn’t recommend it.\",\n",
|
| 739 |
+
" \"Readable, yet uninspired.\",\n",
|
| 740 |
+
" \"There was a spark, but it didn’t ignite.\",\n",
|
| 741 |
+
" \"A slow burn that didn’t quite catch fire.\",\n",
|
| 742 |
+
" \"I expected more nuance given the premise.\",\n",
|
| 743 |
+
" \"A safe, inoffensive choice.\",\n",
|
| 744 |
+
" \"Some parts lagged, others piqued my interest.\",\n",
|
| 745 |
+
" \"Decent, but needed polish.\",\n",
|
| 746 |
+
" \"Moderately engaging but didn’t stick the landing.\",\n",
|
| 747 |
+
" \"It simply lacked that emotional punch.\",\n",
|
| 748 |
+
" \"Just fine — no better, no worse.\",\n",
|
| 749 |
+
" \"Some thoughtful passages amid otherwise dry writing.\",\n",
|
| 750 |
+
" \"I appreciated the ideas more than the execution.\",\n",
|
| 751 |
+
" \"Struggled with cohesion.\",\n",
|
| 752 |
+
" \"Solidly average.\",\n",
|
| 753 |
+
" \"Good on paper, flat in practice.\",\n",
|
| 754 |
+
" \"A few bright spots, but mostly dim.\",\n",
|
| 755 |
+
" \"The kind of book that fades from memory.\",\n",
|
| 756 |
+
" \"It scratched the surface but didn’t dig deep.\",\n",
|
| 757 |
+
" \"Standard fare with some promise.\",\n",
|
| 758 |
+
" \"Okay, but not memorable.\",\n",
|
| 759 |
+
" \"Had potential that went unrealized.\",\n",
|
| 760 |
+
" \"Could have been tighter, sharper, deeper.\",\n",
|
| 761 |
+
" \"A blend of mediocrity and mild interest.\",\n",
|
| 762 |
+
" \"I kept reading, but barely.\"\n",
|
| 763 |
+
" ],\n",
|
| 764 |
+
" \"negative\": [\n",
|
| 765 |
+
" \"I struggled to get through this one — it just didn’t grab me.\",\n",
|
| 766 |
+
" \"The plot was confusing and the characters felt underdeveloped.\",\n",
|
| 767 |
+
" \"Disappointing. I had high hopes, but they weren't met.\",\n",
|
| 768 |
+
" \"Uninspired writing and a story that never quite took off.\",\n",
|
| 769 |
+
" \"Unfortunately, it was dull and predictable throughout.\",\n",
|
| 770 |
+
" \"The pacing dragged and I couldn’t find anything compelling.\",\n",
|
| 771 |
+
" \"This felt like a chore to read — lacked heart and originality.\",\n",
|
| 772 |
+
" \"Nothing really worked for me in this book.\",\n",
|
| 773 |
+
" \"A frustrating read that left me unsatisfied.\",\n",
|
| 774 |
+
" \"I kept hoping it would improve, but it never did.\",\n",
|
| 775 |
+
" \"The characters didn’t feel real, and the dialogue was forced.\",\n",
|
| 776 |
+
" \"I couldn't connect with the story at all.\",\n",
|
| 777 |
+
" \"A slow, meandering narrative with little payoff.\",\n",
|
| 778 |
+
" \"Tried too hard to be deep, but just felt empty.\",\n",
|
| 779 |
+
" \"The tone was uneven and confusing.\",\n",
|
| 780 |
+
" \"Way too repetitive and lacking progression.\",\n",
|
| 781 |
+
" \"The ending was abrupt and unsatisfying.\",\n",
|
| 782 |
+
" \"No emotional resonance — I felt nothing throughout.\",\n",
|
| 783 |
+
" \"I expected much more, but this fell flat.\",\n",
|
| 784 |
+
" \"Poorly edited and full of clichés.\",\n",
|
| 785 |
+
" \"The premise was interesting, but poorly executed.\",\n",
|
| 786 |
+
" \"Just didn’t live up to the praise.\",\n",
|
| 787 |
+
" \"A disjointed mess from start to finish.\",\n",
|
| 788 |
+
" \"Overly long and painfully dull.\",\n",
|
| 789 |
+
" \"Dialogue that felt robotic and unrealistic.\",\n",
|
| 790 |
+
" \"A hollow shell of what it could’ve been.\",\n",
|
| 791 |
+
" \"It lacked a coherent structure.\",\n",
|
| 792 |
+
" \"More confusing than complex.\",\n",
|
| 793 |
+
" \"Reading it felt like a task, not a treat.\",\n",
|
| 794 |
+
" \"There was no tension, no emotion — just words.\",\n",
|
| 795 |
+
" \"Characters with no motivation or development.\",\n",
|
| 796 |
+
" \"The plot twists were nonsensical.\",\n",
|
| 797 |
+
" \"Regret buying this book.\",\n",
|
| 798 |
+
" \"Nothing drew me in, nothing made me stay.\",\n",
|
| 799 |
+
" \"Too many subplots and none were satisfying.\",\n",
|
| 800 |
+
" \"Tedious and unimaginative.\",\n",
|
| 801 |
+
" \"Like reading a rough draft.\",\n",
|
| 802 |
+
" \"Disjointed, distant, and disappointing.\",\n",
|
| 803 |
+
" \"A lot of buildup with no payoff.\",\n",
|
| 804 |
+
" \"I don’t understand the hype.\",\n",
|
| 805 |
+
" \"This book simply didn’t work.\",\n",
|
| 806 |
+
" \"Forgettable in every sense.\",\n",
|
| 807 |
+
" \"More effort should’ve gone into editing.\",\n",
|
| 808 |
+
" \"The story lost its way early on.\",\n",
|
| 809 |
+
" \"It dragged endlessly.\",\n",
|
| 810 |
+
" \"I kept checking how many pages were left.\",\n",
|
| 811 |
+
" \"This lacked vision and clarity.\",\n",
|
| 812 |
+
" \"I expected substance — got fluff.\",\n",
|
| 813 |
+
" \"It failed to make me care.\"\n",
|
| 814 |
+
" ]\n",
|
| 815 |
+
"}"
|
| 816 |
+
]
|
| 817 |
+
},
|
| 818 |
+
{
|
| 819 |
+
"cell_type": "markdown",
|
| 820 |
+
"metadata": {
|
| 821 |
+
"id": "fQhfVaDmuULT"
|
| 822 |
+
},
|
| 823 |
+
"source": [
|
| 824 |
+
"### *b. Generate 10 reviews per book using random sampling from the corresponding 50*"
|
| 825 |
+
]
|
| 826 |
+
},
|
| 827 |
+
{
|
| 828 |
+
"cell_type": "code",
|
| 829 |
+
"execution_count": null,
|
| 830 |
+
"metadata": {
|
| 831 |
+
"id": "l2SRc3PjuTGM"
|
| 832 |
+
},
|
| 833 |
+
"outputs": [],
|
| 834 |
+
"source": [
|
| 835 |
+
"review_rows = []\n",
|
| 836 |
+
"for _, row in df_books.iterrows():\n",
|
| 837 |
+
" title = row['title']\n",
|
| 838 |
+
" sentiment_label = row['sentiment_label']\n",
|
| 839 |
+
" review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
|
| 840 |
+
" sampled_reviews = random.sample(review_pool, 10)\n",
|
| 841 |
+
" for review_text in sampled_reviews:\n",
|
| 842 |
+
" review_rows.append({\n",
|
| 843 |
+
" \"title\": title,\n",
|
| 844 |
+
" \"sentiment_label\": sentiment_label,\n",
|
| 845 |
+
" \"review_text\": review_text,\n",
|
| 846 |
+
" \"rating\": row['rating'],\n",
|
| 847 |
+
" \"popularity_score\": row['popularity_score']\n",
|
| 848 |
+
" })"
|
| 849 |
+
]
|
| 850 |
+
},
|
| 851 |
+
{
|
| 852 |
+
"cell_type": "markdown",
|
| 853 |
+
"metadata": {
|
| 854 |
+
"id": "bmJMXF-Bukdm"
|
| 855 |
+
},
|
| 856 |
+
"source": [
|
| 857 |
+
"### *c. Create the final dataframe df_reviews & save it as synthetic_book_reviews.csv*"
|
| 858 |
+
]
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"cell_type": "code",
|
| 862 |
+
"execution_count": null,
|
| 863 |
+
"metadata": {
|
| 864 |
+
"id": "ZUKUqZsuumsp"
|
| 865 |
+
},
|
| 866 |
+
"outputs": [],
|
| 867 |
+
"source": [
|
| 868 |
+
"df_reviews = pd.DataFrame(review_rows)\n",
|
| 869 |
+
"df_reviews.to_csv(\"synthetic_book_reviews.csv\", index=False)"
|
| 870 |
+
]
|
| 871 |
+
},
|
| 872 |
+
{
|
| 873 |
+
"cell_type": "code",
|
| 874 |
+
"execution_count": null,
|
| 875 |
+
"metadata": {
|
| 876 |
+
"colab": {
|
| 877 |
+
"base_uri": "https://localhost:8080/"
|
| 878 |
+
},
|
| 879 |
+
"id": "3946e521",
|
| 880 |
+
"outputId": "4ad8c609-99fd-41e8-9be7-43ebe4eef336"
|
| 881 |
+
},
|
| 882 |
+
"outputs": [
|
| 883 |
+
{
|
| 884 |
+
"output_type": "stream",
|
| 885 |
+
"name": "stdout",
|
| 886 |
+
"text": [
|
| 887 |
+
"✅ Wrote synthetic_title_level_features.csv\n",
|
| 888 |
+
"✅ Wrote synthetic_monthly_revenue_series.csv\n"
|
| 889 |
+
]
|
| 890 |
+
}
|
| 891 |
+
],
|
| 892 |
+
"source": [
|
| 893 |
+
"\n",
|
| 894 |
+
"# ============================================================\n",
|
| 895 |
+
"# ✅ Create \"R-ready\" derived inputs (root-level files)\n",
|
| 896 |
+
"# ============================================================\n",
|
| 897 |
+
"# These two files make the R notebook robust and fast:\n",
|
| 898 |
+
"# 1) synthetic_title_level_features.csv -> regression-ready, one row per title\n",
|
| 899 |
+
"# 2) synthetic_monthly_revenue_series.csv -> forecasting-ready, one row per month\n",
|
| 900 |
+
"\n",
|
| 901 |
+
"import numpy as np\n",
|
| 902 |
+
"\n",
|
| 903 |
+
"def _safe_num(s):\n",
|
| 904 |
+
" return pd.to_numeric(\n",
|
| 905 |
+
" pd.Series(s).astype(str).str.replace(r\"[^0-9.]\", \"\", regex=True),\n",
|
| 906 |
+
" errors=\"coerce\"\n",
|
| 907 |
+
" )\n",
|
| 908 |
+
"\n",
|
| 909 |
+
"# --- Clean book metadata (price/rating) ---\n",
|
| 910 |
+
"df_books_r = df_books.copy()\n",
|
| 911 |
+
"if \"price\" in df_books_r.columns:\n",
|
| 912 |
+
" df_books_r[\"price\"] = _safe_num(df_books_r[\"price\"])\n",
|
| 913 |
+
"if \"rating\" in df_books_r.columns:\n",
|
| 914 |
+
" df_books_r[\"rating\"] = _safe_num(df_books_r[\"rating\"])\n",
|
| 915 |
+
"\n",
|
| 916 |
+
"df_books_r[\"title\"] = df_books_r[\"title\"].astype(str).str.strip()\n",
|
| 917 |
+
"\n",
|
| 918 |
+
"# --- Clean sales ---\n",
|
| 919 |
+
"df_sales_r = df_sales.copy()\n",
|
| 920 |
+
"df_sales_r[\"title\"] = df_sales_r[\"title\"].astype(str).str.strip()\n",
|
| 921 |
+
"df_sales_r[\"month\"] = pd.to_datetime(df_sales_r[\"month\"], errors=\"coerce\")\n",
|
| 922 |
+
"df_sales_r[\"units_sold\"] = _safe_num(df_sales_r[\"units_sold\"])\n",
|
| 923 |
+
"\n",
|
| 924 |
+
"# --- Clean reviews ---\n",
|
| 925 |
+
"df_reviews_r = df_reviews.copy()\n",
|
| 926 |
+
"df_reviews_r[\"title\"] = df_reviews_r[\"title\"].astype(str).str.strip()\n",
|
| 927 |
+
"df_reviews_r[\"sentiment_label\"] = df_reviews_r[\"sentiment_label\"].astype(str).str.lower().str.strip()\n",
|
| 928 |
+
"if \"rating\" in df_reviews_r.columns:\n",
|
| 929 |
+
" df_reviews_r[\"rating\"] = _safe_num(df_reviews_r[\"rating\"])\n",
|
| 930 |
+
"if \"popularity_score\" in df_reviews_r.columns:\n",
|
| 931 |
+
" df_reviews_r[\"popularity_score\"] = _safe_num(df_reviews_r[\"popularity_score\"])\n",
|
| 932 |
+
"\n",
|
| 933 |
+
"# --- Sentiment shares per title (from reviews) ---\n",
|
| 934 |
+
"sent_counts = (\n",
|
| 935 |
+
" df_reviews_r.groupby([\"title\", \"sentiment_label\"])\n",
|
| 936 |
+
" .size()\n",
|
| 937 |
+
" .unstack(fill_value=0)\n",
|
| 938 |
+
")\n",
|
| 939 |
+
"for lab in [\"positive\", \"neutral\", \"negative\"]:\n",
|
| 940 |
+
" if lab not in sent_counts.columns:\n",
|
| 941 |
+
" sent_counts[lab] = 0\n",
|
| 942 |
+
"\n",
|
| 943 |
+
"sent_counts[\"total_reviews\"] = sent_counts[[\"positive\", \"neutral\", \"negative\"]].sum(axis=1)\n",
|
| 944 |
+
"den = sent_counts[\"total_reviews\"].replace(0, np.nan)\n",
|
| 945 |
+
"sent_counts[\"share_positive\"] = sent_counts[\"positive\"] / den\n",
|
| 946 |
+
"sent_counts[\"share_neutral\"] = sent_counts[\"neutral\"] / den\n",
|
| 947 |
+
"sent_counts[\"share_negative\"] = sent_counts[\"negative\"] / den\n",
|
| 948 |
+
"sent_counts = sent_counts.reset_index()\n",
|
| 949 |
+
"\n",
|
| 950 |
+
"# --- Sales aggregation per title ---\n",
|
| 951 |
+
"sales_by_title = (\n",
|
| 952 |
+
" df_sales_r.dropna(subset=[\"title\"])\n",
|
| 953 |
+
" .groupby(\"title\", as_index=False)\n",
|
| 954 |
+
" .agg(\n",
|
| 955 |
+
" months_observed=(\"month\", \"nunique\"),\n",
|
| 956 |
+
" avg_units_sold=(\"units_sold\", \"mean\"),\n",
|
| 957 |
+
" total_units_sold=(\"units_sold\", \"sum\"),\n",
|
| 958 |
+
" )\n",
|
| 959 |
+
")\n",
|
| 960 |
+
"\n",
|
| 961 |
+
"# --- Title-level features (join sales + books + sentiment) ---\n",
|
| 962 |
+
"df_title = (\n",
|
| 963 |
+
" sales_by_title\n",
|
| 964 |
+
" .merge(df_books_r[[\"title\", \"price\", \"rating\"]], on=\"title\", how=\"left\")\n",
|
| 965 |
+
" .merge(sent_counts[[\"title\", \"share_positive\", \"share_neutral\", \"share_negative\", \"total_reviews\"]],\n",
|
| 966 |
+
" on=\"title\", how=\"left\")\n",
|
| 967 |
+
")\n",
|
| 968 |
+
"\n",
|
| 969 |
+
"df_title[\"avg_revenue\"] = df_title[\"avg_units_sold\"] * df_title[\"price\"]\n",
|
| 970 |
+
"df_title[\"total_revenue\"] = df_title[\"total_units_sold\"] * df_title[\"price\"]\n",
|
| 971 |
+
"\n",
|
| 972 |
+
"df_title.to_csv(\"synthetic_title_level_features.csv\", index=False)\n",
|
| 973 |
+
"print(\"✅ Wrote synthetic_title_level_features.csv\")\n",
|
| 974 |
+
"\n",
|
| 975 |
+
"# --- Monthly revenue series (proxy: units_sold * price) ---\n",
|
| 976 |
+
"monthly_rev = (\n",
|
| 977 |
+
" df_sales_r.merge(df_books_r[[\"title\", \"price\"]], on=\"title\", how=\"left\")\n",
|
| 978 |
+
")\n",
|
| 979 |
+
"monthly_rev[\"revenue\"] = monthly_rev[\"units_sold\"] * monthly_rev[\"price\"]\n",
|
| 980 |
+
"\n",
|
| 981 |
+
"df_monthly = (\n",
|
| 982 |
+
" monthly_rev.dropna(subset=[\"month\"])\n",
|
| 983 |
+
" .groupby(\"month\", as_index=False)[\"revenue\"]\n",
|
| 984 |
+
" .sum()\n",
|
| 985 |
+
" .rename(columns={\"revenue\": \"total_revenue\"})\n",
|
| 986 |
+
" .sort_values(\"month\")\n",
|
| 987 |
+
")\n",
|
| 988 |
+
"# if revenue is all NA (e.g., missing price), fallback to units_sold as a teaching proxy\n",
|
| 989 |
+
"if df_monthly[\"total_revenue\"].notna().sum() == 0:\n",
|
| 990 |
+
" df_monthly = (\n",
|
| 991 |
+
" df_sales_r.dropna(subset=[\"month\"])\n",
|
| 992 |
+
" .groupby(\"month\", as_index=False)[\"units_sold\"]\n",
|
| 993 |
+
" .sum()\n",
|
| 994 |
+
" .rename(columns={\"units_sold\": \"total_revenue\"})\n",
|
| 995 |
+
" .sort_values(\"month\")\n",
|
| 996 |
+
" )\n",
|
| 997 |
+
"\n",
|
| 998 |
+
"df_monthly[\"month\"] = pd.to_datetime(df_monthly[\"month\"], errors=\"coerce\").dt.strftime(\"%Y-%m-%d\")\n",
|
| 999 |
+
"df_monthly.to_csv(\"synthetic_monthly_revenue_series.csv\", index=False)\n",
|
| 1000 |
+
"print(\"✅ Wrote synthetic_monthly_revenue_series.csv\")\n"
|
| 1001 |
+
]
|
| 1002 |
+
},
|
| 1003 |
+
{
|
| 1004 |
+
"cell_type": "markdown",
|
| 1005 |
+
"metadata": {
|
| 1006 |
+
"id": "RYvGyVfXuo54"
|
| 1007 |
+
},
|
| 1008 |
+
"source": [
|
| 1009 |
+
"### *d. ✋🏻🛑⛔️ View the first few lines*"
|
| 1010 |
+
]
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"cell_type": "code",
|
| 1014 |
+
"execution_count": null,
|
| 1015 |
+
"metadata": {
|
| 1016 |
+
"colab": {
|
| 1017 |
+
"base_uri": "https://localhost:8080/"
|
| 1018 |
+
},
|
| 1019 |
+
"id": "xfE8NMqOurKo",
|
| 1020 |
+
"outputId": "c21804fe-c941-4881-d6d2-279f1ce27cb4"
|
| 1021 |
+
},
|
| 1022 |
+
"outputs": [
|
| 1023 |
+
{
|
| 1024 |
+
"output_type": "stream",
|
| 1025 |
+
"name": "stdout",
|
| 1026 |
+
"text": [
|
| 1027 |
+
" title sentiment_label \\\n",
|
| 1028 |
+
"0 A Light in the Attic neutral \n",
|
| 1029 |
+
"1 A Light in the Attic neutral \n",
|
| 1030 |
+
"2 A Light in the Attic neutral \n",
|
| 1031 |
+
"3 A Light in the Attic neutral \n",
|
| 1032 |
+
"4 A Light in the Attic neutral \n",
|
| 1033 |
+
"\n",
|
| 1034 |
+
" review_text rating popularity_score \n",
|
| 1035 |
+
"0 Had potential that went unrealized. Three 3 \n",
|
| 1036 |
+
"1 The themes were solid, but not well explored. Three 3 \n",
|
| 1037 |
+
"2 It simply lacked that emotional punch. Three 3 \n",
|
| 1038 |
+
"3 Serviceable but not something I'd go out of my... Three 3 \n",
|
| 1039 |
+
"4 Standard fare with some promise. Three 3 \n"
|
| 1040 |
+
]
|
| 1041 |
+
}
|
| 1042 |
+
],
|
| 1043 |
+
"source": [
|
| 1044 |
+
"print(df_reviews.head())"
|
| 1045 |
+
]
|
| 1046 |
+
}
|
| 1047 |
+
],
|
| 1048 |
+
"metadata": {
|
| 1049 |
+
"colab": {
|
| 1050 |
+
"collapsed_sections": [
|
| 1051 |
+
"jpASMyIQMaAq",
|
| 1052 |
+
"lquNYCbfL9IM",
|
| 1053 |
+
"0IWuNpxxYDJF",
|
| 1054 |
+
"oCdTsin2Yfp3",
|
| 1055 |
+
"T0TOeRC4Yrnn",
|
| 1056 |
+
"duI5dv3CZYvF",
|
| 1057 |
+
"qMjRKMBQZlJi",
|
| 1058 |
+
"p-1Pr2szaqLk",
|
| 1059 |
+
"SIaJUGIpaH4V",
|
| 1060 |
+
"pY4yCoIuaQqp",
|
| 1061 |
+
"n4-TaNTFgPak",
|
| 1062 |
+
"HnngRNTgacYt",
|
| 1063 |
+
"HF9F9HIzgT7Z",
|
| 1064 |
+
"T8AdKkmASq9a",
|
| 1065 |
+
"OhXbdGD5fH0c",
|
| 1066 |
+
"L2ak1HlcgoTe",
|
| 1067 |
+
"4IXZKcCSgxnq",
|
| 1068 |
+
"EhIjz9WohAmZ",
|
| 1069 |
+
"Gi4y9M9KuDWx",
|
| 1070 |
+
"fQhfVaDmuULT",
|
| 1071 |
+
"bmJMXF-Bukdm",
|
| 1072 |
+
"RYvGyVfXuo54"
|
| 1073 |
+
],
|
| 1074 |
+
"provenance": []
|
| 1075 |
+
},
|
| 1076 |
+
"kernelspec": {
|
| 1077 |
+
"display_name": "Python 3",
|
| 1078 |
+
"name": "python3"
|
| 1079 |
+
},
|
| 1080 |
+
"language_info": {
|
| 1081 |
+
"name": "python"
|
| 1082 |
+
}
|
| 1083 |
+
},
|
| 1084 |
+
"nbformat": 4,
|
| 1085 |
+
"nbformat_minor": 0
|
| 1086 |
+
}
|
pythonanalysis.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scraped_motivational_quotes.csv
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
quote,author,tags
|
| 2 |
+
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”,Albert Einstein,"inspirational, life, live, miracle, miracles"
|
| 3 |
+
"“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",Marilyn Monroe,"be-yourself, inspirational"
|
| 4 |
+
“Try not to become a man of success. Rather become a man of value.”,Albert Einstein,"adulthood, success, value"
|
| 5 |
+
“It is better to be hated for what you are than to be loved for what you are not.”,André Gide,"life, love"
|
| 6 |
+
"“I have not failed. I've just found 10,000 ways that won't work.”",Thomas A. Edison,"edison, failure, inspirational, paraphrased"
|
| 7 |
+
"“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.”",Marilyn Monroe,"friends, heartbreak, inspirational, life, love, sisters"
|
| 8 |
+
"“I may not have gone where I intended to go, but I think I have ended up where I needed to be.”",Douglas Adams,"life, navigation"
|
| 9 |
+
"“The opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.”",Elie Wiesel,"activism, apathy, hate, indifference, inspirational, love, opposite, philosophy"
|
| 10 |
+
"“Good friends, good books, and a sleepy conscience: this is the ideal life.”",Mark Twain,"books, contentment, friends, friendship, life"
|
| 11 |
+
“Life is what happens to us while we are making other plans.”,Allen Saunders,"fate, life, misattributed-john-lennon, planning, plans"
|
| 12 |
+
"“Today you are You, that is truer than true. There is no one alive who is Youer than You.”",Dr. Seuss,"comedy, life, yourself"
|
| 13 |
+
"“The more that you read, the more things you will know. The more that you learn, the more places you'll go.”",Dr. Seuss,"learning, reading, seuss"
|
| 14 |
+
"“To the well-organized mind, death is but the next great adventure.”",J.K. Rowling,"death, inspirational"
|
| 15 |
+
“Any fool can know. The point is to understand.”,Albert Einstein,"knowledge, learning, understanding, wisdom"
|
| 16 |
+
“It is never too late to be what you might have been.”,George Eliot,inspirational
|
| 17 |
+
“You can never get a cup of tea large enough or a book long enough to suit me.”,C.S. Lewis,"books, inspirational, reading, tea"
|
| 18 |
+
“You believe lies so you eventually learn to trust no one but yourself.”,Marilyn Monroe,
|
| 19 |
+
"“Life is like riding a bicycle. To keep your balance, you must keep moving.”",Albert Einstein,"life, simile"
|
| 20 |
+
“Only in the darkness can you see the stars.”,Martin Luther King Jr.,"hope, inspirational"
|
scraped_quotes.csv
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
quote,author,tags
|
| 2 |
+
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,Albert Einstein,"change, deep-thoughts, thinking, world"
|
| 3 |
+
"“It is our choices, Harry, that show what we truly are, far more than our abilities.”",J.K. Rowling,"abilities, choices"
|
| 4 |
+
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”,Albert Einstein,"inspirational, life, live, miracle, miracles"
|
| 5 |
+
"“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”",Jane Austen,"aliteracy, books, classic, humor"
|
| 6 |
+
"“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",Marilyn Monroe,"be-yourself, inspirational"
|
| 7 |
+
“Try not to become a man of success. Rather become a man of value.”,Albert Einstein,"adulthood, success, value"
|
| 8 |
+
“It is better to be hated for what you are than to be loved for what you are not.”,André Gide,"life, love"
|
| 9 |
+
"“I have not failed. I've just found 10,000 ways that won't work.”",Thomas A. Edison,"edison, failure, inspirational, paraphrased"
|
| 10 |
+
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”,Eleanor Roosevelt,misattributed-eleanor-roosevelt
|
| 11 |
+
"“A day without sunshine is like, you know, night.”",Steve Martin,"humor, obvious, simile"
|
| 12 |
+
"“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantly, keep smiling, because life's a beautiful thing and there's so much to smile about.”",Marilyn Monroe,"friends, heartbreak, inspirational, life, love, sisters"
|
| 13 |
+
"“It takes a great deal of bravery to stand up to our enemies, but just as much to stand up to our friends.”",J.K. Rowling,"courage, friends"
|
| 14 |
+
"“If you can't explain it to a six year old, you don't understand it yourself.”",Albert Einstein,"simplicity, understand"
|
| 15 |
+
"“You may not be her first, her last, or her only. She loved before she may love again. But if she loves you now, what else matters? She's not perfect—you aren't either, and the two of you may never be perfect together but if she can make you laugh, cause you to think twice, and admit to being human and making mistakes, hold onto her and give her the most you can. She may not be thinking about you every second of the day, but she will give you a part of her that she knows you can break—her heart. So don't hurt her, don't change her, don't analyze and don't expect more than she can give. Smile when she makes you happy, let her know when she makes you mad, and miss her when she's not there.”",Bob Marley,love
|
| 16 |
+
"“I like nonsense, it wakes up the brain cells. Fantasy is a necessary ingredient in living.”",Dr. Seuss,fantasy
|
| 17 |
+
"“I may not have gone where I intended to go, but I think I have ended up where I needed to be.”",Douglas Adams,"life, navigation"
|
| 18 |
+
"“The opposite of love is not hate, it's indifference. The opposite of art is not ugliness, it's indifference. The opposite of faith is not heresy, it's indifference. And the opposite of life is not death, it's indifference.”",Elie Wiesel,"activism, apathy, hate, indifference, inspirational, love, opposite, philosophy"
|
| 19 |
+
"“It is not a lack of love, but a lack of friendship that makes unhappy marriages.”",Friedrich Nietzsche,"friendship, lack-of-friendship, lack-of-love, love, marriage, unhappy-marriage"
|
| 20 |
+
"“Good friends, good books, and a sleepy conscience: this is the ideal life.”",Mark Twain,"books, contentment, friends, friendship, life"
|
| 21 |
+
“Life is what happens to us while we are making other plans.”,Allen Saunders,"fate, life, misattributed-john-lennon, planning, plans"
|
| 22 |
+
"“I love you without knowing how, or when, or from where. I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.”",Pablo Neruda,"love, poetry"
|
| 23 |
+
“For every minute you are angry you lose sixty seconds of happiness.”,Ralph Waldo Emerson,happiness
|
| 24 |
+
"“If you judge people, you have no time to love them.”",Mother Teresa,attributed-no-source
|
| 25 |
+
“Anyone who thinks sitting in church can make you a Christian must also think that sitting in a garage can make you a car.”,Garrison Keillor,"humor, religion"
|
| 26 |
+
“Beauty is in the eye of the beholder and it may be necessary from time to time to give a stupid or misinformed beholder a black eye.”,Jim Henson,humor
|
| 27 |
+
"“Today you are You, that is truer than true. There is no one alive who is Youer than You.”",Dr. Seuss,"comedy, life, yourself"
|
| 28 |
+
"“If you want your children to be intelligent, read them fairy tales. If you want them to be more intelligent, read them more fairy tales.”",Albert Einstein,"children, fairy-tales"
|
| 29 |
+
"“It is impossible to live without failing at something, unless you live so cautiously that you might as well not have lived at all - in which case, you fail by default.”",J.K. Rowling,
|
| 30 |
+
“Logic will get you from A to Z; imagination will get you everywhere.”,Albert Einstein,imagination
|
| 31 |
+
"“One good thing about music, when it hits you, you feel no pain.”",Bob Marley,music
|
| 32 |
+
"“The more that you read, the more things you will know. The more that you learn, the more places you'll go.”",Dr. Seuss,"learning, reading, seuss"
|
| 33 |
+
"“Of course it is happening inside your head, Harry, but why on earth should that mean that it is not real?”",J.K. Rowling,dumbledore
|
| 34 |
+
"“The truth is, everyone is going to hurt you. You just got to find the ones worth suffering for.”",Bob Marley,friendship
|
| 35 |
+
“Not all of us can do great things. But we can do small things with great love.”,Mother Teresa,"misattributed-to-mother-teresa, paraphrased"
|
| 36 |
+
"“To the well-organized mind, death is but the next great adventure.”",J.K. Rowling,"death, inspirational"
|
| 37 |
+
“All you need is love. But a little chocolate now and then doesn't hurt.”,Charles M. Schulz,"chocolate, food, humor"
|
| 38 |
+
“We read to know we're not alone.”,William Nicholson,"misattributed-to-c-s-lewis, reading"
|
| 39 |
+
“Any fool can know. The point is to understand.”,Albert Einstein,"knowledge, learning, understanding, wisdom"
|
| 40 |
+
“I have always imagined that Paradise will be a kind of library.”,Jorge Luis Borges,"books, library"
|
| 41 |
+
“It is never too late to be what you might have been.”,George Eliot,inspirational
|
| 42 |
+
"“A reader lives a thousand lives before he dies, said Jojen. The man who never reads lives only one.”",George R.R. Martin,"read, readers, reading, reading-books"
|
| 43 |
+
“You can never get a cup of tea large enough or a book long enough to suit me.”,C.S. Lewis,"books, inspirational, reading, tea"
|
| 44 |
+
“You believe lies so you eventually learn to trust no one but yourself.”,Marilyn Monroe,
|
| 45 |
+
"“If you can make a woman laugh, you can make her do anything.”",Marilyn Monroe,"girls, love"
|
| 46 |
+
"“Life is like riding a bicycle. To keep your balance, you must keep moving.”",Albert Einstein,"life, simile"
|
| 47 |
+
“The real lover is the man who can thrill you by kissing your forehead or smiling into your eyes or just staring into space.”,Marilyn Monroe,love
|
| 48 |
+
"“A wise girl kisses but doesn't love, listens but doesn't believe, and leaves before she is left.”",Marilyn Monroe,attributed-no-source
|
| 49 |
+
“Only in the darkness can you see the stars.”,Martin Luther King Jr.,"hope, inspirational"
|
| 50 |
+
"“It matters not what someone is born, but what they grow to be.”",J.K. Rowling,dumbledore
|
| 51 |
+
"“Love does not begin and end the way we seem to think it does. Love is a battle, love is a war; love is a growing up.”",James Baldwin,love
|
student_dataset.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
student_dataset_enriched.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
synthetic_student_reviews.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|