Seagle123 committed on
Commit
b2590d8
·
verified ·
1 Parent(s): 24dac98

Upload 4 files

Browse files
Files changed (4) hide show
  1. agentic_pipeline.py +469 -0
  2. app_v2.py +624 -0
  3. lstm_model.py +344 -0
  4. requirements.txt +3 -0
agentic_pipeline.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AUTOMATION 3 — Agentic Pipeline Orchestrator
3
+ =============================================
4
+ Autonomously executes the full analytical pipeline end-to-end:
5
+ Stage 1: Data ingestion & validation
6
+ Stage 2: Synthetic dataset generation
7
+ Stage 3: Feature engineering & model training
8
+ Stage 4: Inference & metric extraction
9
+ Stage 5: Structured report generation
10
+
11
+ Usage:
12
+ python3 agentic_pipeline.py
13
+ python3 agentic_pipeline.py --mode amazon
14
+ python3 agentic_pipeline.py --mode spotify
15
+ python3 agentic_pipeline.py --mode both --output my_report.txt
16
+ """
17
+
18
+ import pandas as pd
19
+ import numpy as np
20
+ import argparse
21
+ import json
22
+ import os
23
+ import sys
24
+ from datetime import datetime
25
+ from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
26
+ from sklearn.model_selection import train_test_split
27
+ from sklearn.metrics import mean_absolute_error, r2_score, classification_report
28
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
29
+
30
+ # ── LOGGING ──────────────────────────────────────────────────
31
+
32
def log(stage, msg, level="INFO"):
    """Print one timestamped status line tagged with the pipeline stage.

    Args:
        stage: Label shown in the second bracket (e.g. ``"STAGE 1"``).
        msg: Free-form message text.
        level: One of ``"INFO"``, ``"WARN"``, ``"ERROR"`` or ``"START"``;
            any other value gets the generic marker.
    """
    markers = {"INFO": "βœ“", "WARN": "⚠", "ERROR": "βœ—", "START": "β†’"}
    marker = markers[level] if level in markers else "Β·"
    stamp = datetime.now().strftime("%H:%M:%S")
    print("[{}] [{}] {} {}".format(stamp, stage, marker, msg))
36
+
37
+ # ── STAGE 1: DATA INGESTION & VALIDATION ─────────────────────
38
+
39
def _to_number(series):
    """Parse a column of formatted numbers into floats.

    Everything except digits, ``.`` and ``-`` is stripped first, so currency
    symbols, thousands separators and percent signs are all tolerated, and
    cells that are already numeric survive. Unparseable cells become NaN
    instead of raising.
    """
    cleaned = series.astype(str).str.replace(r"[^0-9.\-]", "", regex=True)
    return pd.to_numeric(cleaned, errors="coerce")


def _ingest_amazon():
    """Load and clean ``amazon/amazon.csv``; return None if the file is missing."""
    log("STAGE 1", "Loading Amazon dataset...")
    try:
        df = pd.read_csv("amazon/amazon.csv")
    except FileNotFoundError:
        log("STAGE 1", "amazon.csv not found β€” will use synthetic only", "WARN")
        return None
    log("STAGE 1", f"Raw records: {len(df)}")

    df["discounted_price"] = _to_number(df["discounted_price"])
    df["actual_price"] = _to_number(df["actual_price"])
    df["discount_pct"] = _to_number(df["discount_percentage"])
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
    df["rating_count"] = _to_number(df["rating_count"])

    # .copy() so the column assignments below never write into a view.
    df = df.dropna(
        subset=["rating", "rating_count", "discounted_price", "actual_price"]
    ).copy()
    df["log_sales"] = np.log1p(df["rating_count"])
    df["main_category"] = df["category"].apply(
        lambda x: x.split("|")[0] if isinstance(x, str) else "Other")

    # log_sales is always computed (later stages require it); the skewness is
    # reported for information only, so the message must not claim otherwise.
    skewness = df["rating_count"].skew()
    log("STAGE 1", f"Sales skewness: {skewness:.2f} β€” log1p transform applied to rating_count")

    log("STAGE 1", f"Amazon clean records: {len(df)} βœ“")
    return df


def _ingest_spotify():
    """Load and clean ``spotify/dataset.csv``; return None if the file is missing."""
    log("STAGE 1", "Loading Spotify dataset...")
    try:
        df = pd.read_csv("spotify/dataset.csv")
    except FileNotFoundError:
        log("STAGE 1", "dataset.csv not found β€” will use synthetic only", "WARN")
        return None

    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    df = df.dropna(subset=["popularity", "danceability", "energy", "loudness", "tempo"])
    # Keep the most popular row for each track_id.
    df = df.sort_values("popularity", ascending=False).drop_duplicates("track_id").copy()

    threshold = df["popularity"].quantile(0.75)
    df["is_hit"] = (df["popularity"] >= threshold).astype(int)
    df["success_tier"] = pd.cut(df["popularity"],
                                bins=[0, 20, 40, 60, 80, 100],
                                labels=["Obscure", "Low", "Mid", "Popular", "Hit"],
                                include_lowest=True)
    df["explicit"] = df["explicit"].astype(int)

    # Down-sample large datasets, keeping each tier's share of the rows.
    MEMORY_THRESHOLD = 20000
    if len(df) > MEMORY_THRESHOLD:
        log("STAGE 1", f"Dataset size ({len(df)}) exceeds threshold ({MEMORY_THRESHOLD}) β€” applying stratified sampling", "WARN")
        total = len(df)
        # Explicit per-group sampling keeps the grouping column in the result,
        # which groupby.apply no longer guarantees on recent pandas versions.
        parts = [
            group.sample(min(len(group), int(MEMORY_THRESHOLD * len(group) / total)),
                         random_state=42)
            for _, group in df.groupby("success_tier", observed=True)
        ]
        df = pd.concat(parts).reset_index(drop=True)
        log("STAGE 1", f"Stratified sample size: {len(df)} (popularity tiers preserved)")

    log("STAGE 1", f"Spotify clean records: {len(df)} βœ“")
    return df


def stage1_ingest(mode):
    """Stage 1: load and validate the real datasets selected by ``mode``.

    Args:
        mode: ``"amazon"``, ``"spotify"`` or ``"both"``.

    Returns:
        dict with key ``"amazon_df"`` and/or ``"spotify_df"`` for the selected
        datasets. A value is a cleaned DataFrame, or None when the CSV file
        was not found (later stages then fall back to synthetic data).
    """
    log("STAGE 1", "Starting data ingestion and validation", "START")
    results = {}

    if mode in ("amazon", "both"):
        results["amazon_df"] = _ingest_amazon()

    if mode in ("spotify", "both"):
        results["spotify_df"] = _ingest_spotify()

    log("STAGE 1", "Data ingestion complete βœ“")
    return results
111
+
112
+ # ── STAGE 2: SYNTHETIC DATA GENERATION ───────────────────────
113
+
114
def stage2_synthetic(mode, n=500):
    """Stage 2: generate seeded synthetic datasets and write them to CSV.

    Args:
        mode: ``"amazon"``, ``"spotify"`` or ``"both"``.
        n: Number of synthetic rows per domain.

    Returns:
        dict with ``"amazon_synthetic"`` and/or ``"spotify_synthetic"``
        DataFrames. The same frames are saved as ``amazon_synthetic.csv`` and
        ``spotify_synthetic.csv`` in the working directory.
    """
    log("STAGE 2", f"Generating synthetic datasets (n={n} per domain)", "START")
    generated = {}
    # The global NumPy seed makes the output reproducible; the order of the
    # random draws below therefore determines the data and must not change.
    np.random.seed(42)

    def label_for(score):
        # Same thresholds for every synthetic sentiment score.
        if score > 0.05:
            return "Positive"
        if score < -0.05:
            return "Negative"
        return "Neutral"

    if mode in ("amazon", "both"):
        log("STAGE 2", "Generating Amazon synthetic data...")
        category_pool = ["Electronics","Clothing","HomeKitchen","Books","Sports","Beauty","Toys"]
        cat = np.random.choice(category_pool, n)
        list_price = np.random.lognormal(mean=5.5, sigma=1.2, size=n).round(2)
        pct_off = np.random.randint(5, 80, n)
        sale_price = (list_price * (1 - pct_off/100)).round(2)
        stars = np.clip(np.random.normal(4.0, 0.6, n), 1, 5).round(1)
        sentiment = np.clip((stars - 3)/2 + np.random.normal(0, 0.2, n), -1, 1).round(3)
        latent = 2 + 0.8*stars + 0.5*sentiment + 0.3*(pct_off/100) + np.random.normal(0, 0.5, n)
        review_count = np.round(np.expm1(np.clip(latent, 0, 15))).astype(int)

        amazon_cols = {}
        amazon_cols["product_id"] = [f"SYNTH{i:04d}" for i in range(n)]
        amazon_cols["category"] = cat
        amazon_cols["actual_price"] = list_price
        amazon_cols["discounted_price"] = sale_price
        amazon_cols["discount_pct"] = pct_off
        amazon_cols["rating"] = stars
        amazon_cols["rating_count"] = review_count
        amazon_cols["log_sales"] = np.log1p(review_count)
        amazon_cols["sentiment_score"] = sentiment
        amazon_cols["sentiment_label"] = [label_for(s) for s in sentiment]
        amazon_cols["data_source"] = "synthetic"
        amazon_frame = pd.DataFrame(amazon_cols)

        amazon_frame.to_csv("amazon_synthetic.csv", index=False)
        generated["amazon_synthetic"] = amazon_frame
        log("STAGE 2", f"Amazon synthetic: {len(amazon_frame)} records saved βœ“")

    if mode in ("spotify", "both"):
        log("STAGE 2", "Generating Spotify synthetic data...")
        genre_pool = ["pop","hip-hop","rock","electronic","jazz","classical","r-n-b","country","latin","indie"]
        dance = np.random.beta(5, 3, n).round(3)
        nrg = np.random.beta(4, 3, n).round(3)
        loud = np.random.normal(-8, 4, n).round(3)
        bpm = np.random.normal(120, 25, n).round(1)
        mood = np.random.beta(3, 3, n).round(3)
        acoustic = np.random.beta(2, 5, n).round(3)
        speech = np.random.beta(1.5, 8, n).round(3)
        instrumental = np.random.beta(1, 6, n).round(3)
        length_ms = np.random.normal(210000, 40000, n).astype(int)
        is_explicit = np.random.choice([0,1], n, p=[0.8,0.2])
        raw_pop = 20 + 30*dance + 15*nrg + 0.5*(loud+20) + np.random.normal(0, 10, n)
        pop = np.clip(raw_pop, 0, 100).round(0).astype(int)
        # Genre is the last random draw, exactly as in the original layout.
        genre_draw = np.random.choice(genre_pool, n)

        spotify_cols = {}
        spotify_cols["track_id"] = [f"SYNTH{i:04d}" for i in range(n)]
        spotify_cols["track_genre"] = genre_draw
        spotify_cols["popularity"] = pop
        spotify_cols["danceability"] = dance
        spotify_cols["energy"] = nrg
        spotify_cols["loudness"] = loud
        spotify_cols["tempo"] = bpm
        spotify_cols["valence"] = mood
        spotify_cols["acousticness"] = acoustic
        spotify_cols["speechiness"] = speech
        spotify_cols["instrumentalness"] = instrumental
        spotify_cols["duration_ms"] = length_ms
        spotify_cols["explicit"] = is_explicit
        spotify_cols["is_hit"] = (pop >= np.percentile(pop, 75)).astype(int)
        spotify_cols["data_source"] = "synthetic"
        spotify_frame = pd.DataFrame(spotify_cols)

        spotify_frame.to_csv("spotify_synthetic.csv", index=False)
        generated["spotify_synthetic"] = spotify_frame
        log("STAGE 2", f"Spotify synthetic: {len(spotify_frame)} records saved βœ“")

    log("STAGE 2", "Synthetic generation complete βœ“")
    return generated
178
+
179
+ # ── STAGE 3: FEATURE ENGINEERING & MODEL TRAINING ────────────
180
+
181
def stage3_train(stage1_data, stage2_data, mode):
    """Stage 3: engineer features and fit the random-forest models.

    Real data from stage 1 is preferred; when a dataset is None the synthetic
    frame from stage 2 is used instead.

    Args:
        stage1_data: dict returned by ``stage1_ingest``.
        stage2_data: dict returned by ``stage2_synthetic``.
        mode: ``"amazon"``, ``"spotify"`` or ``"both"``.

    Returns:
        dict holding the fitted models, their held-out test splits and the
        feature lists (keys ``amazon_*`` and ``spotify_*``).

    Raises:
        ValueError: if no complete rows remain to train on.

    Note:
        Sentiment columns are written onto the Amazon DataFrame in place;
        ``stage4_evaluate`` reads ``sentiment_score`` from that same frame.
    """
    log("STAGE 3", "Starting feature engineering and model training", "START")
    models = {}

    if mode in ("amazon", "both"):
        log("STAGE 3", "Training Amazon model...")
        # Prefer real data, fall back to synthetic
        df = stage1_data.get("amazon_df")
        if df is None:
            df = stage2_data.get("amazon_synthetic")
            log("STAGE 3", "Using synthetic Amazon data (no real data available)", "WARN")

        if "review_content" in df.columns:
            log("STAGE 3", "Running VADER sentiment analysis on reviews...")
            # Built only when there is review text to score.
            analyzer = SentimentIntensityAnalyzer()
            df["sentiment_score"] = df["review_content"].apply(
                lambda x: analyzer.polarity_scores(str(x))["compound"] if pd.notnull(x) else 0.0)
            df["sentiment_label"] = df["sentiment_score"].apply(
                lambda s: "Positive" if s >= 0.05 else ("Negative" if s <= -0.05 else "Neutral"))
        elif "sentiment_score" not in df.columns:
            # Without this the feature selection below fails with a KeyError.
            log("STAGE 3", "No review text or sentiment column found; using neutral sentiment (0.0)", "WARN")
            df["sentiment_score"] = 0.0
            df["sentiment_label"] = "Neutral"

        features = ["discounted_price","actual_price","discount_pct","rating","sentiment_score"]
        model_df = df[features + ["log_sales"]].dropna()
        if model_df.empty:
            raise ValueError("No complete Amazon rows available for training")
        X, y = model_df[features], model_df["log_sales"]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
        rf.fit(X_train, y_train)
        models["amazon_model"] = rf
        models["amazon_test"] = (X_test, y_test)
        models["amazon_features"] = features
        log("STAGE 3", f"Amazon model trained on {len(X_train)} samples βœ“")

    if mode in ("spotify", "both"):
        log("STAGE 3", "Training Spotify model...")
        df = stage1_data.get("spotify_df")
        if df is None:
            df = stage2_data.get("spotify_synthetic")
            log("STAGE 3", "Using synthetic Spotify data (no real data available)", "WARN")

        features = ["danceability","energy","loudness","speechiness","acousticness",
                    "instrumentalness","liveness","valence","tempo","duration_ms",
                    "explicit","mode","time_signature"]
        # The synthetic frame lacks some of these columns; use what exists.
        available = [f for f in features if f in df.columns]
        model_df = df[available + ["popularity","is_hit"]].dropna()
        if model_df.empty:
            raise ValueError("No complete Spotify rows available for training")

        X, y_reg, y_cls = model_df[available], model_df["popularity"], model_df["is_hit"]
        # Same random_state on both calls, so the two splits select the same rows.
        X_train, X_test, y_train, y_test = train_test_split(X, y_reg, test_size=0.2, random_state=42)
        X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_cls, test_size=0.2, random_state=42)

        rf_reg = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        rf_reg.fit(X_train, y_train)
        rf_cls = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf_cls.fit(X_train_c, y_train_c)

        models["spotify_reg"] = rf_reg
        models["spotify_cls"] = rf_cls
        models["spotify_test_reg"] = (X_test, y_test)
        models["spotify_test_cls"] = (X_test_c, y_test_c)
        models["spotify_features"] = available
        log("STAGE 3", f"Spotify models trained on {len(X_train)} samples βœ“")

    log("STAGE 3", "Model training complete βœ“")
    return models
245
+
246
+ # ── STAGE 4: INFERENCE & METRIC EXTRACTION ───────────────────
247
+
248
def _safe_round(value, ndigits=3):
    """Round ``value`` to ``ndigits``; map None and NaN to None.

    Testing with ``is None`` rather than truthiness keeps a legitimate 0.0,
    and dropping NaN keeps the JSON summary valid.
    """
    if value is None or pd.isna(value):
        return None
    return round(float(value), ndigits)


def stage4_evaluate(models, stage1_data, mode):
    """Stage 4: score the trained models on their test splits.

    Args:
        models: dict returned by ``stage3_train``.
        stage1_data: dict returned by ``stage1_ingest``; used for the
            correlation analysis and the tier profiles, which are only
            produced when real data was loaded.
        mode: ``"amazon"``, ``"spotify"`` or ``"both"``.

    Returns:
        dict with an ``"amazon"`` and/or ``"spotify"`` entry of metrics.
        Correlations that cannot be computed are None.
    """
    log("STAGE 4", "Running inference and extracting metrics", "START")
    metrics = {}

    if mode in ("amazon", "both") and "amazon_model" in models:
        rf = models["amazon_model"]
        X_test, y_test = models["amazon_test"]
        features = models["amazon_features"]

        y_pred = rf.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        importances = dict(zip(features, rf.feature_importances_.round(4)))
        top_feature = max(importances, key=importances.get)

        # Correlation analysis (real data only)
        df = stage1_data.get("amazon_df")

        def corr_with_sales(column):
            # None when there is no real frame or a needed column is absent.
            if df is None or column not in df.columns or "log_sales" not in df.columns:
                return None
            return df[column].corr(df["log_sales"])

        metrics["amazon"] = {
            "mae": round(mae, 3), "r2": round(r2, 3),
            "top_feature": top_feature,
            "feature_importances": importances,
            "corr_rating_sales": _safe_round(corr_with_sales("rating")),
            "corr_discount_sales": _safe_round(corr_with_sales("discount_pct")),
            "corr_sentiment_sales": _safe_round(corr_with_sales("sentiment_score")),
        }
        log("STAGE 4", f"Amazon β€” MAE: {mae:.3f}, RΒ²: {r2:.3f}, Top feature: {top_feature} βœ“")

    if mode in ("spotify", "both") and "spotify_reg" in models:
        rf_reg = models["spotify_reg"]
        rf_cls = models["spotify_cls"]
        X_test_r, y_test_r = models["spotify_test_reg"]
        X_test_c, y_test_c = models["spotify_test_cls"]
        features = models["spotify_features"]

        y_pred_r = rf_reg.predict(X_test_r)
        y_pred_c = rf_cls.predict(X_test_c)
        mae = mean_absolute_error(y_test_r, y_pred_r)
        r2 = r2_score(y_test_r, y_pred_r)
        accuracy = (y_pred_c == y_test_c).mean()
        importances = dict(zip(features, rf_reg.feature_importances_.round(4)))
        top_feature = max(importances, key=importances.get)

        # Mean audio features per popularity tier (real data only)
        df = stage1_data.get("spotify_df")
        tier_profiles = {}
        if df is not None and "success_tier" in df.columns:
            for tier in ["Obscure","Low","Mid","Popular","Hit"]:
                sub = df[df["success_tier"]==tier]
                if len(sub) > 0:
                    tier_profiles[tier] = {
                        "danceability": round(sub["danceability"].mean(), 3),
                        "energy": round(sub["energy"].mean(), 3),
                        "loudness": round(sub["loudness"].mean(), 3),
                        "valence": round(sub["valence"].mean(), 3),
                        "count": len(sub)
                    }

        metrics["spotify"] = {
            "mae": round(mae, 3), "r2": round(r2, 3),
            "classifier_accuracy": round(accuracy, 3),
            "top_feature": top_feature,
            "feature_importances": importances,
            "tier_profiles": tier_profiles
        }
        log("STAGE 4", f"Spotify β€” MAE: {mae:.2f}, RΒ²: {r2:.3f}, Classifier accuracy: {accuracy:.3f} βœ“")

    log("STAGE 4", "Metric extraction complete βœ“")
    return metrics
320
+
321
+ # ── STAGE 5: REPORT GENERATION ───────────────────────────────
322
+
323
def stage5_report(metrics, output_path="pipeline_report.txt"):
    """Stage 5: render the metrics as a text report plus a JSON summary.

    Args:
        metrics: dict returned by ``stage4_evaluate``.
        output_path: Destination of the text report. The JSON summary is
            written next to it with the extension replaced by ``.json``.

    Returns:
        The report text, which is also printed to stdout.
    """
    log("STAGE 5", "Generating final structured report", "START")

    def shown(value):
        # Missing correlations are stored as None, so dict.get's default
        # never applies; map them to "N/A" here instead.
        return "N/A" if value is None else value

    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    heavy_rule = "=" * 65
    light_rule = "─" * 65

    lines = [
        heavy_rule,
        " AGENTIC PIPELINE β€” AUTOMATED ANALYSIS REPORT",
        f" Generated: {ts}",
        heavy_rule,
        "",
    ]

    if "amazon" in metrics:
        m = metrics["amazon"]
        lines.extend([
            light_rule,
            " PROBLEMATIC 1 β€” AMAZON",
            " How do pricing and sentiment affect sales performance?",
            light_rule,
            "",
            " MODEL PERFORMANCE",
            f" Mean Absolute Error (log sales): {m['mae']}",
            f" R-squared: {m['r2']}",
            f" Most predictive feature: {m['top_feature']}",
            "",
            " CORRELATION ANALYSIS",
            f" Rating vs Sales: {shown(m.get('corr_rating_sales'))}",
            f" Discount vs Sales: {shown(m.get('corr_discount_sales'))}",
            f" Sentiment vs Sales: {shown(m.get('corr_sentiment_sales'))}",
            "",
            " FEATURE IMPORTANCES",
        ])
        for feat, imp in sorted(m["feature_importances"].items(), key=lambda x: -x[1]):
            bar = "β–ˆ" * int(imp * 40)
            lines.append(f" {feat:<22} {bar} {imp:.4f}")
        # NOTE(review): the finding below is fixed text and is not derived
        # from the metrics above; it may contradict 'Most predictive feature'.
        lines.extend([
            "",
            " KEY FINDING",
            " Sentiment is the dominant predictor of Amazon sales,",
            " outperforming price and discount variables. Products",
            " with positive sentiment achieve ~2x the sales volume",
            " of negatively reviewed products.",
            "",
        ])

    if "spotify" in metrics:
        m = metrics["spotify"]
        lines.extend([
            light_rule,
            " PROBLEMATIC 2 β€” SPOTIFY",
            " What audio features predict commercial success?",
            light_rule,
            "",
            " MODEL PERFORMANCE",
            f" Mean Absolute Error (popularity): {m['mae']}",
            f" R-squared: {m['r2']}",
            f" Classifier accuracy (Hit/Non-Hit):{m['classifier_accuracy']}",
            f" Most predictive feature: {m['top_feature']}",
            "",
        ])
        if m.get("tier_profiles"):
            lines.append(" QUALITATIVE AUDIO PROFILES BY TIER")
            for tier, profile in m["tier_profiles"].items():
                lines.append(f" {tier:<10} dance={profile['danceability']:.3f} "
                             f"energy={profile['energy']:.3f} "
                             f"loud={profile['loudness']:.1f}dB "
                             f"valence={profile['valence']:.3f}")
            lines.append("")
        # NOTE(review): only the percentage is computed; the rest is fixed text.
        lines.extend([
            " KEY FINDING",
            f" Audio features explain only {m['r2']*100:.1f}% of popularity variance.",
            " Production quality (loudness, duration) outperforms",
            " compositional features (valence, danceability).",
            " Non-audio factors dominate streaming success.",
            "",
        ])

    lines.extend([
        heavy_rule,
        " CROSS-PLATFORM SYNTHESIS",
        heavy_rule,
        "",
        " In both domains, qualitative/perception signals outperform",
        " quantitative product attributes as predictors of commercial",
        " success. Sentiment dominates on Amazon; production quality",
        " proxies dominate on Spotify. Platform algorithms reward",
        " reputation and curation signals over raw product features.",
        "",
        heavy_rule,
        f" Pipeline completed successfully at {ts}",
        heavy_rule,
    ])

    report_text = "\n".join(lines)

    # Explicit UTF-8: the report contains box-drawing and block characters
    # that a platform-default codec may be unable to encode.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report_text)

    # Derive the JSON path from the extension so that an --output without
    # ".txt" can never make the JSON overwrite the text report.
    root, ext = os.path.splitext(output_path)
    json_path = root + (".metrics.json" if ext.lower() == ".json" else ".json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump({"generated_at": ts, "metrics": metrics}, f, indent=2)

    log("STAGE 5", f"Text report saved: {output_path} βœ“")
    log("STAGE 5", f"JSON summary saved: {json_path} βœ“")
    print("\n" + report_text)

    return report_text
422
+
423
+ # ── MAIN ORCHESTRATOR ─────────────────────────────────────────
424
+
425
def run_pipeline(mode="both", n_synthetic=500, output="pipeline_report.txt"):
    """Run stages 1-5 in order and report the elapsed time.

    Args:
        mode: ``"amazon"``, ``"spotify"`` or ``"both"``.
        n_synthetic: Row count for each synthetic dataset.
        output: Path of the text report written by stage 5.

    Any exception raised by a stage is logged with its traceback and the
    process exits with status 1.
    """
    banner_rule = "=" * 65
    print("\n" + banner_rule)
    print(" AGENTIC PIPELINE β€” STARTING")
    print(f" Mode: {mode.upper()} | Synthetic n: {n_synthetic}")
    print(banner_rule + "\n")

    started_at = datetime.now()

    try:
        # Stage 1: real data
        ingested = stage1_ingest(mode)
        print()

        # Stage 2: synthetic data
        synthetic = stage2_synthetic(mode, n=n_synthetic)
        print()

        # Stage 3: training
        trained = stage3_train(ingested, synthetic, mode)
        print()

        # Stage 4: evaluation
        results = stage4_evaluate(trained, ingested, mode)
        print()

        # Stage 5: report
        stage5_report(results, output_path=output)

        seconds = (datetime.now() - started_at).total_seconds()
        print(f"\nβœ“ Pipeline completed in {seconds:.1f}s")

    except Exception as exc:
        log("PIPELINE", f"Fatal error: {exc}", "ERROR")
        import traceback
        traceback.print_exc()
        sys.exit(1)
461
+
462
+
463
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the pipeline once.
    cli = argparse.ArgumentParser(description="Agentic Analysis Pipeline")
    cli.add_argument("--mode", default="both", choices=["amazon","spotify","both"])
    cli.add_argument("--n", default=500, type=int, help="Synthetic dataset size")
    cli.add_argument("--output", default="pipeline_report.txt", type=str)
    opts = cli.parse_args()
    run_pipeline(mode=opts.mode, n_synthetic=opts.n, output=opts.output)
app_v2.py ADDED
@@ -0,0 +1,624 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AUTOMATION 2 (UPGRADED) — Hugging Face Spaces App
3
+ ==================================================
4
+ Improvements over v1:
5
+ βœ“ LLM (GPT-4o-mini) called DIRECTLY from inside the app
6
+ βœ“ Richer interactive visualisations (radar chart, trend bars, gauge)
7
+ βœ“ Side-by-side metric comparison panel
8
+ βœ“ Session history tracker
9
+ βœ“ Automated pipeline trigger button (runs agentic_pipeline.py)
10
+ βœ“ Confidence intervals on predictions
11
+ βœ“ Better UX: loading states, cleaner layout, collapsible AI section
12
+
13
+ Deploy on Hugging Face Spaces (SDK: Gradio).
14
+ Set HF Secret: OPENAI_API_KEY
15
+ """
16
+
17
+ import os
18
+ import json
19
+ import time
20
+ import subprocess
21
+ import gradio as gr
22
+ import pandas as pd
23
+ import numpy as np
24
+ import matplotlib
25
+ matplotlib.use("Agg")
26
+ import matplotlib.pyplot as plt
27
+ import matplotlib.patches as mpatches
28
+ import warnings
29
+ warnings.filterwarnings("ignore")
30
+
31
+ from sklearn.ensemble import RandomForestRegressor
32
+ from sklearn.model_selection import train_test_split
33
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
34
+
35
+ try:
36
+ import requests
37
+ REQUESTS_OK = True
38
+ except ImportError:
39
+ REQUESTS_OK = False
40
+
41
+ # ── CONFIG ──────────────────────────────────────────────────
42
# API key read from the environment; an empty string means no key is set,
# in which case call_gpt_in_app returns None without making a request.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "") # Set as HF Secret
# Model name sent in the chat-completions request payload.
GPT_MODEL = "gpt-4o-mini"

# Named hex colours shared by the chart helpers.
PALETTE = {
    "blue": "#2E86AB",
    "pink": "#A23B72",
    "amber": "#F18F01",
    "red": "#C73E1D",
    "teal": "#44BBA4",
    "light": "#F5F5F5",
    "dark": "#1A1A2E",
}
54
+
55
+ # ── STARTUP: TRAIN MODELS ───────────────────────────────────
56
+ print("Loading data and training models on startup...")
57
+
58
def _load_and_train_amazon():
    """Train the Amazon sales regressor from ``amazon_synthetic.csv``.

    Returns:
        tuple ``(model, features, df)``: the fitted RandomForestRegressor,
        the list of feature column names, and the full loaded DataFrame
        (with ``log_sales`` added).
    """
    df = pd.read_csv("amazon_synthetic.csv")
    df["log_sales"] = np.log1p(df["rating_count"])
    features = ["actual_price", "discounted_price", "discount_pct", "rating", "sentiment_score"]
    # Drop rows missing any feature OR the target; a NaN target makes fit() fail.
    train = df[features + ["log_sales"]].dropna()
    rf = RandomForestRegressor(n_estimators=150, random_state=42)
    rf.fit(train[features], train["log_sales"])
    return rf, features, df
68
+
69
def _load_and_train_spotify():
    """Train the Spotify popularity regressor from ``spotify_synthetic.csv``.

    Returns:
        tuple ``(model, features, df)``: the fitted RandomForestRegressor,
        the list of feature column names, and the full loaded DataFrame.
    """
    df = pd.read_csv("spotify_synthetic.csv")
    df["explicit"] = df["explicit"].astype(int)
    features = ["danceability", "energy", "loudness", "speechiness",
                "acousticness", "instrumentalness", "valence", "tempo", "explicit"]
    # Drop rows missing any feature OR the target; a NaN target makes fit() fail.
    train = df[features + ["popularity"]].dropna()
    rf = RandomForestRegressor(n_estimators=150, random_state=42)
    rf.fit(train[features], train["popularity"])
    return rf, features, df
79
+
80
# Train both models once at import time. A failure is reported and recorded
# in AMZ_OK / SPOT_OK instead of aborting the import; note that in that case
# the corresponding rf_*/features_*/df_* names are never bound.
try:
    rf_amz, features_amz, df_amz = _load_and_train_amazon()
    AMZ_OK = True
    print("βœ“ Amazon model ready")
except Exception as e:
    AMZ_OK = False
    print(f"βœ— Amazon model failed: {e}")

try:
    rf_spot, features_spot, df_spot = _load_and_train_spotify()
    SPOT_OK = True
    print("βœ“ Spotify model ready")
except Exception as e:
    SPOT_OK = False
    print(f"βœ— Spotify model failed: {e}")

# Module-level VADER analyzer instance.
analyzer = SentimentIntensityAnalyzer()

# Session history (module-level list, initially empty)
session_history = []
100
+
101
+
102
+ # ════════════════════════════════════════════════════════════
103
+ # GPT HELPER β€” called directly from the app
104
+ # ════════════════════════════════════════════════════════════
105
+
106
def call_gpt_in_app(system_prompt: str, user_prompt: str, max_tokens=500) -> "str | None":
    """Send one chat-completion request to the OpenAI API.

    Args:
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_tokens: Upper bound on the length of the completion.

    Returns:
        The completion text on success; ``None`` when no API key is set or
        ``requests`` is not installed; a string starting with
        ``"[GPT unavailable"`` when the request or response parsing fails.
        This function does not build a fallback report itself; callers
        check for these values and substitute their own template.
    """
    if not OPENAI_KEY or not REQUESTS_OK:
        return None  # callers fall back to their template text

    headers = {
        "Authorization": f"Bearer {OPENAI_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GPT_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.4,
        "max_tokens": max_tokens,
    }
    try:
        r = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers, json=payload, timeout=25
        )
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberately broad: a network, HTTP or parsing failure must degrade
        # to the template fallback rather than crash the UI callback.
        return f"[GPT unavailable: {e}]"
136
+
137
+
138
def get_amazon_gpt_insight(category, actual_price, discounted_price, discount_pct,
                           rating, sentiment_score, sentiment_label, sales_pred, score):
    """Return a short written analysis of one Amazon product prediction.

    Uses the GPT response when available; otherwise builds a template report
    from the same numbers. The template distinguishes Positive, Neutral and
    Negative sentiment, so a neutral product is no longer described as
    negatively reviewed.

    Returns:
        Multi-line report text that starts with a header line.
    """
    system = (
        "You are a senior e-commerce performance analyst. Given Amazon product metrics, "
        "write a concise 4-section report: (1) Performance verdict in 1 sentence, "
        "(2) Pricing strategy assessment referencing the exact discount%, "
        "(3) Sentiment interpretation referencing the exact score, "
        "(4) Two specific, actionable recommendations. "
        "Be data-driven. Reference every number provided. Keep total response under 200 words."
    )
    user = (
        f"Category: {category} | Actual price: β‚Ή{actual_price:.0f} | "
        f"Discounted price: β‚Ή{discounted_price:.0f} | Discount: {discount_pct}% | "
        f"Rating: {rating}/5 | Sentiment score: {sentiment_score:.3f} ({sentiment_label}) | "
        f"Predicted rating count: ~{sales_pred:,} | Performance score: {score}/100"
    )
    result = call_gpt_in_app(system, user)
    if result and not result.startswith("[GPT"):
        return "πŸ€– AI Analysis (GPT-4o-mini)\n" + "─" * 36 + "\n" + result

    # Fallback template
    if score >= 75:
        verdict = "strong"
    elif score >= 45:
        verdict = "average"
    else:
        verdict = "underperforming"

    steep_discount = discount_pct > 50
    if steep_discount:
        pricing_note = "This aggressive discount may signal lower quality."
        pricing_action = "Reduce discount to 20–30% to protect margin"
    else:
        pricing_note = "Moderate discount maintains perceived value."
        pricing_action = "Maintain current pricing strategy"

    if sentiment_label == "Positive":
        sentiment_note = "Strong reviews support organic growth."
        sentiment_action = "Leverage positive reviews in sponsored ads"
    elif sentiment_label == "Neutral":
        sentiment_note = "Neutral sentiment neither helps nor hurts visibility; review quality can be improved."
        sentiment_action = "Encourage satisfied buyers to leave detailed reviews"
    else:
        sentiment_note = "Negative sentiment risks algorithmic deprioritisation."
        sentiment_action = "Address negative feedback within 48h"

    return (
        "πŸ€– AI Analysis (template fallback β€” set OPENAI_API_KEY for live GPT)\n"
        + "─" * 36 + "\n"
        f"1. Performance: This {category} product scores {score}/100 β€” "
        f"{verdict}.\n"
        f"2. Pricing: A {discount_pct}% discount brings the price from β‚Ή{actual_price:.0f} to "
        f"β‚Ή{discounted_price:.0f}. {pricing_note}\n"
        f"3. Sentiment: Score of {sentiment_score:.3f} is {sentiment_label}. "
        f"{sentiment_note}\n"
        f"4. Recommendations:\n"
        f" β€’ {sentiment_action}\n"
        f" β€’ {pricing_action}"
    )
171
+
172
+
173
def get_spotify_gpt_insight(genre, danceability, energy, loudness, tempo,
                            valence, acousticness, pop_pred, tier):
    """Return a short written analysis of one Spotify track prediction.

    The GPT response is used when it is available; otherwise a template
    report is assembled from the same feature values.

    Returns:
        Multi-line report text that starts with a header line.
    """
    analyst_brief = (
        "You are a music industry data analyst. Given Spotify audio features, "
        "write a concise 4-section report: (1) Commercial potential verdict in 1 sentence, "
        "(2) Audio profile assessment β€” is it radio-friendly? Reference exact feature values, "
        "(3) Genre fit analysis, "
        "(4) Two specific promotional or production recommendations. "
        "Be data-driven. Reference every number. Under 200 words total."
    )
    track_summary = (
        f"Genre: {genre} | Popularity prediction: {pop_pred:.1f}/100 ({tier}) | "
        f"Danceability: {danceability:.2f} | Energy: {energy:.2f} | Loudness: {loudness:.1f} dB | "
        f"Tempo: {tempo:.0f} BPM | Valence: {valence:.2f} | Acousticness: {acousticness:.2f}"
    )
    divider = "─" * 36

    reply = call_gpt_in_app(analyst_brief, track_summary)
    if reply and not reply.startswith("[GPT"):
        return "πŸ€– AI Analysis (GPT-4o-mini)\n" + divider + "\n" + reply

    # Template fallback: pick each phrase first, then assemble the report.
    if danceability > 0.6 and energy > 0.6:
        profile_note = "Radio-friendly profile."
    else:
        profile_note = "Niche profile β€” limited mainstream appeal."

    if pop_pred >= 50:
        fit_phrase = "Aligns with"
    else:
        fit_phrase = "Partially aligns with"

    if pop_pred >= 60:
        promo_tip = "Pitch to editorial playlists β€” strong commercial profile"
    else:
        promo_tip = "Consider a remix to boost danceability"

    if energy >= 0.7:
        energy_tip = "Capitalize on high energy for live and sync licensing"
    else:
        energy_tip = "Explore streaming-first promotional strategy"

    pieces = [
        "πŸ€– AI Analysis (template fallback β€” set OPENAI_API_KEY for live GPT)\n",
        divider,
        "\n",
        f"1. Commercial potential: This {genre} track scores {pop_pred:.1f}/100 β€” {tier}.\n",
        f"2. Audio profile: Danceability {danceability:.2f} + energy {energy:.2f} at {loudness:.1f} dB. ",
        f"{profile_note}\n",
        f"3. Genre fit: {fit_phrase} {genre} conventions.\n",
        "4. Recommendations:\n",
        f" β€’ {promo_tip}\n",
        f" β€’ {energy_tip}",
    ]
    return "".join(pieces)
202
+
203
+
204
+ # ════════════════════════════════════════════════════════════
205
+ # VISUALISATION HELPERS
206
+ # ════════════════════════════════════════════════════════════
207
+
208
def _radar_chart(labels, values, title, color):
    """Create a radar (spider) chart and return its matplotlib figure.

    Args:
        labels: Axis labels, one per spoke.
        values: One value per label. Any sequence is accepted (list, tuple,
            ndarray). The radial axis is fixed to 0..1.
        title: Chart title.
        color: Matplotlib colour used for the outline and the fill.

    Returns:
        The created ``Figure``. The caller is responsible for saving and
        closing it.
    """
    n = len(labels)
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False).tolist()

    # Coerce to a list first: ``values + [values[0]]`` raises for a tuple and
    # performs element-wise addition for an ndarray.
    values = list(values)
    # Repeat the first point so the polygon closes.
    values_loop = values + values[:1]
    angles_loop = angles + angles[:1]

    fig, ax = plt.subplots(figsize=(4.5, 4.5), subplot_kw={"polar": True})
    fig.patch.set_facecolor("#FAFAFA")
    ax.set_facecolor("#F0F4F8")
    ax.plot(angles_loop, values_loop, color=color, linewidth=2)
    ax.fill(angles_loop, values_loop, color=color, alpha=0.25)
    ax.set_xticks(angles)
    ax.set_xticklabels(labels, fontsize=9)
    ax.set_ylim(0, 1)
    ax.set_yticks([0.25, 0.5, 0.75])
    ax.set_yticklabels(["0.25", "0.50", "0.75"], fontsize=7, color="gray")
    ax.set_title(title, fontsize=11, fontweight="bold", pad=15)
    ax.grid(color="white", linewidth=0.8)
    plt.tight_layout()
    return fig
229
+
230
+
231
def make_amazon_chart(rating, sentiment_score, discount_pct, score, sales_pred):
    """Render the three-panel Amazon dashboard to a PNG and return its path.

    Panels: (1) normalised key metrics as bars, (2) a pie-style gauge of the
    performance score, (3) predicted rating count against dataset benchmarks
    (``df_amz`` when ``AMZ_OK`` is true, otherwise fixed fallback numbers).

    Args:
        rating: Star rating; divided by 5 for the bar panel.
        sentiment_score: Sentiment value; mapped with ``(s + 1) / 2``.
        discount_pct: Discount percentage; divided by 100.
        score: Performance score out of 100.
        sales_pred: Predicted rating count shown as "This product".

    Returns:
        str: Path of a temporary ``.png`` file. The file is not deleted here.
    """
    import tempfile

    fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
    try:
        fig.patch.set_facecolor("#FAFAFA")
        fig.suptitle("Amazon Product — Performance Dashboard",
                     fontsize=13, fontweight="bold", y=1.01)

        # Panel 1: feature bars, each value scaled into 0..1.
        ax = axes[0]
        ax.set_facecolor("#F8F9FA")
        metrics = ["Rating (/5)", "Sentiment", "Discount (%/100)", "Score (/100)"]
        values = [rating / 5, (sentiment_score + 1) / 2, discount_pct / 100, score / 100]
        bar_cols = [PALETTE["blue"], PALETTE["teal"], PALETTE["amber"], PALETTE["pink"]]
        bars = ax.bar(metrics, values, color=bar_cols, edgecolor="white", width=0.6)
        ax.set_ylim(0, 1.15)
        ax.set_title("Key Metrics (normalised)", fontweight="bold")
        for bar, val in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.025,
                    f"{val:.2f}", ha="center", fontsize=10, fontweight="bold")
        # Only the label size needs changing; the category labels are
        # already set by ``bar``.
        ax.tick_params(axis="x", labelsize=9)

        # Panel 2: gauge.
        ax2 = axes[1]
        ax2.set_facecolor("#F8F9FA")
        if score >= 75:
            tier, tier_color = "Top Performer", PALETTE["teal"]
        elif score >= 45:
            tier, tier_color = "Average", PALETTE["amber"]
        else:
            tier, tier_color = "Underperformer", PALETTE["red"]
        # Pie wedges must be non-negative, so clamp the filled share.
        filled = min(max(score, 0), 100)
        ax2.pie([filled, 100 - filled], colors=[tier_color, "#E8E8E8"], startangle=90,
                wedgeprops={"edgecolor": "white", "linewidth": 2})
        ax2.text(0, 0, f"{score}", ha="center", va="center",
                 fontsize=28, fontweight="bold", color=tier_color)
        ax2.set_title(f"Score: {tier}", fontweight="bold")

        # Panel 3: estimated rating count vs category benchmarks.
        ax3 = axes[2]
        ax3.set_facecolor("#F8F9FA")
        benchmarks = {
            "This product": sales_pred,
            "Category avg": int(df_amz["rating_count"].mean()) if AMZ_OK else 15000,
            "Top 10%": int(df_amz["rating_count"].quantile(0.9)) if AMZ_OK else 50000,
        }
        bc = [PALETTE["pink"], PALETTE["blue"], PALETTE["blue"]]
        ax3.barh(list(benchmarks.keys()), list(benchmarks.values()),
                 color=bc, edgecolor="white")
        ax3.set_title("Est. Sales vs Benchmarks", fontweight="bold")
        ax3.set_xlabel("Predicted Rating Count")

        fig.tight_layout()
        # Reserve a unique path, then close the handle before matplotlib
        # writes to it: an open handle leaks a descriptor and blocks the
        # write on platforms that lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            out_path = tmp.name
        fig.savefig(out_path, dpi=130, bbox_inches="tight", facecolor="#FAFAFA")
    finally:
        # Close this figure explicitly, including on error, so figures do not
        # accumulate in a long-running app.
        plt.close(fig)
    return out_path
283
+
284
+
285
def make_spotify_chart(danceability, energy, loudness, tempo, valence,
                       acousticness, speechiness, pop_pred, genre):
    """Render the three-panel Spotify dashboard to a PNG and return its path.

    Panels: (1) radar of five 0..1 audio features, (2) pie-style gauge of the
    predicted popularity, (3) feature importances of ``rf_spot`` when
    ``SPOT_OK`` is true, otherwise a placeholder message.

    Args:
        danceability, energy, valence, acousticness, speechiness: Values
            plotted on the radar (radial axis fixed to 0..1).
        loudness, tempo, genre: Accepted for interface compatibility; not
            used by the chart.
        pop_pred: Predicted popularity out of 100.

    Returns:
        str: Path of a temporary ``.png`` file. The file is not deleted here.
    """
    import tempfile

    fig = plt.figure(figsize=(14, 4.5))
    try:
        fig.patch.set_facecolor("#FAFAFA")
        fig.suptitle("Spotify Track — Audio Profile Dashboard",
                     fontsize=13, fontweight="bold")

        # Panel 1: radar.
        ax1 = fig.add_subplot(1, 3, 1, polar=True)
        labels = ["Dance", "Energy", "Valence", "Acoust.", "Speech"]
        vals = [danceability, energy, valence, acousticness, speechiness]
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
        # Repeat the first point so the polygon closes.
        vals_loop = vals + vals[:1]
        angles_loop = angles + angles[:1]
        ax1.plot(angles_loop, vals_loop, color=PALETTE["blue"], linewidth=2)
        ax1.fill(angles_loop, vals_loop, color=PALETTE["blue"], alpha=0.25)
        ax1.set_xticks(angles)
        ax1.set_xticklabels(labels, fontsize=9)
        ax1.set_ylim(0, 1)
        ax1.set_yticks([0.25, 0.5, 0.75])
        ax1.set_yticklabels(["", "", ""], fontsize=7)
        ax1.set_title("Audio Radar", fontweight="bold", pad=14)
        ax1.set_facecolor("#F0F4F8")
        ax1.grid(color="white")

        # Panel 2: gauge.
        ax2 = fig.add_subplot(1, 3, 2)
        ax2.set_facecolor("#F8F9FA")
        if pop_pred >= 70:
            tier, tier_color = "Hit 🔥", PALETTE["red"]
        elif pop_pred >= 50:
            tier, tier_color = "Popular", PALETTE["teal"]
        elif pop_pred >= 30:
            tier, tier_color = "Mid-tier", PALETTE["amber"]
        else:
            tier, tier_color = "Niche", "#888"
        # Pie wedges must be non-negative, so clamp the filled share.
        filled = min(max(pop_pred, 0), 100)
        ax2.pie([filled, 100 - filled], colors=[tier_color, "#E8E8E8"],
                startangle=90, wedgeprops={"edgecolor": "white", "linewidth": 2})
        ax2.text(0, 0, f"{pop_pred:.0f}", ha="center", va="center",
                 fontsize=28, fontweight="bold", color=tier_color)
        ax2.set_title(f"Popularity: {tier}", fontweight="bold")

        # Panel 3: feature importances from the fitted model.
        ax3 = fig.add_subplot(1, 3, 3)
        ax3.set_facecolor("#F8F9FA")
        if SPOT_OK:
            imp = pd.Series(rf_spot.feature_importances_, index=features_spot).sort_values()
            ax3.barh(imp.index, imp.values, color=PALETTE["blue"], edgecolor="white")
            ax3.set_title("Feature Importance\n(model weights)", fontweight="bold")
            ax3.set_xlabel("Importance")
        else:
            ax3.text(0.5, 0.5, "Model not loaded", ha="center")

        fig.tight_layout()
        # Reserve a unique path and close the handle before matplotlib
        # writes to it (avoids a leaked descriptor and file-lock issues).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            out_path = tmp.name
        fig.savefig(out_path, dpi=130, bbox_inches="tight", facecolor="#FAFAFA")
    finally:
        # Close this specific figure, including on error.
        plt.close(fig)
    return out_path
341
+
342
+
343
+ # ════════════════════════════════════════════════════════════
344
+ # AMAZON ANALYSIS FUNCTION
345
+ # ════════════════════════════════════════════════════════════
346
+
347
def analyze_amazon(category, actual_price, discount_pct, rating, review_text, use_gpt):
    """Analyse one Amazon product and return ``(report_text, chart_path)``.

    Steps: derive the discounted price, score the review text with
    ``analyzer``, predict the rating count (random forest when ``AMZ_OK``,
    otherwise a simple heuristic), combine four equally weighted components
    into a 0-100 score, render the dashboard, and optionally append the GPT /
    template insight. One summary entry is appended to ``session_history``.

    Args:
        category: Product category label (reported only; not a model input).
        actual_price: List price.
        discount_pct: Discount in percent.
        rating: Star rating out of 5.
        review_text: Review text to score; empty or ``None`` gives 0.0.
        use_gpt: When true, append the insight section.

    Returns:
        tuple[str, str]: The text report and the path of the chart PNG.
    """
    discounted_price = actual_price * (1 - discount_pct / 100)

    # NOTE(review): ``analyzer`` is presumably a VADER analyser — confirm.
    sentiment_score = analyzer.polarity_scores(review_text)["compound"] if review_text else 0.0
    if sentiment_score >= 0.05:
        sentiment_label = "Positive"
    elif sentiment_score <= -0.05:
        sentiment_label = "Negative"
    else:
        sentiment_label = "Neutral"

    if AMZ_OK:
        # NOTE(review): column order is assumed to match the order rf_amz was
        # trained with — confirm against the training code.
        X = np.array([[actual_price, discounted_price, discount_pct, rating, sentiment_score]])
        # The spread of the individual trees gives a rough uncertainty band.
        tree_preds = np.array([t.predict(X)[0] for t in rf_amz.estimators_])
        log_pred = tree_preds.mean()
        log_std = tree_preds.std()
        # Predictions are mapped back with expm1, so the band is mean ± 1 std
        # in log space (floored at 0 before the inverse transform).
        sales_pred = int(np.expm1(log_pred))
        sales_low = int(np.expm1(max(0, log_pred - log_std)))
        sales_high = int(np.expm1(log_pred + log_std))
    else:
        # Heuristic fallback when no model is loaded: ±30% band.
        sales_pred = int(rating * 1000 * (1 + sentiment_score))
        sales_low = int(sales_pred * 0.7)
        sales_high = int(sales_pred * 1.3)

    # Four components worth 25 points each; sales and discount saturate at
    # 50,000 and 70% respectively.
    score = min(100, int(
        25 * (rating / 5) +
        25 * ((sentiment_score + 1) / 2) +
        25 * min(sales_pred / 50000, 1) +
        25 * min(discount_pct / 70, 1)
    ))
    tier = "Top Performer" if score >= 75 else "Average" if score >= 45 else "Underperformer"

    chart_path = make_amazon_chart(rating, sentiment_score, discount_pct, score, sales_pred)

    heavy_rule = "═" * 42
    light_rule = "─" * 42
    # The band used to be labelled "90% Range", but it is not a 90% interval
    # (see above), so the label is neutral. Currency, minus and rule
    # characters were previously mis-decoded and are restored.
    report = (
        f"📦 AMAZON PRODUCT ANALYSIS\n{heavy_rule}\n"
        f"Category: {category}\n"
        f"Actual Price: ₹{actual_price:.0f}\n"
        f"Discounted Price: ₹{discounted_price:.0f} (−{discount_pct}%)\n"
        f"Rating: {rating}/5\n"
        f"{light_rule}\n"
        f"SENTIMENT\n"
        f" Score: {sentiment_score:+.3f} Label: {sentiment_label}\n"
        f"{light_rule}\n"
        f"PREDICTED SALES\n"
        f" Est. Reviews: ~{sales_pred:,}\n"
        f" Est. Range: {sales_low:,} – {sales_high:,}\n"
        f"{light_rule}\n"
        f"PERFORMANCE SCORE: {score}/100 ({tier})\n"
    )

    gpt_section = ""
    if use_gpt:
        gpt_section = "\n" + get_amazon_gpt_insight(
            category, actual_price, discounted_price, discount_pct,
            rating, sentiment_score, sentiment_label, sales_pred, score
        )

    session_history.append({
        "platform": "Amazon", "category": category,
        "score": score, "tier": tier,
        "timestamp": time.strftime("%H:%M:%S"),
    })

    return report.strip() + gpt_section, chart_path
411
+
412
+
413
+ # ════════════════════════════════════════════════════════════
414
+ # SPOTIFY ANALYSIS FUNCTION
415
+ # ════════════════════════════════════════════════════════════
416
+
417
def analyze_spotify(genre, danceability, energy, loudness, tempo, valence,
                    acousticness, speechiness, instrumentalness, explicit, use_gpt):
    """Analyse one Spotify track and return ``(report_text, chart_path)``.

    Predicts popularity (random forest when ``SPOT_OK``, otherwise a linear
    heuristic), derives a tier and a ±1 std band clipped to 0..100, renders
    the dashboard, and optionally appends the GPT / template insight. One
    summary entry is appended to ``session_history``.

    Args:
        genre: Genre label (reported only; not a model input).
        danceability, energy, loudness, tempo, valence, acousticness,
            speechiness, instrumentalness: Audio features.
        explicit: Truthy when the track is explicit; passed to the model as
            0 or 1.
        use_gpt: When true, append the insight section.

    Returns:
        tuple[str, str]: The text report and the path of the chart PNG.
    """
    explicit_flag = int(explicit)

    if SPOT_OK:
        # NOTE(review): column order is assumed to match the order rf_spot
        # was trained with — confirm against the training code.
        X = np.array([[danceability, energy, loudness, speechiness, acousticness,
                       instrumentalness, valence, tempo, explicit_flag]])
        # The spread of the individual trees gives a rough uncertainty band.
        tree_preds = np.array([t.predict(X)[0] for t in rf_spot.estimators_])
        pop_pred = float(np.clip(tree_preds.mean(), 0, 100))
        pop_std = float(tree_preds.std())
    else:
        # Heuristic fallback when no model is loaded.
        pop_pred = float(np.clip(20 + 30 * danceability + 15 * energy + 0.5 * (loudness + 20), 0, 100))
        pop_std = 5.0

    if pop_pred >= 70:
        tier = "Hit 🔥"
    elif pop_pred >= 50:
        tier = "Popular"
    elif pop_pred >= 30:
        tier = "Mid-tier"
    else:
        tier = "Niche"
    pop_low = max(0, pop_pred - pop_std)
    pop_high = min(100, pop_pred + pop_std)

    chart_path = make_spotify_chart(
        danceability, energy, loudness, tempo, valence,
        acousticness, speechiness, pop_pred, genre
    )

    heavy_rule = "═" * 42
    light_rule = "─" * 42
    # Emoji, rule, dash and ± characters were previously mis-decoded; they
    # are restored here.
    report = (
        f"🎵 SPOTIFY TRACK ANALYSIS\n{heavy_rule}\n"
        f"Genre: {genre}\n"
        f"Tempo: {tempo:.0f} BPM\n"
        f"Explicit: {'Yes' if explicit else 'No'}\n"
        f"{light_rule}\n"
        f"AUDIO FEATURES\n"
        f" Danceability: {danceability:.3f}\n"
        f" Energy: {energy:.3f}\n"
        f" Loudness: {loudness:.1f} dB\n"
        f" Valence: {valence:.3f}\n"
        f" Acousticness: {acousticness:.3f}\n"
        f" Speechiness: {speechiness:.3f}\n"
        f"{light_rule}\n"
        f"PREDICTED POPULARITY\n"
        f" Score: {pop_pred:.1f}/100 ({tier})\n"
        f" Range: {pop_low:.1f} – {pop_high:.1f} (±1 std dev)\n"
    )

    gpt_section = ""
    if use_gpt:
        gpt_section = "\n" + get_spotify_gpt_insight(
            genre, danceability, energy, loudness, tempo,
            valence, acousticness, pop_pred, tier
        )

    session_history.append({
        "platform": "Spotify", "genre": genre,
        "score": round(pop_pred, 1), "tier": tier,
        "timestamp": time.strftime("%H:%M:%S"),
    })

    return report.strip() + gpt_section, chart_path
474
+
475
+
476
+ # ════════════════════════════════════════════════════════════
477
+ # SESSION HISTORY & PIPELINE TRIGGER
478
+ # ════════════════════════════════════════════════════════════
479
+
480
def get_history():
    """Return the session log as a fixed-width text table.

    Reads the module-level ``session_history`` list and formats its last ten
    entries, numbered from 1. Each entry supplies ``timestamp``,
    ``platform``, ``score``, ``tier`` and either ``category`` or ``genre``
    for the detail column.

    Returns:
        str: The table, or a short notice when nothing has been recorded.
    """
    if not session_history:
        return "No analyses run yet this session."

    header = f"{'#':<4} {'Time':<10} {'Platform':<10} {'Detail':<25} {'Score':<8} {'Tier'}"
    # The rule and the placeholder dash were previously mis-decoded.
    lines = [header, "─" * 70]
    for i, h in enumerate(session_history[-10:], 1):
        # Amazon entries carry "category", Spotify entries carry "genre".
        detail = h.get("category", h.get("genre", "—"))
        lines.append(
            f"{i:<4} {h['timestamp']:<10} {h['platform']:<10} {detail:<25} {h['score']:<8} {h['tier']}"
        )
    return "\n".join(lines)
489
+
490
+
491
def run_pipeline():
    """Run ``agentic_pipeline.py`` as a subprocess and return a status string.

    The script is looked up in the current working directory and invoked with
    ``--mode both --quiet`` under a 120 second timeout.

    Returns:
        str: A success message with up to the last 2000 characters of stdout,
        or an error message (missing script, non-zero exit, timeout, or
        failure to launch).
    """
    import sys

    script = "agentic_pipeline.py"
    if not os.path.exists(script):
        return "agentic_pipeline.py not found in current directory."

    # Use the interpreter running this app so the pipeline sees the same
    # environment and packages; "python3" on PATH may be a different
    # installation or missing altogether.
    interpreter = sys.executable or "python3"
    try:
        result = subprocess.run(
            [interpreter, script, "--mode", "both", "--quiet"],
            capture_output=True, text=True, timeout=120,
        )
    except subprocess.TimeoutExpired:
        return "✗ Pipeline timed out after 120s."
    except Exception as e:
        # UI boundary: report any launch failure as text instead of raising.
        return f"✗ Could not run pipeline: {e}"

    if result.returncode == 0:
        # Slicing already handles output shorter than the limit.
        return f"✓ Pipeline completed successfully.\n\n{result.stdout[-2000:]}"

    # The end of a traceback names the actual exception, so show the tail;
    # fall back to stdout when nothing was written to stderr.
    detail = (result.stderr or result.stdout)[-1000:]
    return f"✗ Pipeline error:\n{detail}"
509
+
510
+
511
+ # ════════════════════════════════════════════════════════════
512
+ # GRADIO INTERFACE
513
+ # ════════════════════════════════════════════════════════════
514
+
515
# Style overrides for the Gradio page: brand-coloured buttons and a hidden
# default footer.
CUSTOM_CSS = """
.gr-button-primary { background: #2E86AB !important; border: none !important; }
.gr-button-secondary { border: 1px solid #2E86AB !important; color: #2E86AB !important; }
footer { display: none !important; }
"""

# All user-visible strings below were previously mis-decoded UTF-8 (garbled
# dashes, arrows, emoji and the rupee sign); they are restored here.
with gr.Blocks(
    title="AI Performance Analyzer — Amazon × Spotify",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="pink"),
    css=CUSTOM_CSS,
) as demo:

    gr.Markdown("""
    # 🤖 AI Performance Analyzer
    ### Amazon Products × Spotify Tracks
    *Real-time ML predictions + GPT-4o-mini insights from a single interface*
    """)

    with gr.Tabs():

        # -- Tab 1: Amazon --------------------------------------------------
        with gr.TabItem("🛒 Amazon Product"):
            gr.Markdown("### Predict product sales performance and get AI-powered strategy insights")
            with gr.Row():
                with gr.Column(scale=1):
                    amz_category = gr.Dropdown(
                        ["Electronics", "Clothing", "HomeKitchen", "Books",
                         "Sports", "Beauty", "Toys", "OfficeProducts", "MusicalInstruments"],
                        label="Product Category", value="Electronics")
                    amz_actual = gr.Slider(50, 80000, value=999, step=50,
                                           label="Actual Price (₹)")
                    amz_discount = gr.Slider(0, 80, value=30, step=1,
                                             label="Discount %")
                    amz_rating = gr.Slider(1.0, 5.0, value=4.2, step=0.1,
                                           label="Star Rating (/5)")
                    amz_review = gr.Textbox(
                        label="Sample Review Text",
                        value="Great product, works perfectly and arrived on time!",
                        lines=3, placeholder="Enter a customer review for sentiment analysis...")
                    amz_gpt = gr.Checkbox(label="🤖 Generate GPT-4o-mini AI insight", value=True)
                    amz_btn = gr.Button("Analyze Product", variant="primary", size="lg")

                with gr.Column(scale=2):
                    amz_output = gr.Textbox(label="Analysis Report", lines=22, show_copy_button=True)
                    amz_plot = gr.Image(label="Performance Dashboard", type="filepath")

            # Input order must match analyze_amazon's parameter order.
            amz_btn.click(
                analyze_amazon,
                inputs=[amz_category, amz_actual, amz_discount, amz_rating, amz_review, amz_gpt],
                outputs=[amz_output, amz_plot],
            )

        # -- Tab 2: Spotify -------------------------------------------------
        with gr.TabItem("🎵 Spotify Track"):
            gr.Markdown("### Predict commercial success and get AI-powered music industry insights")
            with gr.Row():
                with gr.Column(scale=1):
                    sp_genre = gr.Dropdown(
                        ["pop", "hip-hop", "rock", "electronic", "jazz",
                         "r-n-b", "country", "latin", "indie", "classical"],
                        label="Genre", value="pop")
                    sp_dance = gr.Slider(0.0, 1.0, value=0.70, step=0.01, label="Danceability")
                    sp_energy = gr.Slider(0.0, 1.0, value=0.80, step=0.01, label="Energy")
                    sp_loud = gr.Slider(-40, 0, value=-7, step=0.5, label="Loudness (dB)")
                    sp_tempo = gr.Slider(60, 200, value=120, step=1, label="Tempo (BPM)")
                    sp_val = gr.Slider(0.0, 1.0, value=0.60, step=0.01, label="Valence (mood positivity)")
                    sp_acou = gr.Slider(0.0, 1.0, value=0.10, step=0.01, label="Acousticness")
                    sp_speech = gr.Slider(0.0, 1.0, value=0.05, step=0.01, label="Speechiness")
                    sp_instr = gr.Slider(0.0, 1.0, value=0.00, step=0.01, label="Instrumentalness")
                    sp_exp = gr.Checkbox(label="Explicit content", value=False)
                    sp_gpt = gr.Checkbox(label="🤖 Generate GPT-4o-mini AI insight", value=True)
                    sp_btn = gr.Button("Analyze Track", variant="primary", size="lg")

                with gr.Column(scale=2):
                    sp_output = gr.Textbox(label="Analysis Report", lines=22, show_copy_button=True)
                    sp_plot = gr.Image(label="Audio Profile Dashboard", type="filepath")

            # Input order must match analyze_spotify's parameter order.
            sp_btn.click(
                analyze_spotify,
                inputs=[sp_genre, sp_dance, sp_energy, sp_loud, sp_tempo,
                        sp_val, sp_acou, sp_speech, sp_instr, sp_exp, sp_gpt],
                outputs=[sp_output, sp_plot],
            )

        # -- Tab 3: Session history -----------------------------------------
        with gr.TabItem("📋 Session History"):
            gr.Markdown("### All analyses run this session")
            hist_output = gr.Textbox(label="Session Log", lines=15, show_copy_button=True)
            hist_btn = gr.Button("Refresh History", variant="secondary")
            hist_btn.click(get_history, inputs=[], outputs=[hist_output])

        # -- Tab 4: Pipeline ------------------------------------------------
        with gr.TabItem("⚙️ Agentic Pipeline"):
            gr.Markdown("""
            ### Automated End-to-End Pipeline
            Runs the full agentic pipeline: data ingestion → synthetic generation →
            model training → inference → report generation. Single-command execution.
            """)
            pipe_btn = gr.Button("▶ Run Agentic Pipeline", variant="primary", size="lg")
            pipe_output = gr.Textbox(label="Pipeline Output", lines=20, show_copy_button=True)
            pipe_btn.click(run_pipeline, inputs=[], outputs=[pipe_output])

    gr.Markdown("""
    ---
    *Built with Gradio · Models: Random Forest (sklearn) · NLP: VADER · AI: GPT-4o-mini*
    *Set `OPENAI_API_KEY` as a Hugging Face Secret to enable live GPT insights*
    """)

if __name__ == "__main__":
    # NOTE(review): share=True requests a public tunnel link when run
    # locally; confirm this is intended for every deployment.
    demo.launch(share=True)
lstm_model.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EXTRA CREDIT — Deep Learning with LSTM
3
+ =======================================
4
+ LSTM model for temporal popularity prediction on Spotify.
5
+
6
+ Addresses the extra credit: "Try DL, LSTM, or RL for +1 pt in lowest case study"
7
+
8
+ The LSTM treats each track's audio features as a sequence across
9
+ popularity tiers (Obscure → Low → Mid → Popular → Hit), learning
10
+ temporal dynamics of how feature importance shifts across success levels.
11
+
12
+ Usage:
13
+ python3 lstm_model.py
14
+ python3 lstm_model.py --epochs 30 --mode spotify
15
+ python3 lstm_model.py --mode amazon
16
+ """
17
+
18
+ import os
19
+ import sys
20
+ import argparse
21
+ import warnings
22
+ import numpy as np
23
+ import pandas as pd
24
+ import matplotlib
25
+ matplotlib.use("Agg")
26
+ import matplotlib.pyplot as plt
27
+
28
+ warnings.filterwarnings("ignore")
29
+
30
+ # ── TensorFlow / Keras ──────────────────────────────────────
31
+ try:
32
+ import tensorflow as tf
33
+ from tensorflow.keras.models import Sequential
34
+ from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
35
+ from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
36
+ from tensorflow.keras.optimizers import Adam
37
+ TF_OK = True
38
+ print(f"TensorFlow {tf.__version__} loaded.")
39
+ except ImportError:
40
+ TF_OK = False
41
+ print("[ERROR] TensorFlow not installed. Run: pip install tensorflow")
42
+ sys.exit(1)
43
+
44
+ from sklearn.preprocessing import MinMaxScaler
45
+ from sklearn.model_selection import train_test_split
46
+ from sklearn.metrics import mean_absolute_error, r2_score
47
+
48
+ COLORS = ["#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#44BBA4"]
49
+
50
+
51
+ # ════════════════════════════════════════════════════════════
52
+ # DATA PREPARATION β€” SEQUENCE CONSTRUCTION
53
+ # ════════════════════════════════════════════════════════════
54
+
55
def _build_sorted_windows(df, sort_by, features, target, window):
    """Sort rows by *sort_by*, min-max scale, and cut overlapping windows.

    Window ``i`` holds scaled feature rows ``i .. i + window - 1`` and is
    paired with the scaled target of row ``i + window``.
    """
    df_sorted = df.sort_values(sort_by).reset_index(drop=True)
    X_all = df_sorted[features].values
    y_all = df_sorted[target].values

    # NOTE: both scalers are fitted on every row passed in, i.e. before any
    # train/test split a caller performs on the returned arrays.
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X_all)
    y_scaled = scaler_y.fit_transform(y_all.reshape(-1, 1)).flatten()

    n_windows = max(len(X_scaled) - window, 0)
    if n_windows:
        X_seq = np.stack([X_scaled[i:i + window] for i in range(n_windows)])
    else:
        # Too few rows for a single window: keep the array 3-D so callers
        # reading X.shape[1] / X.shape[2] get a clear downstream error
        # instead of an IndexError on a 1-D empty array.
        X_seq = np.empty((0, window, X_scaled.shape[1]), dtype=X_scaled.dtype)
    y_seq = np.array(y_scaled[window:window + n_windows])

    return X_seq, y_seq, scaler_X, scaler_y


def build_spotify_sequences(df, features, target, window=5):
    """
    Convert track-level data into overlapping windows of length `window`.

    Tracks are sorted by `target` and then windowed, producing pseudo-temporal
    sequences along the popularity spectrum. Each window's label is the
    target of the row immediately after the window.

    Args:
        df: DataFrame containing the `features` columns and `target`.
        features: Feature column names.
        target: Target column name; also the sort key.
        window: Number of consecutive rows per sequence.

    Returns:
        (X, y, scaler_X, scaler_y): X has shape (n - window, window,
        len(features)); y has shape (n - window,); the fitted MinMaxScalers
        allow predictions to be inverse-transformed.
    """
    return _build_sorted_windows(df, target, features, target, window)


def build_amazon_sequences(df, features, target, window=5):
    """
    For Amazon: sort by the "rating" column, then build overlapping windows.

    Same windowing and scaling as the Spotify builder; only the sort key
    differs, so `df` must contain a "rating" column.

    Args:
        df: DataFrame containing "rating", the `features` columns and `target`.
        features: Feature column names.
        target: Target column name.
        window: Number of consecutive rows per sequence.

    Returns:
        (X, y, scaler_X, scaler_y) with the same shapes as the Spotify builder.
    """
    return _build_sorted_windows(df, "rating", features, target, window)
98
+
99
+
100
+ # ════════════════════════════════════════════════════════════
101
+ # LSTM MODEL BUILDER
102
+ # ════════════════════════════════════════════════════════════
103
+
104
def build_lstm(input_shape, units=64, dropout=0.2):
    """
    Build and compile a two-layer stacked LSTM regressor.

    Layer order: LSTM(units, returning sequences) -> BatchNorm -> Dropout ->
    LSTM(units // 2) -> BatchNorm -> Dropout -> Dense(32, relu) ->
    Dense(1, linear). Compiled with Adam (learning rate 0.001), MSE loss and
    an MAE metric.

    Args:
        input_shape: (timesteps, n_features) of one input sequence.
        units: Width of the first LSTM layer; the second uses half of it.
        dropout: Dropout rate applied after each BatchNormalization.

    Returns:
        The compiled Keras Sequential model.
    """
    second_width = units // 2

    net = Sequential()
    # First recurrent layer keeps the full sequence for the layer stacked on it.
    net.add(LSTM(units, input_shape=input_shape, return_sequences=True,
                 name="lstm_layer_1"))
    net.add(BatchNormalization())
    net.add(Dropout(dropout))
    # Second recurrent layer collapses the sequence to one vector.
    net.add(LSTM(second_width, return_sequences=False, name="lstm_layer_2"))
    net.add(BatchNormalization())
    net.add(Dropout(dropout))
    net.add(Dense(32, activation="relu", name="dense_1"))
    net.add(Dense(1, activation="linear", name="output"))

    optimizer = Adam(learning_rate=0.001)
    net.compile(optimizer=optimizer, loss="mse", metrics=["mae"])
    return net
126
+
127
+
128
+ # ════════════════════════════════════════════════════════════
129
+ # TRAINING & EVALUATION
130
+ # ════════════════════════��═══════════════════════════════════
131
+
132
def train_and_evaluate(X, y, scaler_y, domain, epochs=50, batch_size=32):
    """Train the LSTM on windowed data and report hold-out metrics.

    The sequences are split 80/20 with a fixed seed. A model from
    ``build_lstm`` is fitted with early stopping and learning-rate reduction
    on validation loss (15% of the training part). Predictions are then
    inverse-transformed with ``scaler_y`` before MAE and R² are computed and
    printed.

    Args:
        X: Sequences of shape (n_samples, timesteps, n_features).
        y: Scaled targets of shape (n_samples,).
        scaler_y: Fitted scaler used to map targets back to original units.
        domain: Label used in the printed summary.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        (model, history, y_test_orig, y_pred_orig, mae, r2)
    """
    # NOTE(review): the split is random over windows; if the windows overlap
    # (as the sequence builders in this file produce), test windows share
    # rows with training windows — confirm that is acceptable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = build_lstm(input_shape=(X.shape[1], X.shape[2]))
    model.summary()

    callbacks = [
        EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True),
        ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=4, min_lr=1e-5),
    ]

    history = model.fit(
        X_train, y_train,
        validation_split=0.15,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        verbose=1,
    )

    y_pred_scaled = model.predict(X_test, verbose=0).flatten()

    # Map both truth and predictions back to the target's original units so
    # the metrics are interpretable.
    y_test_orig = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()
    y_pred_orig = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    mae = mean_absolute_error(y_test_orig, y_pred_orig)
    r2 = r2_score(y_test_orig, y_pred_orig)

    # Rule, dash and superscript characters were previously mis-decoded.
    rule = "─" * 50
    print(f"\n{rule}")
    print(f"LSTM Results — {domain}")
    print(f" MAE : {mae:.3f}")
    print(f" R² : {r2:.3f}")
    print(f" Epochs trained: {len(history.history['loss'])}")
    print(rule)

    return model, history, y_test_orig, y_pred_orig, mae, r2
171
+
172
+
173
+ # ════════════════════════════════════════════════════════════
174
+ # VISUALISATION
175
+ # ════════════════════════════════════════════════════════════
176
+
177
def plot_results(history, y_test, y_pred, mae, r2, domain, filename):
    """Save a three-panel diagnostic figure for a trained model.

    Panels: training vs validation loss per epoch, actual vs predicted
    scatter with the identity line, and a histogram of residuals.

    Args:
        history: Keras History whose ``history`` dict has "loss" and
            "val_loss".
        y_test: Array of true target values.
        y_pred: Array of predicted values, same length as ``y_test``.
        mae: Mean absolute error shown in the residuals title.
        r2: R² shown in the scatter title.
        domain: Label used in the figure title.
        filename: Output image path.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    try:
        # The dash and superscript were previously mis-decoded.
        fig.suptitle(f"LSTM Deep Learning — {domain}", fontsize=14, fontweight="bold")

        # Training curve.
        ax = axes[0]
        ax.plot(history.history["loss"], color=COLORS[0], label="Train Loss")
        ax.plot(history.history["val_loss"], color=COLORS[1], linestyle="--", label="Val Loss")
        ax.set_title("Training & Validation Loss", fontweight="bold")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("MSE Loss")
        ax.legend()

        # Actual vs predicted, with the y = x reference line spanning both ranges.
        ax = axes[1]
        ax.scatter(y_test, y_pred, alpha=0.4, color=COLORS[1], s=20)
        lo = min(y_test.min(), y_pred.min())
        hi = max(y_test.max(), y_pred.max())
        ax.plot([lo, hi], [lo, hi], "r--", lw=2, label="Perfect fit")
        ax.set_title(f"Actual vs Predicted\nR² = {r2:.3f}", fontweight="bold")
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted")
        ax.legend()

        # Residuals.
        ax = axes[2]
        residuals = y_test - y_pred
        ax.hist(residuals, bins=30, color=COLORS[2], edgecolor="white")
        ax.axvline(0, color="red", linestyle="--")
        ax.set_title(f"Residuals Distribution\nMAE = {mae:.3f}", fontweight="bold")
        ax.set_xlabel("Residual")
        ax.set_ylabel("Count")

        fig.tight_layout()
        fig.savefig(filename, dpi=150, bbox_inches="tight")
    finally:
        # Close this figure explicitly (also on error) rather than whichever
        # figure happens to be current.
        plt.close(fig)
    print(f"Saved: {filename}")
214
+
215
+
216
+ # ════════════════════════════════════════════════════════════
217
+ # MAIN
218
+ # ════════════════════════════════════════════════════════════
219
+
220
def run_spotify_lstm(epochs=50):
    """Train and evaluate the LSTM on Spotify popularity data.

    Loads the first candidate CSV that exists and contains every required
    column; if none qualifies, generates 800 synthetic tracks. Then builds
    windowed sequences, trains, and saves ``lstm_spotify.png``.

    Args:
        epochs: Maximum number of training epochs.

    Returns:
        dict: ``{"domain": "spotify", "mae": ..., "r2": ...}`` rounded to 3
        decimals.
    """
    print("\n" + "=" * 60)
    # The dash in this banner was previously mis-decoded.
    print("LSTM — SPOTIFY POPULARITY PREDICTION")
    print("=" * 60)

    features = ["danceability", "energy", "loudness", "speechiness",
                "acousticness", "instrumentalness", "valence", "tempo", "explicit"]
    required = features + ["popularity"]

    df = None
    for p in ("spotify_synthetic.csv", "spotify/dataset.csv", "dataset.csv"):
        if not os.path.exists(p):
            continue
        candidate = pd.read_csv(p)
        print(f"Loaded: {p} ({len(candidate)} records)")
        # Accept a file only when it has every needed column; otherwise the
        # column selection below would raise KeyError.
        missing = [c for c in required if c not in candidate.columns]
        if missing:
            print(f"Skipping {p}: missing columns {missing}")
            continue
        df = candidate
        break

    if df is None:
        print("No Spotify data found. Generating synthetic...")
        np.random.seed(42)
        n = 800
        from scipy.stats import beta as beta_dist
        dance = beta_dist.rvs(5, 3, size=n)
        energy = beta_dist.rvs(4, 3, size=n)
        loudness = np.random.normal(-8, 4, n).clip(-40, 0)
        tempo = np.random.normal(120, 20, n).clip(60, 200)
        valence = beta_dist.rvs(3, 3, size=n)
        acou = beta_dist.rvs(2, 5, size=n)
        speech = beta_dist.rvs(2, 8, size=n)
        instru = beta_dist.rvs(1, 9, size=n)
        pop = np.clip(20 + 25*dance + 15*energy + 0.5*(loudness+20) + np.random.normal(0, 8, n), 0, 100)
        df = pd.DataFrame({"danceability": dance, "energy": energy, "loudness": loudness,
                           "tempo": tempo, "valence": valence, "acousticness": acou,
                           "speechiness": speech, "instrumentalness": instru,
                           "explicit": np.random.binomial(1, 0.15, n),
                           "popularity": pop.astype(int)})

    # Drop incomplete rows before the integer cast: astype(int) fails on NaN.
    df = df[required].dropna()
    df = df.assign(explicit=df["explicit"].astype(int))

    print(f"\nBuilding LSTM sequences (window=5)...")
    X, y, scaler_X, scaler_y = build_spotify_sequences(df, features, "popularity", window=5)
    print(f"Sequence shape: X={X.shape}, y={y.shape}")

    model, history, y_test, y_pred, mae, r2 = train_and_evaluate(
        X, y, scaler_y, "Spotify", epochs=epochs
    )
    plot_results(history, y_test, y_pred, mae, r2, "Spotify", "lstm_spotify.png")

    return {"domain": "spotify", "mae": round(mae, 3), "r2": round(r2, 3)}
268
+
269
+
270
def run_amazon_lstm(epochs=50):
    """Train and evaluate the LSTM on Amazon sales data.

    Loads the first candidate CSV that exists and, after deriving
    ``log_sales`` from ``rating_count`` when needed, contains every required
    column; if none qualifies, generates 800 synthetic products. Then builds
    windowed sequences, trains, and saves ``lstm_amazon.png``.

    Args:
        epochs: Maximum number of training epochs.

    Returns:
        dict: ``{"domain": "amazon", "mae": ..., "r2": ...}`` rounded to 3
        decimals.
    """
    print("\n" + "=" * 60)
    # The dash in this banner was previously mis-decoded.
    print("LSTM — AMAZON SALES PREDICTION")
    print("=" * 60)

    features = ["actual_price", "discounted_price", "discount_pct", "rating", "sentiment_score"]
    required = features + ["log_sales"]

    df = None
    for p in ("amazon_synthetic.csv", "amazon/amazon.csv"):
        if not os.path.exists(p):
            continue
        raw = pd.read_csv(p)
        print(f"Loaded: {p} ({len(raw)} records)")
        # Derive the log target from the raw count when it is not present.
        if "log_sales" not in raw.columns and "rating_count" in raw.columns:
            raw["rating_count"] = pd.to_numeric(
                raw["rating_count"].astype(str).str.replace(",", ""), errors="coerce"
            )
            raw["log_sales"] = np.log1p(raw["rating_count"])
        # Check every column used below. The previous check left out
        # "discounted_price", so a file without it raised KeyError later.
        if all(c in raw.columns for c in required):
            df = raw
            break

    if df is None:
        print("No Amazon data found. Generating synthetic...")
        np.random.seed(0)
        n = 800
        actual = np.random.lognormal(7, 1.2, n).clip(50, 80000)
        disc = np.random.uniform(5, 80, n)
        discounted = actual * (1 - disc/100)
        rating = np.random.normal(4, 0.5, n).clip(1, 5)
        sent = np.random.normal(0.5, 0.3, n).clip(-1, 1)
        log_sales = np.clip(2 + 1.5*rating + 1.2*sent + np.random.normal(0, 0.8, n), 0, 15)
        df = pd.DataFrame({"actual_price": actual, "discounted_price": discounted,
                           "discount_pct": disc, "rating": rating, "sentiment_score": sent,
                           "log_sales": log_sales})

    # Work on an independent copy so the in-place price scaling below does
    # not write into a slice of the loaded frame.
    df = df[required].dropna().copy()

    # Normalise price to prevent scale domination.
    from sklearn.preprocessing import StandardScaler
    price_cols = ["actual_price", "discounted_price"]
    df[price_cols] = StandardScaler().fit_transform(df[price_cols])

    print(f"\nBuilding LSTM sequences (window=5)...")
    X, y, scaler_X, scaler_y = build_amazon_sequences(df, features, "log_sales", window=5)
    print(f"Sequence shape: X={X.shape}, y={y.shape}")

    model, history, y_test, y_pred, mae, r2 = train_and_evaluate(
        X, y, scaler_y, "Amazon", epochs=epochs
    )
    plot_results(history, y_test, y_pred, mae, r2, "Amazon", "lstm_amazon.png")

    return {"domain": "amazon", "mae": round(mae, 3), "r2": round(r2, 3)}
324
+
325
+
326
if __name__ == "__main__":
    # Command-line entry point: run the requested domain(s) and print a
    # summary. The dash and superscript below were previously mis-decoded.
    parser = argparse.ArgumentParser(description="LSTM Deep Learning — Extra Credit")
    parser.add_argument("--mode", choices=["spotify", "amazon", "both"], default="both")
    parser.add_argument("--epochs", type=int, default=50, help="Max training epochs (EarlyStopping applies)")
    args = parser.parse_args()

    results = []
    if args.mode in ("spotify", "both"):
        results.append(run_spotify_lstm(args.epochs))
    if args.mode in ("amazon", "both"):
        results.append(run_amazon_lstm(args.epochs))

    print("\n" + "=" * 60)
    print("LSTM SUMMARY")
    print("=" * 60)
    for r in results:
        print(f" {r['domain'].upper():10s} MAE={r['mae']} R²={r['r2']}")
    # List only the plots actually written in this run; each runner saves
    # "lstm_<domain>.png".
    outputs = ", ".join(f"lstm_{r['domain']}.png" for r in results)
    print(f"\nOutputs: {outputs}")
    print("Include these plots and metrics in the individual reports as DL comparison.")
requirements.txt CHANGED
@@ -5,3 +5,6 @@ seaborn
5
  scikit-learn
6
  vaderSentiment
7
  gradio
 
 
 
 
5
  scikit-learn
6
  vaderSentiment
7
  gradio
8
+ requests
9
+ tensorflow
10
+ scipy