{ "nbformat": 4, "nbformat_minor": 5, "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.0" }, "colab": { "provenance": [] } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "9YD-vrG5i6Pe" }, "source": [ "# 🎵 **Data Analysis & Visualization**\n", "### Topic: *\"To what extent does an artist's digital popularity (social media reach) predict their monthly listener count compared to their physical presence (number of concerts and tour frequency)?\"*" ], "id": "9YD-vrG5i6Pe" }, { "cell_type": "markdown", "metadata": { "id": "rYSsyJMti6Pi" }, "source": [ "## **1.** 📦 Install required packages" ], "id": "rYSsyJMti6Pi" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "TFQzk9Jgi6Pk" }, "outputs": [], "source": [ "!pip install pandas matplotlib seaborn numpy vaderSentiment statsmodels scikit-learn" ], "id": "TFQzk9Jgi6Pk" }, { "cell_type": "markdown", "metadata": { "id": "B0YFRzeDi6Po" }, "source": [ "## **2.** 📊 Generate & Load all datasets\n", "\n", "This section generates all data directly so no CSV upload is needed." 
def generate_social_tier(s):
    """Turn a claimed-sales figure into a noisy social-media tier (1..5).

    The base tier is fixed by descending sales floors (>=300 -> 5,
    >=200 -> 4, >=100 -> 3, >=50 -> 2, anything else -> 1). A single
    ``random.choices`` draw then nudges it by -1, 0 or +1 (weights 1:3:2)
    and the sum is clipped back into the 1..5 range.

    Args:
        s: Sales value compared against the floors above.

    Returns:
        int: Tier between 1 and 5 inclusive.
    """
    # (floor, tier) pairs, highest floor first; the first floor reached wins.
    sales_floors = ((300, 5), (200, 4), (100, 3), (50, 2))
    base_tier = next((tier for floor, tier in sales_floors if s >= floor), 1)
    jitter = random.choices([-1, 0, 1], weights=[1, 3, 2])[0]
    return int(np.clip(base_tier + jitter, 1, 5))


def get_popularity_label(tier):
    """Name the popularity band for a tier.

    Args:
        tier: Tier value; ``<= 2`` is "emerging", exactly ``3`` is
            "established", and everything else is "superstar".

    Returns:
        str: One of "emerging", "established" or "superstar".
    """
    if tier <= 2:
        return "emerging"
    return "established" if tier == 3 else "superstar"
def generate_concert_profile(popularity_label, periods=18, end=None):
    """Simulate a month-by-month touring record for one artist.

    A per-artist average concert count and touring probability are drawn
    once from ranges that depend on ``popularity_label``. For every month a
    coin flip against that probability decides whether the artist is on
    tour; touring months get a Poisson-distributed concert count, other
    months get zero.

    Args:
        popularity_label: "superstar" or "established"; any other value is
            treated like the lowest ("emerging") band.
        periods: Number of month-start periods to generate (default 18).
        end: Right edge of the month window, passed to ``pd.date_range``.
            ``None`` (default) uses ``datetime.today()``, which keeps the
            original behaviour; pass a fixed date for a run-date-independent
            window.

    Returns:
        list[tuple[str, int, int, int]]: One
        ``(month "YYYY-MM", concerts, on_tour 0/1, cities)`` tuple per month.
    """
    anchor = datetime.today() if end is None else end
    months = pd.date_range(end=anchor, periods=periods, freq="MS")
    if popularity_label == "superstar":
        avg_c = random.randint(6, 15); tf = random.uniform(0.6, 1.0)
    elif popularity_label == "established":
        avg_c = random.randint(3, 8); tf = random.uniform(0.3, 0.7)
    else:
        avg_c = random.randint(0, 4); tf = random.uniform(0.1, 0.4)
    records = []
    for m in months:
        on_tour = random.random() < tf
        # Poisson is sampled only in touring months (same draw order as before).
        concerts = int(np.random.poisson(avg_c)) if on_tour else 0
        # Drawn every month, even with zero concerts, so the random stream
        # consumed per call is unchanged.
        city_share = random.uniform(0.7, 1.0)
        # Truncation used to turn a single concert into 0 cities; any month
        # with at least one concert now has at least one city.
        cities = max(1, int(concerts * city_share)) if concerts > 0 else 0
        records.append((m.strftime("%Y-%m"), concerts, int(on_tour), cities))
    return records
def generate_listener_profile(popularity_label):
    """Build 18 months of synthetic monthly-listener counts for one artist.

    The series is a linear trend from a baseline to a randomly scaled end
    level (0.95x..1.15x), plus a sine-shaped seasonal term (5% of the
    baseline) and Gaussian noise (standard deviation 4% of the baseline),
    floored at zero and truncated to integers.

    Args:
        popularity_label: "superstar", "established" or "emerging". Any
            other value raises ``KeyError``.

    Returns:
        list[tuple]: ``(month "YYYY-MM", listeners)`` pairs, oldest first.
    """
    month_index = pd.date_range(end=datetime.today(), periods=18, freq="MS")
    n_months = len(month_index)

    # All three baselines are drawn on every call, in this order, regardless
    # of the requested label; the lookup happens only afterwards.
    superstar_level = random.randint(30_000_000, 80_000_000)
    established_level = random.randint(5_000_000, 25_000_000)
    emerging_level = random.randint(200_000, 4_000_000)
    level = {
        "superstar": superstar_level,
        "established": established_level,
        "emerging": emerging_level,
    }[popularity_label]

    final_level = level * random.uniform(0.95, 1.15)
    drift = np.linspace(level, final_level, n_months)
    wave = level * 0.05 * np.sin(np.linspace(0, 3 * np.pi, n_months))
    jitter = np.random.normal(0, level * 0.04, n_months)
    counts = np.clip(drift + wave + jitter, 0, None).astype(int)

    month_labels = month_index.strftime("%Y-%m")
    return [(month_labels[i], counts[i]) for i in range(n_months)]
Simply brilliant.', 'Their Spotify monthly listeners will only keep growing — incredibly talented.', 'Saw them live last week and it exceeded every expectation. Phenomenal!', 'The social media presence is spot-on — engaging, authentic, and creative.', 'Their tour this year is absolutely worth every penny. Go see them live!', 'Every release just confirms how gifted this artist truly is.', 'The production quality is outstanding. You can feel the passion in each song.', 'From the first note to the last, their show had me completely captivated.', 'Their Instagram reels are viral for a reason — pure talent on display.', 'I have never felt more connected to an artist. Their music speaks to me.', 'Flawless vocals, incredible stage presence, and an unforgettable setlist.', 'They are redefining the genre — fresh, innovative, and deeply moving.', 'I have recommended this artist to everyone I know. Pure gold.', 'Their streaming numbers reflect what fans already know: sheer excellence.', 'The world-tour announcement has me counting down the days already.', 'Honestly one of the best concerts I have ever attended in my life.', 'Their music has the rare quality of making you feel understood.', 'Every tour stop sells out immediately — that tells you everything.', 'The authenticity in their lyrics is what sets them apart from everyone.', 'A truly generational talent. 
Watch this artist take over the world.', 'Their TikTok content is as good as their music — totally engaging.', 'Sold-out arenas worldwide because the talent backs up every ticket.', 'This artist deserves every stream, every follow, every accolade.', 'Their discography is one of the strongest in modern music history.', 'Can not stop replaying their latest single — it is just that good.', 'Breathtaking stage design, incredible musicianship, and pure showmanship.', 'Their fan base is loyal because the artist genuinely deserves it.', 'Every album era brings something fresh — that is true artistry.', 'Their social engagement with fans is warm, funny, and incredibly genuine.', 'I drove three hours to see their show and it was absolutely worth it.', 'Their music got me through a rough year — forever grateful.', 'The vocal range and control in their live performances is jaw-dropping.', 'No one works harder and it shows in every single release.', 'Their tours are more than concerts — they are genuine experiences.', 'I play their playlist every morning. It just makes everything better.', 'Their growth on streaming platforms is well-deserved and well-earned.', 'Watched their documentary and fell in love with their story even more.', 'Every single, every feature, every collaboration is a standout moment.', 'The crowd at their concert was the most alive I have ever seen.', 'Their online presence perfectly captures who they are as an artist.', 'I cried three times during their set and I regret nothing.', 'They have mastered the art of connecting with millions while feeling personal.', 'The encore left the entire arena completely speechless.', 'Their music will still be relevant in twenty years — timeless.', 'An artist who genuinely improves with every release. 
Remarkable consistency.', 'This tour should be mandatory for every music lover on the planet.'], 'neutral': ['The concert was decent but not as impressive as I had hoped.', 'Their streaming numbers are strong though the latest album was just okay.', 'A solid performance — nothing groundbreaking but enjoyable enough.', 'I appreciate the social media content even if the music feels formulaic.', 'The tour was well-organised but lacked the spark of their earlier shows.', 'Good artist overall — just not quite my taste in music.', 'Their latest release has its moments but also some forgettable tracks.', 'I can see why they are popular even if I do not personally connect.', 'The concert production was impressive; the setlist felt a bit safe.', 'Solid streaming numbers but the new direction feels a little predictable.', 'An enjoyable listen but nothing that makes me want to replay immediately.', 'Their social media following is impressive; the content is hit or miss.', 'A competent live show with a few genuinely great moments.', 'I went expecting to be blown away and left feeling just satisfied.', 'Their older material is stronger but the new stuff is still fine.', 'The hype is somewhat justified though I expected more originality.', 'A reliable artist who delivers consistently without ever truly surprising.', 'Their tour frequency is admirable even if the shows feel routine.', 'Not bad by any means — just not exceptional either.', 'Their fanbase is very dedicated even if I am only a casual listener.', 'Some songs really land; others feel like filler on the tracklist.', 'I appreciate their work but the latest project left me underwhelmed.', 'The concert was professionally run but lacked emotional connection for me.', 'Their social media presence is active though the content feels managed.', 'A middle-of-the-road show — I had fun but was not moved.', 'I can understand the appeal without fully sharing the enthusiasm.', 'Technically very good. 
Artistically, I wanted more risk-taking.', 'The tour announcement was exciting; the actual show was just decent.', 'Their monthly listener count reflects a broad appeal I partly get.', 'Some parts of the set were genuinely exciting — others dragged.', 'Good enough for a casual listen but not something I seek out actively.', 'Their crossover appeal is clear even if the music feels a bit bland.', 'I followed them online hoping to connect more — it helped a little.', 'A perfectly fine performance that neither thrilled nor disappointed me.', 'Their genre is not really my thing but I can acknowledge the craft.', 'The venue was great; the artist was merely good.', 'I will keep streaming their popular tracks without seeking out deep cuts.', 'Their social engagement is high; the artistic depth is debatable.', 'An average concert experience elevated by strong production values.', 'I was present, I enjoyed it, and I probably will not rush back.', 'They deliver what fans expect — consistent if not especially daring.', 'A pleasant listen that fades quickly from memory.', 'Their image is carefully crafted; the music is serviceable.', 'The tour had its high points but the overall show felt overlong.', 'I follow their updates without feeling a strong pull to their music.', 'Competent pop that fits the moment without defining it.', 'A fine concert — not something I would call unmissable though.', 'Their growth makes sense commercially even if artistically I am unmoved.', 'I went with friends and we had an okay time — nothing more.', 'Their stats are impressive; my personal reaction is just neutral.'], 'negative': ['The concert was a major disappointment — the energy was completely flat.', 'Their social media hype far outpaces the actual quality of their music.', 'I left the show early; nothing about it was worth staying for.', 'Overrated in every possible way — the streaming numbers baffle me.', 'The setlist was lazy and the performance felt phoned in.', 'Their online presence is 
all marketing with very little substance behind it.', 'I expected so much more given the ticket price and the hype surrounding them.', 'The vocals were weak and the stage show had almost no energy.', 'Their popularity is driven by algorithms, not genuine talent.', 'I kept hoping the show would pick up — it never did.', 'The production was loud and flashy to compensate for a lack of artistry.', 'Their social media followers clearly do not reflect actual fan quality.', 'One of the most forgettable concerts I have attended in years.', 'The latest album is a step backward in every measurable way.', 'All image, no substance — their decline feels inevitable.', 'I felt nothing during the entire performance, which says it all.', 'Their tour is more about merchandise than actual music quality.', 'The songs sounded significantly better recorded than they did live.', 'Their streaming peak has passed and the new material explains why.', 'I paid a premium price for a deeply underwhelming experience.', 'The gap between their public image and live reality is staggering.', 'Nothing in the show felt authentic or heartfelt — purely transactional.', 'Their social media engagement tricks fans into overestimating their relevance.', 'A two-hour show that felt like three — in all the wrong ways.', 'Their audience deserves better than what they delivered on this tour.', 'The music has become so polished it has lost all emotional resonance.', 'I walked away feeling like I had wasted an evening and a lot of money.', 'Their live performances cannot match what studio production hides.', 'The hype machine is strong but the talent is clearly fading.', 'I cannot justify the streaming numbers after hearing the latest release.', 'A soulless performance from an artist running purely on past reputation.', 'Their concerts used to feel special — now they feel like an obligation.', 'The formula is tired and nothing about this tour felt fresh or exciting.', 'I unfollowed them on every platform after 
def get_review_sentiment(popularity_label):
    """Randomly pick a review sentiment, weighted by popularity band.

    Weights for (positive, neutral, negative) are 70/20/10 for "superstar",
    50/35/15 for "established" and 40/35/25 for every other label.

    Args:
        popularity_label: Popularity band of the artist being reviewed.

    Returns:
        str: "positive", "neutral" or "negative".
    """
    if popularity_label == "superstar":
        mix = [70, 20, 10]
    elif popularity_label == "established":
        mix = [50, 35, 15]
    else:
        mix = [40, 35, 25]
    outcomes = ["positive", "neutral", "negative"]
    return random.choices(outcomes, weights=mix)[0]
def quality_check(df, name="DataFrame"):
    """Print a short data-quality report for ``df`` and show its summary table.

    The report lists the shape, the column names, the per-column count of
    missing values and the number of fully duplicated rows, then renders
    ``df.describe(include='all')`` transposed via the notebook's ``display``.

    Args:
        df: pandas DataFrame to inspect.
        name: Label printed in the report header.

    Returns:
        None. All output goes to stdout and the notebook display.
    """
    # The header emoji was stored as mojibake ("šŸ”"); restored to the
    # intended magnifying-glass character.
    print(f"\n🔍 Quality Check: {name}")
    print(f" Shape: {df.shape}")
    print(f" Columns: {list(df.columns)}")
    print(f" Missing values:\n{df.isnull().sum().to_string()}")
    print(f" Duplicates: {df.duplicated().sum()}")
    # `display` is supplied by IPython/Jupyter, so it needs no import here.
    display(df.describe(include='all').transpose())
def get_sentiment_label(text):
    """Classify ``text`` as positive, negative or neutral.

    The input is converted with ``str()`` and scored by the module-level
    ``analyzer``; the label is chosen from the returned "compound" value.

    Args:
        text: Review text (any object; it is stringified before scoring).

    Returns:
        str: "positive" when compound >= 0.05, "negative" when
        compound <= -0.05, otherwise "neutral".
    """
    compound = analyzer.polarity_scores(str(text))["compound"]
    verdict = "neutral"
    if compound >= 0.05:
        verdict = "positive"
    elif compound <= -0.05:
        verdict = "negative"
    return verdict
Initial setup*" ], "id": "ekX6ZnM2i6P2" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ASH_eUBki6P2" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import matplotlib.dates as mdates\n", "from pathlib import Path\n", "\n", "ART_DIR = Path(\"artifacts\")\n", "PY_FIG = ART_DIR / \"py\" / \"figures\"\n", "PY_TAB = ART_DIR / \"py\" / \"tables\"\n", "for p in [PY_FIG, PY_TAB]:\n", " p.mkdir(parents=True, exist_ok=True)\n", "\n", "popularity_colors = {\"superstar\": \"royalblue\", \"established\": \"mediumseagreen\", \"emerging\": \"orangered\"}\n", "print(\"āœ… Output folders ready:\", PY_FIG.resolve())" ], "id": "ASH_eUBki6P2" }, { "cell_type": "markdown", "metadata": { "id": "z2q5ayKYi6P3" }, "source": [ "### *b. Sample 2 artists for each popularity tier*" ], "id": "z2q5ayKYi6P3" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mNWaHS8Ai6P3" }, "outputs": [], "source": [ "sampled_artists = []\n", "for label in [\"superstar\", \"established\", \"emerging\"]:\n", " pool = df_listeners[df_listeners[\"popularity_label\"] == label][\"artist_name\"].unique()\n", " sampled = random.sample(list(pool), min(2, len(pool)))\n", " sampled_artists.extend(sampled)\n", "print(\"Sampled artists:\", sampled_artists)" ], "id": "mNWaHS8Ai6P3" }, { "cell_type": "markdown", "metadata": { "id": "1fVc82oqi6P4" }, "source": [ "### *c. 
Copy relevant data for sampled artists*" ], "id": "1fVc82oqi6P4" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nIu_5tj8i6P4" }, "outputs": [], "source": [ "sampled_listeners = df_listeners[df_listeners[\"artist_name\"].isin(sampled_artists)].copy()\n", "sampled_social = df_social[df_social[\"artist_name\"].isin(sampled_artists)].copy()\n", "sampled_concerts = df_concerts[df_concerts[\"artist_name\"].isin(sampled_artists)].copy()\n", "sampled_reviews = df_reviews[df_reviews[\"artist_name\"].isin(sampled_artists)].copy()" ], "id": "nIu_5tj8i6P4" }, { "cell_type": "markdown", "metadata": { "id": "0A7TglN2i6P4" }, "source": [ "### *d. Plot monthly listener trends over time for the sampled artists*" ], "id": "0A7TglN2i6P4" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Hv6KXLfEi6P4" }, "outputs": [], "source": [ "sampled_listeners[\"month\"] = pd.to_datetime(sampled_listeners[\"month\"])\n", "\n", "plt.figure(figsize=(18, 6))\n", "for artist in sampled_artists:\n", " subset = sampled_listeners[sampled_listeners[\"artist_name\"] == artist]\n", " label_val = subset[\"popularity_label\"].iloc[0]\n", " plt.plot(subset[\"month\"], subset[\"monthly_listeners\"],\n", " label=f\"{artist} ({label_val})\",\n", " color=popularity_colors.get(label_val, \"gray\"))\n", "\n", "plt.title(\"šŸŽ§ Monthly Listener Trends Over Time (Sampled Artists)\")\n", "plt.xlabel(\"Month\"); plt.ylabel(\"Monthly Listeners\")\n", "plt.xticks(rotation=45)\n", "plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='small')\n", "plt.grid(True); plt.tight_layout()\n", "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))\n", "plt.savefig(PY_FIG / 'listener_trends_sampled_artists.png', dpi=150)\n", "plt.show()" ], "id": "Hv6KXLfEi6P4" }, { "cell_type": "markdown", "metadata": { "id": "7QB7dn8Mi6P5" }, "source": [ "### *e. 
Plot VADER sentiment distribution per artist*" ], "id": "7QB7dn8Mi6P5" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "n5DjmqgQi6P5" }, "outputs": [], "source": [ "sampled_reviews[\"grouped_artist\"] = (\n", " sampled_reviews[\"popularity_label\"].str.upper() + \" | \" + sampled_reviews[\"artist_name\"]\n", ")\n", "sentiment_counts = (\n", " sampled_reviews.groupby([\"grouped_artist\", \"vader_sentiment\"])\n", " .size().unstack(fill_value=0)\n", ")\n", "for col in [\"negative\",\"neutral\",\"positive\"]:\n", " if col not in sentiment_counts.columns: sentiment_counts[col] = 0\n", "sentiment_counts = sentiment_counts[[\"negative\",\"neutral\",\"positive\"]]\n", "sentiment_counts.reset_index().to_csv(PY_TAB / 'sentiment_counts_sampled.csv', index=False)\n", "\n", "fig, ax = plt.subplots(figsize=(12, 7))\n", "sentiment_counts.plot.barh(stacked=True, ax=ax,\n", " color={\"negative\":\"crimson\",\"neutral\":\"lightgray\",\"positive\":\"royalblue\"})\n", "plt.title(\"šŸ’¬ VADER Sentiment Distribution in Fan Reviews (Sampled Artists)\")\n", "plt.xlabel(\"Number of Reviews\"); plt.ylabel(\"Artist\")\n", "plt.legend(title=\"Sentiment\", loc=\"lower right\")\n", "plt.grid(axis=\"x\", linestyle=\"--\", alpha=0.6)\n", "plt.tight_layout()\n", "plt.savefig(PY_FIG / 'sentiment_distribution_sampled_artists.png', dpi=150)\n", "plt.show()" ], "id": "n5DjmqgQi6P5" }, { "cell_type": "markdown", "metadata": { "id": "MTNV4ANMi6P6" }, "source": [ "### *f. 
Scatter plot: Social media followers vs Monthly listeners*" ], "id": "MTNV4ANMi6P6" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wQ9Xnhg4i6P6" }, "outputs": [], "source": [ "avg_social_plot = df_social.groupby(\"artist_name\")[\"social_media_followers\"].mean().reset_index()\n", "avg_listeners_plot = df_listeners.groupby(\"artist_name\")[\"monthly_listeners\"].mean().reset_index()\n", "avg_concerts_plot = df_concerts.groupby(\"artist_name\")[\"concerts_count\"].mean().reset_index()\n", "\n", "df_agg = (\n", " avg_social_plot\n", " .merge(avg_listeners_plot, on=\"artist_name\")\n", " .merge(avg_concerts_plot, on=\"artist_name\")\n", " .merge(df_reviews[[\"artist_name\",\"popularity_label\"]].drop_duplicates(), on=\"artist_name\", how=\"left\")\n", ")\n", "\n", "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", "for label, color in popularity_colors.items():\n", " sub = df_agg[df_agg[\"popularity_label\"] == label]\n", " axes[0].scatter(sub[\"social_media_followers\"], sub[\"monthly_listeners\"],\n", " label=label, color=color, alpha=0.7, s=80)\n", " axes[1].scatter(sub[\"concerts_count\"], sub[\"monthly_listeners\"],\n", " label=label, color=color, alpha=0.7, s=80)\n", "\n", "axes[0].set_title(\"šŸ“± Social Media Followers vs Monthly Listeners\")\n", "axes[0].set_xlabel(\"Avg Social Media Followers\"); axes[0].set_ylabel(\"Avg Monthly Listeners\")\n", "axes[0].legend(title=\"Tier\"); axes[0].grid(True, linestyle=\"--\", alpha=0.5)\n", "axes[1].set_title(\"šŸŽ¤ Avg Concerts per Month vs Monthly Listeners\")\n", "axes[1].set_xlabel(\"Avg Concerts per Month\"); axes[1].set_ylabel(\"Avg Monthly Listeners\")\n", "axes[1].legend(title=\"Tier\"); axes[1].grid(True, linestyle=\"--\", alpha=0.5)\n", "\n", "plt.tight_layout()\n", "plt.savefig(PY_FIG / 'scatter_social_vs_concerts_vs_listeners.png', dpi=150)\n", "plt.show()" ], "id": "wQ9Xnhg4i6P6" }, { "cell_type": "markdown", "metadata": { "id": "KD7S7PGfi6P6" }, "source": [ "### *g. 
Correlation heatmap*" ], "id": "KD7S7PGfi6P6" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QJyZLcJSi6P7" }, "outputs": [], "source": [ "corr_matrix = df_agg[[\"social_media_followers\",\"concerts_count\",\"monthly_listeners\"]].corr()\n", "\n", "plt.figure(figsize=(7, 5))\n", "sns.heatmap(corr_matrix, annot=True, fmt=\".2f\", cmap=\"coolwarm\",\n", " linewidths=0.5, vmin=-1, vmax=1)\n", "plt.title(\"šŸ”„ Correlation: Digital vs Physical vs Monthly Listeners\")\n", "plt.tight_layout()\n", "plt.savefig(PY_FIG / 'correlation_heatmap.png', dpi=150)\n", "plt.show()\n", "\n", "print(\"\\nšŸ“‹ Correlation with Monthly Listeners:\")\n", "print(corr_matrix[\"monthly_listeners\"].sort_values(ascending=False))" ], "id": "QJyZLcJSi6P7" }, { "cell_type": "markdown", "metadata": { "id": "WbR9Zu_gi6P7" }, "source": [ "## **5.** šŸ”® Forecast monthly listeners with ARIMA\n", "\n", "### *a. Initial setup*" ], "id": "WbR9Zu_gi6P7" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QTRS2Ifsi6P7" }, "outputs": [], "source": [ "import statsmodels.api as sm\n", "from itertools import product\n", "import warnings" ], "id": "QTRS2Ifsi6P7" }, { "cell_type": "markdown", "metadata": { "id": "ab2gwSO9i6P8" }, "source": [ "### *b. 
def find_best_arima(series, p_range=(0,3), d_range=(0,2), q_range=(0,1)):
    """Grid-search ARIMA orders and keep the fit with the lowest AIC.

    Every (p, d, q) combination from the three inclusive ranges is fitted
    with ``sm.tsa.ARIMA``. A combination that raises during fitting or
    comparison is skipped. On equal AIC the earlier combination wins,
    because only a strictly lower AIC replaces the current best.

    Args:
        series: Time series passed unchanged to ``sm.tsa.ARIMA``.
        p_range: Inclusive ``(low, high)`` bounds for p.
        d_range: Inclusive ``(low, high)`` bounds for d.
        q_range: Inclusive ``(low, high)`` bounds for q.

    Returns:
        tuple: ``(best_order, best_results)``, or ``(None, None)`` when no
        combination could be fitted.
    """
    p_values = range(p_range[0], p_range[1] + 1)
    d_values = range(d_range[0], d_range[1] + 1)
    q_values = range(q_range[0], q_range[1] + 1)

    winner_order, winner_fit = None, None
    lowest_aic = float("inf")
    for order in product(p_values, d_values, q_values):
        try:
            candidate = sm.tsa.ARIMA(series, order=order).fit()
            improved = candidate.aic < lowest_aic
        except Exception:
            # Best-effort search: a failing order is simply not a candidate.
            continue
        if improved:
            lowest_aic, winner_order, winner_fit = candidate.aic, order, candidate
    return winner_order, winner_fit
"\n", "plt.title(\"šŸ“ˆ ARIMA Monthly Listener Forecasts (Sampled Artists)\", fontsize=14)\n", "plt.xlabel(\"Month\"); plt.ylabel(\"Monthly Listeners\")\n", "plt.xticks(rotation=45)\n", "plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))\n", "plt.grid(True)\n", "plt.legend(loc=\"center left\", bbox_to_anchor=(1, 0.5), fontsize=\"small\")\n", "plt.tight_layout()\n", "plt.savefig(PY_FIG / 'arima_forecasts_sampled_artists.png', dpi=150)\n", "plt.show()" ], "id": "uLFaiLTFi6P8" }, { "cell_type": "markdown", "metadata": { "id": "MHCiramYi6QG" }, "source": [ "## **6.** šŸ·ļø Marketing recommendations (rule-based)\n", "\n", "### *a. Calculate average social media followers, concerts, and sentiment ratio per artist*" ], "id": "MHCiramYi6QG" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xFNQ_67Ci6QH" }, "outputs": [], "source": [ "avg_social_agg = df_social.groupby(\"artist_name\")[\"social_media_followers\"].mean().reset_index()\n", "avg_social_agg.columns = [\"artist_name\", \"avg_followers\"]\n", "\n", "avg_concerts_per_month = df_concerts.groupby(\"artist_name\")[\"concerts_count\"].mean().reset_index()\n", "avg_concerts_per_month.columns = [\"artist_name\", \"avg_concerts_per_month\"]" ], "id": "xFNQ_67Ci6QH" }, { "cell_type": "markdown", "metadata": { "id": "jTbSpL-2i6QH" }, "source": [ "### *b. 
Calculate sentiment distribution per artist*"
      ],
      "id": "jTbSpL-2i6QH"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZJTYX0oti6QH"
      },
      "outputs": [],
      "source": [
        "SENTIMENT_LABELS = [\"positive\", \"neutral\", \"negative\"]\n",
        "\n",
        "# One row per artist, one column per sentiment label, values = number of reviews.\n",
        "sentiment_dist = (\n",
        "    df_reviews.groupby([\"artist_name\", \"vader_sentiment\"])\n",
        "    .size()\n",
        "    .unstack(fill_value=0)\n",
        ")\n",
        "sentiment_dist.columns.name = None  # drop the \"vader_sentiment\" axis name left by unstack\n",
        "\n",
        "# Make sure all three label columns exist even when a label never occurs.\n",
        "for col in SENTIMENT_LABELS:\n",
        "    if col not in sentiment_dist.columns:\n",
        "        sentiment_dist[col] = 0\n",
        "\n",
        "sentiment_dist[\"total\"] = sentiment_dist[SENTIMENT_LABELS].sum(axis=1)\n",
        "sentiment_dist[\"positive_ratio\"] = sentiment_dist[\"positive\"] / sentiment_dist[\"total\"]\n",
        "sentiment_dist[\"negative_ratio\"] = sentiment_dist[\"negative\"] / sentiment_dist[\"total\"]"
      ],
      "id": "ZJTYX0oti6QH"
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ybqWL9Lgi6QI"
      },
      "source": [
        "### *c. Merge all characteristics into a decision table*"
      ],
      "id": "ybqWL9Lgi6QI"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VYJsqy3Ci6QI"
      },
      "outputs": [],
      "source": [
        "# Mean monthly listeners per artist; the column keeps the name \"monthly_listeners\".\n",
        "avg_listeners = (\n",
        "    df_listeners.groupby(\"artist_name\")[\"monthly_listeners\"].mean().reset_index()\n",
        ")\n",
        "\n",
        "# Every input holds one row per artist, so validate= turns an accidental\n",
        "# duplicate key into an error instead of silently multiplying rows.\n",
        "df_decision = (\n",
        "    avg_social_agg\n",
        "    .merge(avg_concerts_per_month, on=\"artist_name\", validate=\"one_to_one\")\n",
        "    .merge(sentiment_dist.reset_index(), on=\"artist_name\", how=\"left\", validate=\"one_to_one\")\n",
        "    .merge(avg_listeners, on=\"artist_name\", validate=\"one_to_one\")\n",
        ")\n",
        "\n",
        "# Artists without any review come out of the left join as NaN: treat them as\n",
        "# zero reviews so the counts stay consistent with the zero-filled ratios.\n",
        "count_cols = [\"positive\", \"neutral\", \"negative\", \"total\"]\n",
        "df_decision[count_cols] = df_decision[count_cols].fillna(0).astype(int)\n",
        "df_decision[\"positive_ratio\"] = df_decision[\"positive_ratio\"].fillna(0)\n",
        "df_decision[\"negative_ratio\"] = df_decision[\"negative_ratio\"].fillna(0)\n",
        "\n",
        "assert df_decision[\"artist_name\"].is_unique, \"expected exactly one row per artist\""
      ],
      "id": "VYJsqy3Ci6QI"
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "z7sPmbJqi6QJ"
      },
      "source": [
        "### *d. 
✋🏻🛑⛔️ Create the marketing_recommendation function*\n",
        "\n",
        "Rules are checked from top to bottom and the first match wins:\n",
        "\n",
        "- avg_followers >= 10M **and** positive_ratio >= 0.6 → **\"Double down on digital\"**\n",
        "- avg_concerts_per_month >= 5 **and** positive_ratio >= 0.5 → **\"Expand tour presence\"**\n",
        "- negative_ratio >= 0.4 → **\"Crisis management needed\"**\n",
        "- Otherwise → **\"Balanced strategy\"**"
      ],
      "id": "z7sPmbJqi6QJ"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NSM3bKVwi6QJ"
      },
      "outputs": [],
      "source": [
        "def marketing_recommendation(row, *, min_followers=10_000_000, digital_min_positive=0.6,\n",
        "                             min_concerts=5, tour_min_positive=0.5, crisis_min_negative=0.4):\n",
        "    \"\"\"Map one artist's aggregated metrics to a marketing recommendation.\n",
        "\n",
        "    Rules are checked in order and the first match wins.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    row : mapping with the keys ``avg_followers``, ``avg_concerts_per_month``,\n",
        "        ``positive_ratio`` and ``negative_ratio`` (e.g. one ``df_decision`` row).\n",
        "    min_followers, digital_min_positive : thresholds of the digital rule.\n",
        "    min_concerts, tour_min_positive : thresholds of the touring rule.\n",
        "    crisis_min_negative : threshold of the crisis rule.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    str : the recommendation text.\n",
        "    \"\"\"\n",
        "    if row[\"avg_followers\"] >= min_followers and row[\"positive_ratio\"] >= digital_min_positive:\n",
        "        return \"Double down on digital — leverage your online reach\"\n",
        "    if row[\"avg_concerts_per_month\"] >= min_concerts and row[\"positive_ratio\"] >= tour_min_positive:\n",
        "        return \"Expand tour presence — live shows are driving engagement\"\n",
        "    if row[\"negative_ratio\"] >= crisis_min_negative:\n",
        "        return \"Crisis management needed — address sentiment issues\"\n",
        "    # No rule matched (this also covers NaN metrics, which compare as False).\n",
        "    return \"Balanced strategy — maintain current digital & physical mix\""
      ],
      "id": "NSM3bKVwi6QJ"
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fPwYXqXni6QJ"
      },
      "source": [
        "### *e. 
✋🏻🛑⛔️ Run the marketing_recommendation function and check out the first few recommendations*"
      ],
      "id": "fPwYXqXni6QJ"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "B1zQqJkui6QJ"
      },
      "outputs": [],
      "source": [
        "df_decision[\"marketing_recommendation\"] = df_decision.apply(marketing_recommendation, axis=1)\n",
        "\n",
        "# to_string() keeps the full recommendation text instead of truncating long cells.\n",
        "preview_cols = [\n",
        "    \"artist_name\", \"avg_followers\", \"avg_concerts_per_month\",\n",
        "    \"positive_ratio\", \"monthly_listeners\", \"marketing_recommendation\",\n",
        "]\n",
        "print(df_decision[preview_cols].to_string(index=False))"
      ],
      "id": "B1zQqJkui6QJ"
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pLByd04Ii6QK"
      },
      "source": [
        "## **7.** 💾 Save Python outputs for the Hugging Face dashboard"
      ],
      "id": "pLByd04Ii6QK"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mk0DnrJUi6QK"
      },
      "outputs": [],
      "source": [
        "import json\n",
        "import math\n",
        "\n",
        "# Convert on a copy so this export cell leaves df_listeners untouched.\n",
        "df_listeners_dt = df_listeners.assign(month=pd.to_datetime(df_listeners[\"month\"]))\n",
        "\n",
        "# Total listeners across all artists, one row per month.\n",
        "df_dashboard = (\n",
        "    df_listeners_dt.groupby(\"month\", as_index=False)[\"monthly_listeners\"].sum()\n",
        "    .rename(columns={\"monthly_listeners\": \"total_monthly_listeners\"})\n",
        "    .sort_values(\"month\")\n",
        ")\n",
        "df_dashboard.to_csv(PY_TAB / \"df_dashboard.csv\", index=False)\n",
        "\n",
        "kpis = {\n",
        "    \"n_artists\": int(df_listeners[\"artist_name\"].nunique()),\n",
        "    \"n_months\": int(df_dashboard[\"month\"].nunique()),\n",
        "    \"avg_monthly_listeners_total\": float(df_dashboard[\"total_monthly_listeners\"].mean()),\n",
        "    \"digital_vs_listeners_correlation\": float(\n",
        "        df_agg[[\"social_media_followers\", \"monthly_listeners\"]].corr().iloc[0, 1]),\n",
        "    \"physical_vs_listeners_correlation\": float(\n",
        "        df_agg[[\"concerts_count\", \"monthly_listeners\"]].corr().iloc[0, 1]),\n",
        "}\n",
        "\n",
        "# json.dump writes NaN as a bare NaN token, which strict JSON parsers reject,\n",
        "# so at least make an undefined KPI visible here.\n",
        "undefined_kpis = [name for name, value in kpis.items()\n",
        "                  if isinstance(value, float) and math.isnan(value)]\n",
        "if undefined_kpis:\n",
        "    print(f\"⚠️ KPIs without a defined value (NaN): {undefined_kpis}\")\n",
        "\n",
        "# NOTE(review): kpis.json is written to PY_FIG (not PY_TAB) - confirm that is\n",
        "# where the dashboard reads it from.\n",
        "with open(PY_FIG / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n",
        "    json.dump(kpis, f, indent=2)\n",
        "\n",
        "df_decision.to_csv(PY_TAB / \"marketing_recommendations.csv\", index=False)\n",
        "\n",
        "print(\"✅ All outputs saved!\")\n",
        "print(json.dumps(kpis, indent=2))"
      ],
      "id": "mk0DnrJUi6QK"
    }
  ]
}