helenerollier committed on
Commit
e1efdff
·
verified ·
1 Parent(s): 70abeef

Upload datacreation.ipynb

Browse files
Files changed (1) hide show
  1. datacreation.ipynb +309 -32
datacreation.ipynb CHANGED
@@ -20,13 +20,13 @@
20
  },
21
  {
22
  "cell_type": "code",
23
- "execution_count": 11,
24
  "metadata": {
25
  "colab": {
26
  "base_uri": "https://localhost:8080/"
27
  },
28
  "id": "f48c8f8c",
29
- "outputId": "f1a76cb9-ce00-47c2-df85-094eb6e9a141"
30
  },
31
  "outputs": [
32
  {
@@ -85,7 +85,7 @@
85
  },
86
  {
87
  "cell_type": "code",
88
- "execution_count": 12,
89
  "metadata": {
90
  "id": "91d52125"
91
  },
@@ -113,7 +113,7 @@
113
  },
114
  {
115
  "cell_type": "code",
116
- "execution_count": null,
117
  "metadata": {
118
  "id": "xqO5Y3dnYhxt"
119
  },
@@ -145,7 +145,7 @@
145
  },
146
  {
147
  "cell_type": "code",
148
- "execution_count": null,
149
  "metadata": {
150
  "id": "l5FkkNhUYTHh"
151
  },
@@ -175,7 +175,7 @@
175
  "metadata": {
176
  "id": "j_U7YrVrrN3n"
177
  },
178
- "execution_count": null,
179
  "outputs": []
180
  },
181
  {
@@ -186,7 +186,7 @@
186
  "metadata": {
187
  "id": "KJ-lE6ktrQX9"
188
  },
189
- "execution_count": null,
190
  "outputs": []
191
  },
192
  {
@@ -203,12 +203,12 @@
203
  "metadata": {
204
  "id": "AqZUPGJtrSET"
205
  },
206
- "execution_count": null,
207
  "outputs": []
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": null,
212
  "metadata": {
213
  "id": "lC1U_YHtZifh"
214
  },
@@ -232,7 +232,7 @@
232
  },
233
  {
234
  "cell_type": "code",
235
- "execution_count": null,
236
  "metadata": {
237
  "id": "O_wIvTxYZqCK"
238
  },
@@ -259,7 +259,7 @@
259
  },
260
  {
261
  "cell_type": "code",
262
- "execution_count": null,
263
  "metadata": {
264
  "id": "-gPXGcRPuV_9"
265
  },
@@ -286,7 +286,7 @@
286
  },
287
  {
288
  "cell_type": "code",
289
- "execution_count": null,
290
  "metadata": {
291
  "id": "mnd5hdAbaNjz"
292
  },
@@ -309,7 +309,7 @@
309
  },
310
  {
311
  "cell_type": "code",
312
- "execution_count": null,
313
  "metadata": {
314
  "id": "V-G3OCUCgR07"
315
  },
@@ -327,7 +327,7 @@
327
  },
328
  {
329
  "cell_type": "code",
330
- "execution_count": null,
331
  "metadata": {
332
  "id": "kUtWmr8maZLZ"
333
  },
@@ -353,7 +353,7 @@
353
  },
354
  {
355
  "cell_type": "code",
356
- "execution_count": null,
357
  "metadata": {
358
  "id": "tafQj8_7gYCG"
359
  },
@@ -380,7 +380,7 @@
380
  },
381
  {
382
  "cell_type": "code",
383
- "execution_count": null,
384
  "metadata": {
385
  "id": "qkVhYPXGbgEn"
386
  },
@@ -417,7 +417,38 @@
417
  },
418
  {
419
  "cell_type": "code",
420
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  "metadata": {
422
  "id": "SlJ24AUafoDB"
423
  },
@@ -446,7 +477,7 @@
446
  },
447
  {
448
  "cell_type": "code",
449
- "execution_count": null,
450
  "metadata": {
451
  "id": "wcN6gtiZg-ws"
452
  },
@@ -464,11 +495,154 @@
464
  },
465
  {
466
  "cell_type": "code",
467
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
468
  "metadata": {
469
- "id": "MzbZvLcAhGaH"
470
  },
471
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  "source": [
473
  "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
474
  "\n",
@@ -495,7 +669,7 @@
495
  },
496
  {
497
  "cell_type": "code",
498
- "execution_count": null,
499
  "metadata": {
500
  "id": "b3cd2a50"
501
  },
@@ -531,18 +705,103 @@
531
  },
532
  {
533
  "cell_type": "code",
534
- "execution_count": null,
 
 
535
  "metadata": {
536
- "id": "l2SRc3PjuTGM"
 
 
 
 
537
  },
538
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
  "source": [
540
  "review_rows = []\n",
541
  "for _, row in df_books.iterrows():\n",
542
  " title = row['title']\n",
543
  " sentiment_label = row['sentiment_label']\n",
544
  " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
545
- " sampled_reviews = random.sample(review_pool, 10)\n",
 
 
546
  " for review_text in sampled_reviews:\n",
547
  " review_rows.append({\n",
548
  " \"title\": title,\n",
@@ -551,7 +810,12 @@
551
  " \"rating\": row['rating'],\n",
552
  " \"popularity_score\": row['popularity_score']\n",
553
  " })"
554
- ]
 
 
 
 
 
555
  },
556
  {
557
  "cell_type": "markdown",
@@ -564,7 +828,7 @@
564
  },
565
  {
566
  "cell_type": "code",
567
- "execution_count": null,
568
  "metadata": {
569
  "id": "ZUKUqZsuumsp"
570
  },
@@ -585,11 +849,24 @@
585
  },
586
  {
587
  "cell_type": "code",
588
- "execution_count": null,
589
  "metadata": {
590
- "id": "3946e521"
 
 
 
 
591
  },
592
- "outputs": [],
 
 
 
 
 
 
 
 
 
593
  "source": [
594
  "import numpy as np\n",
595
  "\n",
@@ -704,7 +981,7 @@
704
  },
705
  {
706
  "cell_type": "code",
707
- "execution_count": null,
708
  "metadata": {
709
  "id": "xfE8NMqOurKo"
710
  },
 
20
  },
21
  {
22
  "cell_type": "code",
23
+ "execution_count": 81,
24
  "metadata": {
25
  "colab": {
26
  "base_uri": "https://localhost:8080/"
27
  },
28
  "id": "f48c8f8c",
29
+ "outputId": "d0cf9eb7-407a-4275-a05d-a6016d1d3277"
30
  },
31
  "outputs": [
32
  {
 
85
  },
86
  {
87
  "cell_type": "code",
88
+ "execution_count": 82,
89
  "metadata": {
90
  "id": "91d52125"
91
  },
 
113
  },
114
  {
115
  "cell_type": "code",
116
+ "execution_count": 83,
117
  "metadata": {
118
  "id": "xqO5Y3dnYhxt"
119
  },
 
145
  },
146
  {
147
  "cell_type": "code",
148
+ "execution_count": 83,
149
  "metadata": {
150
  "id": "l5FkkNhUYTHh"
151
  },
 
175
  "metadata": {
176
  "id": "j_U7YrVrrN3n"
177
  },
178
+ "execution_count": 84,
179
  "outputs": []
180
  },
181
  {
 
186
  "metadata": {
187
  "id": "KJ-lE6ktrQX9"
188
  },
189
+ "execution_count": 85,
190
  "outputs": []
191
  },
192
  {
 
203
  "metadata": {
204
  "id": "AqZUPGJtrSET"
205
  },
206
+ "execution_count": 86,
207
  "outputs": []
208
  },
209
  {
210
  "cell_type": "code",
211
+ "execution_count": 87,
212
  "metadata": {
213
  "id": "lC1U_YHtZifh"
214
  },
 
232
  },
233
  {
234
  "cell_type": "code",
235
+ "execution_count": 87,
236
  "metadata": {
237
  "id": "O_wIvTxYZqCK"
238
  },
 
259
  },
260
  {
261
  "cell_type": "code",
262
+ "execution_count": 88,
263
  "metadata": {
264
  "id": "-gPXGcRPuV_9"
265
  },
 
286
  },
287
  {
288
  "cell_type": "code",
289
+ "execution_count": 89,
290
  "metadata": {
291
  "id": "mnd5hdAbaNjz"
292
  },
 
309
  },
310
  {
311
  "cell_type": "code",
312
+ "execution_count": 89,
313
  "metadata": {
314
  "id": "V-G3OCUCgR07"
315
  },
 
327
  },
328
  {
329
  "cell_type": "code",
330
+ "execution_count": 90,
331
  "metadata": {
332
  "id": "kUtWmr8maZLZ"
333
  },
 
353
  },
354
  {
355
  "cell_type": "code",
356
+ "execution_count": 90,
357
  "metadata": {
358
  "id": "tafQj8_7gYCG"
359
  },
 
380
  },
381
  {
382
  "cell_type": "code",
383
+ "execution_count": 91,
384
  "metadata": {
385
  "id": "qkVhYPXGbgEn"
386
  },
 
417
  },
418
  {
419
  "cell_type": "code",
420
+ "source": [
421
+ "import numpy as np\n",
422
+ "\n",
423
+ "if \"sentiment_label\" not in df_books.columns:\n",
424
+ " labels = [\"positive\", \"neutral\", \"negative\"]\n",
425
+ " df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))\n",
426
+ ""
427
+ ],
428
+ "metadata": {
429
+ "id": "mNYR6hMcs1P9"
430
+ },
431
+ "execution_count": 92,
432
+ "outputs": []
433
+ },
434
+ {
435
+ "cell_type": "code",
436
+ "source": [
437
+ "import numpy as np\n",
438
+ "\n",
439
+ "if \"sentiment_label\" not in df_books.columns:\n",
440
+ " labels = [\"positive\", \"neutral\", \"negative\"]\n",
441
+ " df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))"
442
+ ],
443
+ "metadata": {
444
+ "id": "crhSJ861s27Q"
445
+ },
446
+ "execution_count": 93,
447
+ "outputs": []
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 94,
452
  "metadata": {
453
  "id": "SlJ24AUafoDB"
454
  },
 
477
  },
478
  {
479
  "cell_type": "code",
480
+ "execution_count": 94,
481
  "metadata": {
482
  "id": "wcN6gtiZg-ws"
483
  },
 
495
  },
496
  {
497
  "cell_type": "code",
498
+ "source": [
499
+ "sales_data = []\n",
500
+ "for _, row in df_books.iterrows():\n",
501
+ " records = generate_sales_profile(row[\"sentiment_label\"])\n",
502
+ " for month, units in records:\n",
503
+ " sales_data.append({\n",
504
+ " \"title\": row[\"title\"],\n",
505
+ " \"month\": month,\n",
506
+ " \"units_sold\": units,\n",
507
+ " \"sentiment_label\": row[\"sentiment_label\"]\n",
508
+ " })"
509
+ ],
510
  "metadata": {
511
+ "id": "R0XK8LjDtXCe"
512
  },
513
+ "execution_count": 95,
514
+ "outputs": []
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "source": [
519
+ "df_sales = pd.DataFrame(sales_data)\n",
520
+ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
521
+ "print(df_sales.head())"
522
+ ],
523
+ "metadata": {
524
+ "id": "f5qTbY8itZPE",
525
+ "outputId": "187b94f4-5c87-43b1-f69b-fe1e6ae7d433",
526
+ "colab": {
527
+ "base_uri": "https://localhost:8080/"
528
+ }
529
+ },
530
+ "execution_count": 96,
531
+ "outputs": [
532
+ {
533
+ "output_type": "stream",
534
+ "name": "stdout",
535
+ "text": [
536
+ " title month units_sold sentiment_label\n",
537
+ "0 Book A 2024-10 49 negative\n",
538
+ "1 Book A 2024-11 56 negative\n",
539
+ "2 Book A 2024-12 71 negative\n",
540
+ "3 Book A 2025-01 64 negative\n",
541
+ "4 Book A 2025-02 61 negative\n"
542
+ ]
543
+ }
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "code",
548
+ "source": [
549
+ "import numpy as np\n",
550
+ "\n",
551
+ "if \"sentiment_label\" not in df_books.columns:\n",
552
+ " labels = [\"positive\", \"neutral\", \"negative\"]\n",
553
+ " df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))"
554
+ ],
555
+ "metadata": {
556
+ "id": "5ebVuc_KtZ_O"
557
+ },
558
+ "execution_count": 97,
559
+ "outputs": []
560
+ },
561
+ {
562
+ "cell_type": "code",
563
+ "source": [
564
+ " sales_data = []\n",
565
+ "for _, row in df_books.iterrows():\n",
566
+ " records = generate_sales_profile(row[\"sentiment_label\"])\n",
567
+ " for month, units in records:\n",
568
+ " sales_data.append({\n",
569
+ " \"title\": row[\"title\"],\n",
570
+ " \"month\": month,\n",
571
+ " \"units_sold\": units,\n",
572
+ " \"sentiment_label\": row[\"sentiment_label\"]\n",
573
+ " })"
574
+ ],
575
+ "metadata": {
576
+ "id": "VXxCmPTatcug"
577
+ },
578
+ "execution_count": 98,
579
+ "outputs": []
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "source": [
584
+ "df_sales = pd.DataFrame(sales_data)\n",
585
+ "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
586
+ "print(\"✅ synthetic_sales_data.csv created\")\n",
587
+ "print(df_sales.head())"
588
+ ],
589
+ "metadata": {
590
+ "id": "l3fLhuNVte15",
591
+ "outputId": "b61c06a4-a534-4f4d-e5f6-a348741f7980",
592
+ "colab": {
593
+ "base_uri": "https://localhost:8080/"
594
+ }
595
+ },
596
+ "execution_count": 99,
597
+ "outputs": [
598
+ {
599
+ "output_type": "stream",
600
+ "name": "stdout",
601
+ "text": [
602
+ "✅ synthetic_sales_data.csv created\n",
603
+ " title month units_sold sentiment_label\n",
604
+ "0 Book A 2024-10 31 negative\n",
605
+ "1 Book A 2024-11 26 negative\n",
606
+ "2 Book A 2024-12 22 negative\n",
607
+ "3 Book A 2025-01 34 negative\n",
608
+ "4 Book A 2025-02 24 negative\n"
609
+ ]
610
+ }
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "source": [],
616
+ "metadata": {
617
+ "id": "B3owNJclthJm"
618
+ },
619
+ "execution_count": 99,
620
+ "outputs": []
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": 100,
625
+ "metadata": {
626
+ "colab": {
627
+ "base_uri": "https://localhost:8080/"
628
+ },
629
+ "id": "MzbZvLcAhGaH",
630
+ "outputId": "4858910f-6703-4d9c-bdda-76cf5cfef58e"
631
+ },
632
+ "outputs": [
633
+ {
634
+ "output_type": "stream",
635
+ "name": "stdout",
636
+ "text": [
637
+ " title month units_sold sentiment_label\n",
638
+ "0 Book A 2024-10 31 negative\n",
639
+ "1 Book A 2024-11 26 negative\n",
640
+ "2 Book A 2024-12 22 negative\n",
641
+ "3 Book A 2025-01 34 negative\n",
642
+ "4 Book A 2025-02 24 negative\n"
643
+ ]
644
+ }
645
+ ],
646
  "source": [
647
  "df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
648
  "\n",
 
669
  },
670
  {
671
  "cell_type": "code",
672
+ "execution_count": 101,
673
  "metadata": {
674
  "id": "b3cd2a50"
675
  },
 
705
  },
706
  {
707
  "cell_type": "code",
708
+ "source": [
709
+ "random.choices(review_pool, k=10)"
710
+ ],
711
  "metadata": {
712
+ "id": "inTizEVgvS2I",
713
+ "outputId": "5692f777-aeda-4452-8f42-0db5ff094ddf",
714
+ "colab": {
715
+ "base_uri": "https://localhost:8080/"
716
+ }
717
  },
718
+ "execution_count": 102,
719
+ "outputs": [
720
+ {
721
+ "output_type": "execute_result",
722
+ "data": {
723
+ "text/plain": [
724
+ "[\"Disappointing. I had high hopes, but they weren't met.\",\n",
725
+ " \"Disappointing. I had high hopes, but they weren't met.\",\n",
726
+ " 'I struggled to get through this one — it just didn’t grab me.',\n",
727
+ " 'I struggled to get through this one — it just didn’t grab me.',\n",
728
+ " 'I struggled to get through this one — it just didn’t grab me.',\n",
729
+ " 'The plot was confusing and the characters felt underdeveloped.',\n",
730
+ " \"Disappointing. I had high hopes, but they weren't met.\",\n",
731
+ " 'I struggled to get through this one — it just didn’t grab me.',\n",
732
+ " 'I struggled to get through this one — it just didn’t grab me.',\n",
733
+ " \"Disappointing. I had high hopes, but they weren't met.\"]"
734
+ ]
735
+ },
736
+ "metadata": {},
737
+ "execution_count": 102
738
+ }
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "source": [
744
+ "import numpy as np\n",
745
+ "\n",
746
+ "if \"popularity_score\" not in df_books.columns:\n",
747
+ " df_books[\"popularity_score\"] = np.random.randint(1, 101, size=len(df_books))"
748
+ ],
749
+ "metadata": {
750
+ "id": "wphd_MYCvg4R"
751
+ },
752
+ "execution_count": 103,
753
+ "outputs": []
754
+ },
755
+ {
756
+ "cell_type": "code",
757
+ "source": [
758
+ "import random\n",
759
+ "\n",
760
+ "if \"popularity_score\" not in df_books.columns:\n",
761
+ " df_books[\"popularity_score\"] = np.random.randint(1, 101, size=len(df_books))\n",
762
+ "\n",
763
+ "review_rows = []\n",
764
+ "for _, row in df_books.iterrows():\n",
765
+ " title = row['title']\n",
766
+ " sentiment_label = row['sentiment_label']\n",
767
+ " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
768
+ " sampled_reviews = random.choices(review_pool, k=10)\n",
769
+ "\n",
770
+ " for review_text in sampled_reviews:\n",
771
+ " review_rows.append({\n",
772
+ " \"title\": title,\n",
773
+ " \"sentiment_label\": sentiment_label,\n",
774
+ " \"review_text\": review_text,\n",
775
+ " \"rating\": row['rating'],\n",
776
+ " \"popularity_score\": row['popularity_score']\n",
777
+ " })"
778
+ ],
779
+ "metadata": {
780
+ "id": "fBFOohZ2vhyT"
781
+ },
782
+ "execution_count": 104,
783
+ "outputs": []
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "source": [],
788
+ "metadata": {
789
+ "id": "OJsnG5h-vkGN"
790
+ },
791
+ "execution_count": 104,
792
+ "outputs": []
793
+ },
794
+ {
795
+ "cell_type": "code",
796
  "source": [
797
  "review_rows = []\n",
798
  "for _, row in df_books.iterrows():\n",
799
  " title = row['title']\n",
800
  " sentiment_label = row['sentiment_label']\n",
801
  " review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
802
+ "\n",
803
+ " sampled_reviews = random.choices(review_pool, k=10) # ✅ FIX HERE\n",
804
+ "\n",
805
  " for review_text in sampled_reviews:\n",
806
  " review_rows.append({\n",
807
  " \"title\": title,\n",
 
810
  " \"rating\": row['rating'],\n",
811
  " \"popularity_score\": row['popularity_score']\n",
812
  " })"
813
+ ],
814
+ "metadata": {
815
+ "id": "yHoawsbnvYIV"
816
+ },
817
+ "execution_count": 105,
818
+ "outputs": []
819
  },
820
  {
821
  "cell_type": "markdown",
 
828
  },
829
  {
830
  "cell_type": "code",
831
+ "execution_count": 106,
832
  "metadata": {
833
  "id": "ZUKUqZsuumsp"
834
  },
 
849
  },
850
  {
851
  "cell_type": "code",
852
+ "execution_count": 107,
853
  "metadata": {
854
+ "colab": {
855
+ "base_uri": "https://localhost:8080/"
856
+ },
857
+ "id": "3946e521",
858
+ "outputId": "bb81b360-789d-409e-f3f1-f9445656af2d"
859
  },
860
+ "outputs": [
861
+ {
862
+ "output_type": "stream",
863
+ "name": "stdout",
864
+ "text": [
865
+ "✅ Wrote synthetic_title_level_features.csv\n",
866
+ "✅ Wrote synthetic_monthly_revenue_series.csv\n"
867
+ ]
868
+ }
869
+ ],
870
  "source": [
871
  "import numpy as np\n",
872
  "\n",
 
981
  },
982
  {
983
  "cell_type": "code",
984
+ "execution_count": 107,
985
  "metadata": {
986
  "id": "xfE8NMqOurKo"
987
  },