Spaces:
Running
Running
Upload datacreation.ipynb
Browse files- datacreation.ipynb +309 -32
datacreation.ipynb
CHANGED
|
@@ -20,13 +20,13 @@
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"cell_type": "code",
|
| 23 |
-
"execution_count":
|
| 24 |
"metadata": {
|
| 25 |
"colab": {
|
| 26 |
"base_uri": "https://localhost:8080/"
|
| 27 |
},
|
| 28 |
"id": "f48c8f8c",
|
| 29 |
-
"outputId": "
|
| 30 |
},
|
| 31 |
"outputs": [
|
| 32 |
{
|
|
@@ -85,7 +85,7 @@
|
|
| 85 |
},
|
| 86 |
{
|
| 87 |
"cell_type": "code",
|
| 88 |
-
"execution_count":
|
| 89 |
"metadata": {
|
| 90 |
"id": "91d52125"
|
| 91 |
},
|
|
@@ -113,7 +113,7 @@
|
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"cell_type": "code",
|
| 116 |
-
"execution_count":
|
| 117 |
"metadata": {
|
| 118 |
"id": "xqO5Y3dnYhxt"
|
| 119 |
},
|
|
@@ -145,7 +145,7 @@
|
|
| 145 |
},
|
| 146 |
{
|
| 147 |
"cell_type": "code",
|
| 148 |
-
"execution_count":
|
| 149 |
"metadata": {
|
| 150 |
"id": "l5FkkNhUYTHh"
|
| 151 |
},
|
|
@@ -175,7 +175,7 @@
|
|
| 175 |
"metadata": {
|
| 176 |
"id": "j_U7YrVrrN3n"
|
| 177 |
},
|
| 178 |
-
"execution_count":
|
| 179 |
"outputs": []
|
| 180 |
},
|
| 181 |
{
|
|
@@ -186,7 +186,7 @@
|
|
| 186 |
"metadata": {
|
| 187 |
"id": "KJ-lE6ktrQX9"
|
| 188 |
},
|
| 189 |
-
"execution_count":
|
| 190 |
"outputs": []
|
| 191 |
},
|
| 192 |
{
|
|
@@ -203,12 +203,12 @@
|
|
| 203 |
"metadata": {
|
| 204 |
"id": "AqZUPGJtrSET"
|
| 205 |
},
|
| 206 |
-
"execution_count":
|
| 207 |
"outputs": []
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"cell_type": "code",
|
| 211 |
-
"execution_count":
|
| 212 |
"metadata": {
|
| 213 |
"id": "lC1U_YHtZifh"
|
| 214 |
},
|
|
@@ -232,7 +232,7 @@
|
|
| 232 |
},
|
| 233 |
{
|
| 234 |
"cell_type": "code",
|
| 235 |
-
"execution_count":
|
| 236 |
"metadata": {
|
| 237 |
"id": "O_wIvTxYZqCK"
|
| 238 |
},
|
|
@@ -259,7 +259,7 @@
|
|
| 259 |
},
|
| 260 |
{
|
| 261 |
"cell_type": "code",
|
| 262 |
-
"execution_count":
|
| 263 |
"metadata": {
|
| 264 |
"id": "-gPXGcRPuV_9"
|
| 265 |
},
|
|
@@ -286,7 +286,7 @@
|
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"cell_type": "code",
|
| 289 |
-
"execution_count":
|
| 290 |
"metadata": {
|
| 291 |
"id": "mnd5hdAbaNjz"
|
| 292 |
},
|
|
@@ -309,7 +309,7 @@
|
|
| 309 |
},
|
| 310 |
{
|
| 311 |
"cell_type": "code",
|
| 312 |
-
"execution_count":
|
| 313 |
"metadata": {
|
| 314 |
"id": "V-G3OCUCgR07"
|
| 315 |
},
|
|
@@ -327,7 +327,7 @@
|
|
| 327 |
},
|
| 328 |
{
|
| 329 |
"cell_type": "code",
|
| 330 |
-
"execution_count":
|
| 331 |
"metadata": {
|
| 332 |
"id": "kUtWmr8maZLZ"
|
| 333 |
},
|
|
@@ -353,7 +353,7 @@
|
|
| 353 |
},
|
| 354 |
{
|
| 355 |
"cell_type": "code",
|
| 356 |
-
"execution_count":
|
| 357 |
"metadata": {
|
| 358 |
"id": "tafQj8_7gYCG"
|
| 359 |
},
|
|
@@ -380,7 +380,7 @@
|
|
| 380 |
},
|
| 381 |
{
|
| 382 |
"cell_type": "code",
|
| 383 |
-
"execution_count":
|
| 384 |
"metadata": {
|
| 385 |
"id": "qkVhYPXGbgEn"
|
| 386 |
},
|
|
@@ -417,7 +417,38 @@
|
|
| 417 |
},
|
| 418 |
{
|
| 419 |
"cell_type": "code",
|
| 420 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
"metadata": {
|
| 422 |
"id": "SlJ24AUafoDB"
|
| 423 |
},
|
|
@@ -446,7 +477,7 @@
|
|
| 446 |
},
|
| 447 |
{
|
| 448 |
"cell_type": "code",
|
| 449 |
-
"execution_count":
|
| 450 |
"metadata": {
|
| 451 |
"id": "wcN6gtiZg-ws"
|
| 452 |
},
|
|
@@ -464,11 +495,154 @@
|
|
| 464 |
},
|
| 465 |
{
|
| 466 |
"cell_type": "code",
|
| 467 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
"metadata": {
|
| 469 |
-
"id": "
|
| 470 |
},
|
| 471 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
"source": [
|
| 473 |
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
|
| 474 |
"\n",
|
|
@@ -495,7 +669,7 @@
|
|
| 495 |
},
|
| 496 |
{
|
| 497 |
"cell_type": "code",
|
| 498 |
-
"execution_count":
|
| 499 |
"metadata": {
|
| 500 |
"id": "b3cd2a50"
|
| 501 |
},
|
|
@@ -531,18 +705,103 @@
|
|
| 531 |
},
|
| 532 |
{
|
| 533 |
"cell_type": "code",
|
| 534 |
-
"
|
|
|
|
|
|
|
| 535 |
"metadata": {
|
| 536 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
},
|
| 538 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
"source": [
|
| 540 |
"review_rows = []\n",
|
| 541 |
"for _, row in df_books.iterrows():\n",
|
| 542 |
" title = row['title']\n",
|
| 543 |
" sentiment_label = row['sentiment_label']\n",
|
| 544 |
" review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
|
| 545 |
-
"
|
|
|
|
|
|
|
| 546 |
" for review_text in sampled_reviews:\n",
|
| 547 |
" review_rows.append({\n",
|
| 548 |
" \"title\": title,\n",
|
|
@@ -551,7 +810,12 @@
|
|
| 551 |
" \"rating\": row['rating'],\n",
|
| 552 |
" \"popularity_score\": row['popularity_score']\n",
|
| 553 |
" })"
|
| 554 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
},
|
| 556 |
{
|
| 557 |
"cell_type": "markdown",
|
|
@@ -564,7 +828,7 @@
|
|
| 564 |
},
|
| 565 |
{
|
| 566 |
"cell_type": "code",
|
| 567 |
-
"execution_count":
|
| 568 |
"metadata": {
|
| 569 |
"id": "ZUKUqZsuumsp"
|
| 570 |
},
|
|
@@ -585,11 +849,24 @@
|
|
| 585 |
},
|
| 586 |
{
|
| 587 |
"cell_type": "code",
|
| 588 |
-
"execution_count":
|
| 589 |
"metadata": {
|
| 590 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
},
|
| 592 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
"source": [
|
| 594 |
"import numpy as np\n",
|
| 595 |
"\n",
|
|
@@ -704,7 +981,7 @@
|
|
| 704 |
},
|
| 705 |
{
|
| 706 |
"cell_type": "code",
|
| 707 |
-
"execution_count":
|
| 708 |
"metadata": {
|
| 709 |
"id": "xfE8NMqOurKo"
|
| 710 |
},
|
|
|
|
| 20 |
},
|
| 21 |
{
|
| 22 |
"cell_type": "code",
|
| 23 |
+
"execution_count": 81,
|
| 24 |
"metadata": {
|
| 25 |
"colab": {
|
| 26 |
"base_uri": "https://localhost:8080/"
|
| 27 |
},
|
| 28 |
"id": "f48c8f8c",
|
| 29 |
+
"outputId": "d0cf9eb7-407a-4275-a05d-a6016d1d3277"
|
| 30 |
},
|
| 31 |
"outputs": [
|
| 32 |
{
|
|
|
|
| 85 |
},
|
| 86 |
{
|
| 87 |
"cell_type": "code",
|
| 88 |
+
"execution_count": 82,
|
| 89 |
"metadata": {
|
| 90 |
"id": "91d52125"
|
| 91 |
},
|
|
|
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"cell_type": "code",
|
| 116 |
+
"execution_count": 83,
|
| 117 |
"metadata": {
|
| 118 |
"id": "xqO5Y3dnYhxt"
|
| 119 |
},
|
|
|
|
| 145 |
},
|
| 146 |
{
|
| 147 |
"cell_type": "code",
|
| 148 |
+
"execution_count": 83,
|
| 149 |
"metadata": {
|
| 150 |
"id": "l5FkkNhUYTHh"
|
| 151 |
},
|
|
|
|
| 175 |
"metadata": {
|
| 176 |
"id": "j_U7YrVrrN3n"
|
| 177 |
},
|
| 178 |
+
"execution_count": 84,
|
| 179 |
"outputs": []
|
| 180 |
},
|
| 181 |
{
|
|
|
|
| 186 |
"metadata": {
|
| 187 |
"id": "KJ-lE6ktrQX9"
|
| 188 |
},
|
| 189 |
+
"execution_count": 85,
|
| 190 |
"outputs": []
|
| 191 |
},
|
| 192 |
{
|
|
|
|
| 203 |
"metadata": {
|
| 204 |
"id": "AqZUPGJtrSET"
|
| 205 |
},
|
| 206 |
+
"execution_count": 86,
|
| 207 |
"outputs": []
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"cell_type": "code",
|
| 211 |
+
"execution_count": 87,
|
| 212 |
"metadata": {
|
| 213 |
"id": "lC1U_YHtZifh"
|
| 214 |
},
|
|
|
|
| 232 |
},
|
| 233 |
{
|
| 234 |
"cell_type": "code",
|
| 235 |
+
"execution_count": 87,
|
| 236 |
"metadata": {
|
| 237 |
"id": "O_wIvTxYZqCK"
|
| 238 |
},
|
|
|
|
| 259 |
},
|
| 260 |
{
|
| 261 |
"cell_type": "code",
|
| 262 |
+
"execution_count": 88,
|
| 263 |
"metadata": {
|
| 264 |
"id": "-gPXGcRPuV_9"
|
| 265 |
},
|
|
|
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"cell_type": "code",
|
| 289 |
+
"execution_count": 89,
|
| 290 |
"metadata": {
|
| 291 |
"id": "mnd5hdAbaNjz"
|
| 292 |
},
|
|
|
|
| 309 |
},
|
| 310 |
{
|
| 311 |
"cell_type": "code",
|
| 312 |
+
"execution_count": 89,
|
| 313 |
"metadata": {
|
| 314 |
"id": "V-G3OCUCgR07"
|
| 315 |
},
|
|
|
|
| 327 |
},
|
| 328 |
{
|
| 329 |
"cell_type": "code",
|
| 330 |
+
"execution_count": 90,
|
| 331 |
"metadata": {
|
| 332 |
"id": "kUtWmr8maZLZ"
|
| 333 |
},
|
|
|
|
| 353 |
},
|
| 354 |
{
|
| 355 |
"cell_type": "code",
|
| 356 |
+
"execution_count": 90,
|
| 357 |
"metadata": {
|
| 358 |
"id": "tafQj8_7gYCG"
|
| 359 |
},
|
|
|
|
| 380 |
},
|
| 381 |
{
|
| 382 |
"cell_type": "code",
|
| 383 |
+
"execution_count": 91,
|
| 384 |
"metadata": {
|
| 385 |
"id": "qkVhYPXGbgEn"
|
| 386 |
},
|
|
|
|
| 417 |
},
|
| 418 |
{
|
| 419 |
"cell_type": "code",
|
| 420 |
+
"source": [
|
| 421 |
+
"import numpy as np\n",
|
| 422 |
+
"\n",
|
| 423 |
+
"if \"sentiment_label\" not in df_books.columns:\n",
|
| 424 |
+
" labels = [\"positive\", \"neutral\", \"negative\"]\n",
|
| 425 |
+
" df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))\n",
|
| 426 |
+
""
|
| 427 |
+
],
|
| 428 |
+
"metadata": {
|
| 429 |
+
"id": "mNYR6hMcs1P9"
|
| 430 |
+
},
|
| 431 |
+
"execution_count": 92,
|
| 432 |
+
"outputs": []
|
| 433 |
+
},
|
| 434 |
+
{
|
| 435 |
+
"cell_type": "code",
|
| 436 |
+
"source": [
|
| 437 |
+
"import numpy as np\n",
|
| 438 |
+
"\n",
|
| 439 |
+
"if \"sentiment_label\" not in df_books.columns:\n",
|
| 440 |
+
" labels = [\"positive\", \"neutral\", \"negative\"]\n",
|
| 441 |
+
" df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))"
|
| 442 |
+
],
|
| 443 |
+
"metadata": {
|
| 444 |
+
"id": "crhSJ861s27Q"
|
| 445 |
+
},
|
| 446 |
+
"execution_count": 93,
|
| 447 |
+
"outputs": []
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"cell_type": "code",
|
| 451 |
+
"execution_count": 94,
|
| 452 |
"metadata": {
|
| 453 |
"id": "SlJ24AUafoDB"
|
| 454 |
},
|
|
|
|
| 477 |
},
|
| 478 |
{
|
| 479 |
"cell_type": "code",
|
| 480 |
+
"execution_count": 94,
|
| 481 |
"metadata": {
|
| 482 |
"id": "wcN6gtiZg-ws"
|
| 483 |
},
|
|
|
|
| 495 |
},
|
| 496 |
{
|
| 497 |
"cell_type": "code",
|
| 498 |
+
"source": [
|
| 499 |
+
"sales_data = []\n",
|
| 500 |
+
"for _, row in df_books.iterrows():\n",
|
| 501 |
+
" records = generate_sales_profile(row[\"sentiment_label\"])\n",
|
| 502 |
+
" for month, units in records:\n",
|
| 503 |
+
" sales_data.append({\n",
|
| 504 |
+
" \"title\": row[\"title\"],\n",
|
| 505 |
+
" \"month\": month,\n",
|
| 506 |
+
" \"units_sold\": units,\n",
|
| 507 |
+
" \"sentiment_label\": row[\"sentiment_label\"]\n",
|
| 508 |
+
" })"
|
| 509 |
+
],
|
| 510 |
"metadata": {
|
| 511 |
+
"id": "R0XK8LjDtXCe"
|
| 512 |
},
|
| 513 |
+
"execution_count": 95,
|
| 514 |
+
"outputs": []
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"cell_type": "code",
|
| 518 |
+
"source": [
|
| 519 |
+
"df_sales = pd.DataFrame(sales_data)\n",
|
| 520 |
+
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
|
| 521 |
+
"print(df_sales.head())"
|
| 522 |
+
],
|
| 523 |
+
"metadata": {
|
| 524 |
+
"id": "f5qTbY8itZPE",
|
| 525 |
+
"outputId": "187b94f4-5c87-43b1-f69b-fe1e6ae7d433",
|
| 526 |
+
"colab": {
|
| 527 |
+
"base_uri": "https://localhost:8080/"
|
| 528 |
+
}
|
| 529 |
+
},
|
| 530 |
+
"execution_count": 96,
|
| 531 |
+
"outputs": [
|
| 532 |
+
{
|
| 533 |
+
"output_type": "stream",
|
| 534 |
+
"name": "stdout",
|
| 535 |
+
"text": [
|
| 536 |
+
" title month units_sold sentiment_label\n",
|
| 537 |
+
"0 Book A 2024-10 49 negative\n",
|
| 538 |
+
"1 Book A 2024-11 56 negative\n",
|
| 539 |
+
"2 Book A 2024-12 71 negative\n",
|
| 540 |
+
"3 Book A 2025-01 64 negative\n",
|
| 541 |
+
"4 Book A 2025-02 61 negative\n"
|
| 542 |
+
]
|
| 543 |
+
}
|
| 544 |
+
]
|
| 545 |
+
},
|
| 546 |
+
{
|
| 547 |
+
"cell_type": "code",
|
| 548 |
+
"source": [
|
| 549 |
+
"import numpy as np\n",
|
| 550 |
+
"\n",
|
| 551 |
+
"if \"sentiment_label\" not in df_books.columns:\n",
|
| 552 |
+
" labels = [\"positive\", \"neutral\", \"negative\"]\n",
|
| 553 |
+
" df_books[\"sentiment_label\"] = np.random.choice(labels, size=len(df_books))"
|
| 554 |
+
],
|
| 555 |
+
"metadata": {
|
| 556 |
+
"id": "5ebVuc_KtZ_O"
|
| 557 |
+
},
|
| 558 |
+
"execution_count": 97,
|
| 559 |
+
"outputs": []
|
| 560 |
+
},
|
| 561 |
+
{
|
| 562 |
+
"cell_type": "code",
|
| 563 |
+
"source": [
|
| 564 |
+
" sales_data = []\n",
|
| 565 |
+
"for _, row in df_books.iterrows():\n",
|
| 566 |
+
" records = generate_sales_profile(row[\"sentiment_label\"])\n",
|
| 567 |
+
" for month, units in records:\n",
|
| 568 |
+
" sales_data.append({\n",
|
| 569 |
+
" \"title\": row[\"title\"],\n",
|
| 570 |
+
" \"month\": month,\n",
|
| 571 |
+
" \"units_sold\": units,\n",
|
| 572 |
+
" \"sentiment_label\": row[\"sentiment_label\"]\n",
|
| 573 |
+
" })"
|
| 574 |
+
],
|
| 575 |
+
"metadata": {
|
| 576 |
+
"id": "VXxCmPTatcug"
|
| 577 |
+
},
|
| 578 |
+
"execution_count": 98,
|
| 579 |
+
"outputs": []
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"cell_type": "code",
|
| 583 |
+
"source": [
|
| 584 |
+
"df_sales = pd.DataFrame(sales_data)\n",
|
| 585 |
+
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
|
| 586 |
+
"print(\"✅ synthetic_sales_data.csv created\")\n",
|
| 587 |
+
"print(df_sales.head())"
|
| 588 |
+
],
|
| 589 |
+
"metadata": {
|
| 590 |
+
"id": "l3fLhuNVte15",
|
| 591 |
+
"outputId": "b61c06a4-a534-4f4d-e5f6-a348741f7980",
|
| 592 |
+
"colab": {
|
| 593 |
+
"base_uri": "https://localhost:8080/"
|
| 594 |
+
}
|
| 595 |
+
},
|
| 596 |
+
"execution_count": 99,
|
| 597 |
+
"outputs": [
|
| 598 |
+
{
|
| 599 |
+
"output_type": "stream",
|
| 600 |
+
"name": "stdout",
|
| 601 |
+
"text": [
|
| 602 |
+
"✅ synthetic_sales_data.csv created\n",
|
| 603 |
+
" title month units_sold sentiment_label\n",
|
| 604 |
+
"0 Book A 2024-10 31 negative\n",
|
| 605 |
+
"1 Book A 2024-11 26 negative\n",
|
| 606 |
+
"2 Book A 2024-12 22 negative\n",
|
| 607 |
+
"3 Book A 2025-01 34 negative\n",
|
| 608 |
+
"4 Book A 2025-02 24 negative\n"
|
| 609 |
+
]
|
| 610 |
+
}
|
| 611 |
+
]
|
| 612 |
+
},
|
| 613 |
+
{
|
| 614 |
+
"cell_type": "code",
|
| 615 |
+
"source": [],
|
| 616 |
+
"metadata": {
|
| 617 |
+
"id": "B3owNJclthJm"
|
| 618 |
+
},
|
| 619 |
+
"execution_count": 99,
|
| 620 |
+
"outputs": []
|
| 621 |
+
},
|
| 622 |
+
{
|
| 623 |
+
"cell_type": "code",
|
| 624 |
+
"execution_count": 100,
|
| 625 |
+
"metadata": {
|
| 626 |
+
"colab": {
|
| 627 |
+
"base_uri": "https://localhost:8080/"
|
| 628 |
+
},
|
| 629 |
+
"id": "MzbZvLcAhGaH",
|
| 630 |
+
"outputId": "4858910f-6703-4d9c-bdda-76cf5cfef58e"
|
| 631 |
+
},
|
| 632 |
+
"outputs": [
|
| 633 |
+
{
|
| 634 |
+
"output_type": "stream",
|
| 635 |
+
"name": "stdout",
|
| 636 |
+
"text": [
|
| 637 |
+
" title month units_sold sentiment_label\n",
|
| 638 |
+
"0 Book A 2024-10 31 negative\n",
|
| 639 |
+
"1 Book A 2024-11 26 negative\n",
|
| 640 |
+
"2 Book A 2024-12 22 negative\n",
|
| 641 |
+
"3 Book A 2025-01 34 negative\n",
|
| 642 |
+
"4 Book A 2025-02 24 negative\n"
|
| 643 |
+
]
|
| 644 |
+
}
|
| 645 |
+
],
|
| 646 |
"source": [
|
| 647 |
"df_sales.to_csv(\"synthetic_sales_data.csv\", index=False)\n",
|
| 648 |
"\n",
|
|
|
|
| 669 |
},
|
| 670 |
{
|
| 671 |
"cell_type": "code",
|
| 672 |
+
"execution_count": 101,
|
| 673 |
"metadata": {
|
| 674 |
"id": "b3cd2a50"
|
| 675 |
},
|
|
|
|
| 705 |
},
|
| 706 |
{
|
| 707 |
"cell_type": "code",
|
| 708 |
+
"source": [
|
| 709 |
+
"random.choices(review_pool, k=10)"
|
| 710 |
+
],
|
| 711 |
"metadata": {
|
| 712 |
+
"id": "inTizEVgvS2I",
|
| 713 |
+
"outputId": "5692f777-aeda-4452-8f42-0db5ff094ddf",
|
| 714 |
+
"colab": {
|
| 715 |
+
"base_uri": "https://localhost:8080/"
|
| 716 |
+
}
|
| 717 |
},
|
| 718 |
+
"execution_count": 102,
|
| 719 |
+
"outputs": [
|
| 720 |
+
{
|
| 721 |
+
"output_type": "execute_result",
|
| 722 |
+
"data": {
|
| 723 |
+
"text/plain": [
|
| 724 |
+
"[\"Disappointing. I had high hopes, but they weren't met.\",\n",
|
| 725 |
+
" \"Disappointing. I had high hopes, but they weren't met.\",\n",
|
| 726 |
+
" 'I struggled to get through this one — it just didn’t grab me.',\n",
|
| 727 |
+
" 'I struggled to get through this one — it just didn’t grab me.',\n",
|
| 728 |
+
" 'I struggled to get through this one — it just didn’t grab me.',\n",
|
| 729 |
+
" 'The plot was confusing and the characters felt underdeveloped.',\n",
|
| 730 |
+
" \"Disappointing. I had high hopes, but they weren't met.\",\n",
|
| 731 |
+
" 'I struggled to get through this one — it just didn’t grab me.',\n",
|
| 732 |
+
" 'I struggled to get through this one — it just didn’t grab me.',\n",
|
| 733 |
+
" \"Disappointing. I had high hopes, but they weren't met.\"]"
|
| 734 |
+
]
|
| 735 |
+
},
|
| 736 |
+
"metadata": {},
|
| 737 |
+
"execution_count": 102
|
| 738 |
+
}
|
| 739 |
+
]
|
| 740 |
+
},
|
| 741 |
+
{
|
| 742 |
+
"cell_type": "code",
|
| 743 |
+
"source": [
|
| 744 |
+
"import numpy as np\n",
|
| 745 |
+
"\n",
|
| 746 |
+
"if \"popularity_score\" not in df_books.columns:\n",
|
| 747 |
+
" df_books[\"popularity_score\"] = np.random.randint(1, 101, size=len(df_books))"
|
| 748 |
+
],
|
| 749 |
+
"metadata": {
|
| 750 |
+
"id": "wphd_MYCvg4R"
|
| 751 |
+
},
|
| 752 |
+
"execution_count": 103,
|
| 753 |
+
"outputs": []
|
| 754 |
+
},
|
| 755 |
+
{
|
| 756 |
+
"cell_type": "code",
|
| 757 |
+
"source": [
|
| 758 |
+
"import random\n",
|
| 759 |
+
"\n",
|
| 760 |
+
"if \"popularity_score\" not in df_books.columns:\n",
|
| 761 |
+
" df_books[\"popularity_score\"] = np.random.randint(1, 101, size=len(df_books))\n",
|
| 762 |
+
"\n",
|
| 763 |
+
"review_rows = []\n",
|
| 764 |
+
"for _, row in df_books.iterrows():\n",
|
| 765 |
+
" title = row['title']\n",
|
| 766 |
+
" sentiment_label = row['sentiment_label']\n",
|
| 767 |
+
" review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
|
| 768 |
+
" sampled_reviews = random.choices(review_pool, k=10)\n",
|
| 769 |
+
"\n",
|
| 770 |
+
" for review_text in sampled_reviews:\n",
|
| 771 |
+
" review_rows.append({\n",
|
| 772 |
+
" \"title\": title,\n",
|
| 773 |
+
" \"sentiment_label\": sentiment_label,\n",
|
| 774 |
+
" \"review_text\": review_text,\n",
|
| 775 |
+
" \"rating\": row['rating'],\n",
|
| 776 |
+
" \"popularity_score\": row['popularity_score']\n",
|
| 777 |
+
" })"
|
| 778 |
+
],
|
| 779 |
+
"metadata": {
|
| 780 |
+
"id": "fBFOohZ2vhyT"
|
| 781 |
+
},
|
| 782 |
+
"execution_count": 104,
|
| 783 |
+
"outputs": []
|
| 784 |
+
},
|
| 785 |
+
{
|
| 786 |
+
"cell_type": "code",
|
| 787 |
+
"source": [],
|
| 788 |
+
"metadata": {
|
| 789 |
+
"id": "OJsnG5h-vkGN"
|
| 790 |
+
},
|
| 791 |
+
"execution_count": 104,
|
| 792 |
+
"outputs": []
|
| 793 |
+
},
|
| 794 |
+
{
|
| 795 |
+
"cell_type": "code",
|
| 796 |
"source": [
|
| 797 |
"review_rows = []\n",
|
| 798 |
"for _, row in df_books.iterrows():\n",
|
| 799 |
" title = row['title']\n",
|
| 800 |
" sentiment_label = row['sentiment_label']\n",
|
| 801 |
" review_pool = synthetic_reviews_by_sentiment[sentiment_label]\n",
|
| 802 |
+
"\n",
|
| 803 |
+
" sampled_reviews = random.choices(review_pool, k=10) # ✅ FIX HERE\n",
|
| 804 |
+
"\n",
|
| 805 |
" for review_text in sampled_reviews:\n",
|
| 806 |
" review_rows.append({\n",
|
| 807 |
" \"title\": title,\n",
|
|
|
|
| 810 |
" \"rating\": row['rating'],\n",
|
| 811 |
" \"popularity_score\": row['popularity_score']\n",
|
| 812 |
" })"
|
| 813 |
+
],
|
| 814 |
+
"metadata": {
|
| 815 |
+
"id": "yHoawsbnvYIV"
|
| 816 |
+
},
|
| 817 |
+
"execution_count": 105,
|
| 818 |
+
"outputs": []
|
| 819 |
},
|
| 820 |
{
|
| 821 |
"cell_type": "markdown",
|
|
|
|
| 828 |
},
|
| 829 |
{
|
| 830 |
"cell_type": "code",
|
| 831 |
+
"execution_count": 106,
|
| 832 |
"metadata": {
|
| 833 |
"id": "ZUKUqZsuumsp"
|
| 834 |
},
|
|
|
|
| 849 |
},
|
| 850 |
{
|
| 851 |
"cell_type": "code",
|
| 852 |
+
"execution_count": 107,
|
| 853 |
"metadata": {
|
| 854 |
+
"colab": {
|
| 855 |
+
"base_uri": "https://localhost:8080/"
|
| 856 |
+
},
|
| 857 |
+
"id": "3946e521",
|
| 858 |
+
"outputId": "bb81b360-789d-409e-f3f1-f9445656af2d"
|
| 859 |
},
|
| 860 |
+
"outputs": [
|
| 861 |
+
{
|
| 862 |
+
"output_type": "stream",
|
| 863 |
+
"name": "stdout",
|
| 864 |
+
"text": [
|
| 865 |
+
"✅ Wrote synthetic_title_level_features.csv\n",
|
| 866 |
+
"✅ Wrote synthetic_monthly_revenue_series.csv\n"
|
| 867 |
+
]
|
| 868 |
+
}
|
| 869 |
+
],
|
| 870 |
"source": [
|
| 871 |
"import numpy as np\n",
|
| 872 |
"\n",
|
|
|
|
| 981 |
},
|
| 982 |
{
|
| 983 |
"cell_type": "code",
|
| 984 |
+
"execution_count": 107,
|
| 985 |
"metadata": {
|
| 986 |
"id": "xfE8NMqOurKo"
|
| 987 |
},
|