saishshinde15 committed on
Commit
1816c50
·
verified ·
1 Parent(s): b677ddd

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. SentimentTensor.ipynb +412 -0
  3. sentitensor1.keras +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sentitensor1.keras filter=lfs diff=lfs merge=lfs -text
SentimentTensor.ipynb ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "7e59ad5c",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json\n",
11
+ "import random\n",
12
+ "\n",
13
+ "# Define the path to the full Yelp dataset file\n",
14
+ "full_data_path = \"yelp_academic_dataset_review.json\"\n",
15
+ "\n",
16
+ "# Define the path to save the sampled dataset file\n",
17
+ "sampled_data_path = \"yelp_academic_dataset_review_sampled.json\"\n",
18
+ "\n",
19
+ "# Define the number of reviews to sample (adjust as needed)\n",
20
+ "num_reviews_to_sample = 10000 # Example: Sample 10,000 reviews\n",
21
+ "\n",
22
+ "# Load all reviews from the full dataset\n",
23
+ "all_reviews = []\n",
24
+ "with open(full_data_path, \"r\", encoding=\"utf-8\") as f:\n",
25
+ " for line in f:\n",
26
+ " review = json.loads(line)\n",
27
+ " all_reviews.append(review)\n",
28
+ "\n",
29
+ "# Randomly sample a subset of reviews\n",
30
+ "sampled_reviews = random.sample(all_reviews, num_reviews_to_sample)\n",
31
+ "\n",
32
+ "# Save the sampled reviews to a new JSON file\n",
33
+ "with open(sampled_data_path, \"w\", encoding=\"utf-8\") as f:\n",
34
+ " for review in sampled_reviews:\n",
35
+ " json.dump(review, f)\n",
36
+ " f.write(\"\\n\")\n",
37
+ "\n",
38
+ "print(f\"Sampled {num_reviews_to_sample} reviews and saved to {sampled_data_path}\")\n"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "f562ff04",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "import gzip\n",
49
+ "\n",
50
+ "# Define the path to save the compressed dataset file\n",
51
+ "compressed_data_path = \"yelp_academic_dataset_review_sampled.json.gz\"\n",
52
+ "\n",
53
+ "# Compress the sampled dataset file using gzip\n",
54
+ "with open(sampled_data_path, \"rb\") as f_in:\n",
55
+ " with gzip.open(compressed_data_path, \"wb\") as f_out:\n",
56
+ " f_out.writelines(f_in)\n",
57
+ "\n",
58
+ "print(f\"Compressed file saved to {compressed_data_path}\")\n"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 1,
64
+ "id": "337f6649",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "import pandas as pd\n",
69
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
70
+ "from sklearn.model_selection import train_test_split\n",
71
+ "from sklearn.linear_model import LogisticRegression\n",
72
+ "from sklearn.metrics import classification_report, accuracy_score\n",
73
+ "\n",
74
+ "# Load the preprocessed Yelp dataset (sampled and compressed if applicable)\n",
75
+ "data_path = \"yelp_academic_dataset_review_sampled.json.gz\" # Adjust the path\n",
76
+ "data = pd.read_json(data_path, lines=True)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 2,
82
+ "id": "e0936968",
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "data": {
87
+ "text/html": [
88
+ "<div>\n",
89
+ "<style scoped>\n",
90
+ " .dataframe tbody tr th:only-of-type {\n",
91
+ " vertical-align: middle;\n",
92
+ " }\n",
93
+ "\n",
94
+ " .dataframe tbody tr th {\n",
95
+ " vertical-align: top;\n",
96
+ " }\n",
97
+ "\n",
98
+ " .dataframe thead th {\n",
99
+ " text-align: right;\n",
100
+ " }\n",
101
+ "</style>\n",
102
+ "<table border=\"1\" class=\"dataframe\">\n",
103
+ " <thead>\n",
104
+ " <tr style=\"text-align: right;\">\n",
105
+ " <th></th>\n",
106
+ " <th>review_id</th>\n",
107
+ " <th>user_id</th>\n",
108
+ " <th>business_id</th>\n",
109
+ " <th>stars</th>\n",
110
+ " <th>useful</th>\n",
111
+ " <th>funny</th>\n",
112
+ " <th>cool</th>\n",
113
+ " <th>text</th>\n",
114
+ " <th>date</th>\n",
115
+ " </tr>\n",
116
+ " </thead>\n",
117
+ " <tbody>\n",
118
+ " <tr>\n",
119
+ " <th>0</th>\n",
120
+ " <td>f9khuhJxadQhg6CaI1cRdA</td>\n",
121
+ " <td>4Qijwb2RDiUGc4SBjA2lJg</td>\n",
122
+ " <td>nTBStZYJfHGdSZJbpaBiPA</td>\n",
123
+ " <td>4</td>\n",
124
+ " <td>1</td>\n",
125
+ " <td>0</td>\n",
126
+ " <td>1</td>\n",
127
+ " <td>I had read about this place adding a second lo...</td>\n",
128
+ " <td>2011-02-08 17:48:40</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>1</th>\n",
132
+ " <td>WH0c1wEMu4XRTIysI7uMig</td>\n",
133
+ " <td>7JeW4Mlvqdp7R-FAUBB_vA</td>\n",
134
+ " <td>H3Tmgv94pbGvBIKZ4Rs9Cw</td>\n",
135
+ " <td>5</td>\n",
136
+ " <td>1</td>\n",
137
+ " <td>0</td>\n",
138
+ " <td>1</td>\n",
139
+ " <td>I had dinner at Tin Angel on Saturday and was ...</td>\n",
140
+ " <td>2012-04-16 13:30:02</td>\n",
141
+ " </tr>\n",
142
+ " <tr>\n",
143
+ " <th>2</th>\n",
144
+ " <td>S1Lg07IGrupUDk7Uu9rnQQ</td>\n",
145
+ " <td>umUy5DTpVrvQDXLR4gywHA</td>\n",
146
+ " <td>H7BikysfQbS9bMULQsCU_Q</td>\n",
147
+ " <td>2</td>\n",
148
+ " <td>4</td>\n",
149
+ " <td>1</td>\n",
150
+ " <td>0</td>\n",
151
+ " <td>I was really excited to visit the store, havin...</td>\n",
152
+ " <td>2019-10-05 00:17:15</td>\n",
153
+ " </tr>\n",
154
+ " <tr>\n",
155
+ " <th>3</th>\n",
156
+ " <td>AH4_Pua0yzK4oU9FoU8hXQ</td>\n",
157
+ " <td>uwYw0KKj16lC_nq_HsQGVQ</td>\n",
158
+ " <td>Xb6QfBbleg2aJT2cG807jQ</td>\n",
159
+ " <td>1</td>\n",
160
+ " <td>1</td>\n",
161
+ " <td>0</td>\n",
162
+ " <td>0</td>\n",
163
+ " <td>I hired Two Men and a Truck for my recent move...</td>\n",
164
+ " <td>2016-06-02 13:27:24</td>\n",
165
+ " </tr>\n",
166
+ " <tr>\n",
167
+ " <th>4</th>\n",
168
+ " <td>9_CIDS98p6ZsTRiCvmuIKA</td>\n",
169
+ " <td>l9bVKgzvjjcU8Iang3Tvtg</td>\n",
170
+ " <td>lqSJkyNSE1yPeux4PoR-pg</td>\n",
171
+ " <td>1</td>\n",
172
+ " <td>0</td>\n",
173
+ " <td>0</td>\n",
174
+ " <td>0</td>\n",
175
+ " <td>i was very disappointed to this company. They ...</td>\n",
176
+ " <td>2020-06-05 22:28:47</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>...</th>\n",
180
+ " <td>...</td>\n",
181
+ " <td>...</td>\n",
182
+ " <td>...</td>\n",
183
+ " <td>...</td>\n",
184
+ " <td>...</td>\n",
185
+ " <td>...</td>\n",
186
+ " <td>...</td>\n",
187
+ " <td>...</td>\n",
188
+ " <td>...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>9995</th>\n",
192
+ " <td>5MknizHCBH3jpj5DJd-6Uw</td>\n",
193
+ " <td>d2VrfngFJ1f1nvNAsojJzw</td>\n",
194
+ " <td>hy-E7DdXbdgTbwphKUYW1w</td>\n",
195
+ " <td>1</td>\n",
196
+ " <td>1</td>\n",
197
+ " <td>0</td>\n",
198
+ " <td>0</td>\n",
199
+ " <td>This was such a trash experience. We signed up...</td>\n",
200
+ " <td>2021-07-29 16:10:10</td>\n",
201
+ " </tr>\n",
202
+ " <tr>\n",
203
+ " <th>9996</th>\n",
204
+ " <td>mXFlaWuiCnyCkZ_SIAGqew</td>\n",
205
+ " <td>cHWDGVf4LofBk9wZ2mnXQQ</td>\n",
206
+ " <td>AYWSFv6QxF5IjQSxITMUug</td>\n",
207
+ " <td>5</td>\n",
208
+ " <td>0</td>\n",
209
+ " <td>0</td>\n",
210
+ " <td>0</td>\n",
211
+ " <td>I have been going to Goshen Nail Salon for the...</td>\n",
212
+ " <td>2018-03-16 00:30:50</td>\n",
213
+ " </tr>\n",
214
+ " <tr>\n",
215
+ " <th>9997</th>\n",
216
+ " <td>W1Ij-zC3ufRU5MTEgHLjmg</td>\n",
217
+ " <td>aN9nWudz5rfar7rHr9lHfA</td>\n",
218
+ " <td>oyJ3gXNkV0DO0YxcaTgtTg</td>\n",
219
+ " <td>5</td>\n",
220
+ " <td>0</td>\n",
221
+ " <td>0</td>\n",
222
+ " <td>0</td>\n",
223
+ " <td>Ok. This place surprised me. I always thought ...</td>\n",
224
+ " <td>2018-06-01 23:56:44</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>9998</th>\n",
228
+ " <td>HNejB5H9iD1qe3MMKxg6sg</td>\n",
229
+ " <td>6JejVLZl5M-IB3UkNTkXtQ</td>\n",
230
+ " <td>WJLKQTduGumxjlXelqiuKg</td>\n",
231
+ " <td>3</td>\n",
232
+ " <td>0</td>\n",
233
+ " <td>0</td>\n",
234
+ " <td>0</td>\n",
235
+ " <td>Meets expectations, but quirky. The trucks re...</td>\n",
236
+ " <td>2016-06-29 15:57:34</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>9999</th>\n",
240
+ " <td>LSJGzHJ7whqNn5uPxidMjQ</td>\n",
241
+ " <td>_Av1LaAAY0Y8YcPp7Ck7fg</td>\n",
242
+ " <td>M983OPfVRnwvG7zEOzykCA</td>\n",
243
+ " <td>5</td>\n",
244
+ " <td>0</td>\n",
245
+ " <td>0</td>\n",
246
+ " <td>0</td>\n",
247
+ " <td>Jordan was our waiter. He was very attentive a...</td>\n",
248
+ " <td>2017-03-15 23:54:07</td>\n",
249
+ " </tr>\n",
250
+ " </tbody>\n",
251
+ "</table>\n",
252
+ "<p>10000 rows × 9 columns</p>\n",
253
+ "</div>"
254
+ ],
255
+ "text/plain": [
256
+ " review_id user_id business_id \\\n",
257
+ "0 f9khuhJxadQhg6CaI1cRdA 4Qijwb2RDiUGc4SBjA2lJg nTBStZYJfHGdSZJbpaBiPA \n",
258
+ "1 WH0c1wEMu4XRTIysI7uMig 7JeW4Mlvqdp7R-FAUBB_vA H3Tmgv94pbGvBIKZ4Rs9Cw \n",
259
+ "2 S1Lg07IGrupUDk7Uu9rnQQ umUy5DTpVrvQDXLR4gywHA H7BikysfQbS9bMULQsCU_Q \n",
260
+ "3 AH4_Pua0yzK4oU9FoU8hXQ uwYw0KKj16lC_nq_HsQGVQ Xb6QfBbleg2aJT2cG807jQ \n",
261
+ "4 9_CIDS98p6ZsTRiCvmuIKA l9bVKgzvjjcU8Iang3Tvtg lqSJkyNSE1yPeux4PoR-pg \n",
262
+ "... ... ... ... \n",
263
+ "9995 5MknizHCBH3jpj5DJd-6Uw d2VrfngFJ1f1nvNAsojJzw hy-E7DdXbdgTbwphKUYW1w \n",
264
+ "9996 mXFlaWuiCnyCkZ_SIAGqew cHWDGVf4LofBk9wZ2mnXQQ AYWSFv6QxF5IjQSxITMUug \n",
265
+ "9997 W1Ij-zC3ufRU5MTEgHLjmg aN9nWudz5rfar7rHr9lHfA oyJ3gXNkV0DO0YxcaTgtTg \n",
266
+ "9998 HNejB5H9iD1qe3MMKxg6sg 6JejVLZl5M-IB3UkNTkXtQ WJLKQTduGumxjlXelqiuKg \n",
267
+ "9999 LSJGzHJ7whqNn5uPxidMjQ _Av1LaAAY0Y8YcPp7Ck7fg M983OPfVRnwvG7zEOzykCA \n",
268
+ "\n",
269
+ " stars useful funny cool \\\n",
270
+ "0 4 1 0 1 \n",
271
+ "1 5 1 0 1 \n",
272
+ "2 2 4 1 0 \n",
273
+ "3 1 1 0 0 \n",
274
+ "4 1 0 0 0 \n",
275
+ "... ... ... ... ... \n",
276
+ "9995 1 1 0 0 \n",
277
+ "9996 5 0 0 0 \n",
278
+ "9997 5 0 0 0 \n",
279
+ "9998 3 0 0 0 \n",
280
+ "9999 5 0 0 0 \n",
281
+ "\n",
282
+ " text date \n",
283
+ "0 I had read about this place adding a second lo... 2011-02-08 17:48:40 \n",
284
+ "1 I had dinner at Tin Angel on Saturday and was ... 2012-04-16 13:30:02 \n",
285
+ "2 I was really excited to visit the store, havin... 2019-10-05 00:17:15 \n",
286
+ "3 I hired Two Men and a Truck for my recent move... 2016-06-02 13:27:24 \n",
287
+ "4 i was very disappointed to this company. They ... 2020-06-05 22:28:47 \n",
288
+ "... ... ... \n",
289
+ "9995 This was such a trash experience. We signed up... 2021-07-29 16:10:10 \n",
290
+ "9996 I have been going to Goshen Nail Salon for the... 2018-03-16 00:30:50 \n",
291
+ "9997 Ok. This place surprised me. I always thought ... 2018-06-01 23:56:44 \n",
292
+ "9998 Meets expectations, but quirky. The trucks re... 2016-06-29 15:57:34 \n",
293
+ "9999 Jordan was our waiter. He was very attentive a... 2017-03-15 23:54:07 \n",
294
+ "\n",
295
+ "[10000 rows x 9 columns]"
296
+ ]
297
+ },
298
+ "execution_count": 2,
299
+ "metadata": {},
300
+ "output_type": "execute_result"
301
+ }
302
+ ],
303
+ "source": [
304
+ "data"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 3,
310
+ "id": "466ef010",
311
+ "metadata": {},
312
+ "outputs": [],
313
+ "source": [
314
+ "# Map stars to sentiment labels\n",
315
+ "def map_sentiment(stars):\n",
316
+ " if stars >= 4:\n",
317
+ " return \"positive\"\n",
318
+ " elif stars <= 2:\n",
319
+ " return \"negative\"\n",
320
+ " else:\n",
321
+ " return \"neutral\" # Optional: Handle neutral sentiment if needed\n",
322
+ "\n",
323
+ "# Apply sentiment mapping to stars\n",
324
+ "data['sentiment'] = data['stars'].apply(map_sentiment)\n"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 4,
330
+ "id": "756b3285",
331
+ "metadata": {},
332
+ "outputs": [],
333
+ "source": [
334
+ "# Apply sentiment mapping to stars\n",
335
+ "data['sentiment'] = data['stars'].apply(map_sentiment)\n",
336
+ "\n",
337
+ "# Split the data into training and testing sets\n",
338
+ "train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)\n",
339
+ "\n",
340
+ "# Save the preprocessed data\n",
341
+ "train_data.to_csv(\"preprocessed_train_data.csv\", index=False)\n",
342
+ "test_data.to_csv(\"preprocessed_test_data.csv\", index=False)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
+ "id": "7257dd9d",
349
+ "metadata": {},
350
+ "outputs": [],
351
+ "source": [
352
+ "pip install torch\n"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "id": "f03a2ad5",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "pip install transformers"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": null,
368
+ "id": "ecdcf9c9",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "import pandas as pd\n",
373
+ "import torch\n",
374
+ "from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments\n",
375
+ "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
376
+ "\n",
377
+ "# Load the preprocessed training and testing data\n",
378
+ "train_data = pd.read_csv(\"preprocessed_train_data.csv\") # Adjust the path\n",
379
+ "test_data = pd.read_csv(\"preprocessed_test_data.csv\") # Adjust the path\n"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "c83718d7",
386
+ "metadata": {},
387
+ "outputs": [],
388
+ "source": []
389
+ }
390
+ ],
391
+ "metadata": {
392
+ "kernelspec": {
393
+ "display_name": "Python 3 (ipykernel)",
394
+ "language": "python",
395
+ "name": "python3"
396
+ },
397
+ "language_info": {
398
+ "codemirror_mode": {
399
+ "name": "ipython",
400
+ "version": 3
401
+ },
402
+ "file_extension": ".py",
403
+ "mimetype": "text/x-python",
404
+ "name": "python",
405
+ "nbconvert_exporter": "python",
406
+ "pygments_lexer": "ipython3",
407
+ "version": "3.9.12"
408
+ }
409
+ },
410
+ "nbformat": 4,
411
+ "nbformat_minor": 5
412
+ }
sentitensor1.keras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad5474595c05edd8d3ff4aa18749c0c6accb27f2402ab53c4fc35ff3d0286995
3
+ size 29927801