MightyOctopus committed on
Commit 13495b5 · verified · 1 Parent(s): 71ea44c

Upload 6 files

.gitattributes CHANGED
@@ -3,3 +3,4 @@ products_vectordb_production/2663ea41-1052-4307-83f1-1edb20555e69/index_metadata
  products_vectordb_production/2663ea41-1052-4307-83f1-1edb20555e69/length.bin filter=lfs diff=lfs merge=lfs -text
  products_vectordb_production/2663ea41-1052-4307-83f1-1edb20555e69/link_lists.bin filter=lfs diff=lfs merge=lfs -text
  products_vectordb_production/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+ models/neural_network_pricer_model.pt filter=lfs diff=lfs merge=lfs -text
models/.ipynb_checkpoints/neural_network-checkpoint.ipynb ADDED
@@ -0,0 +1,355 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ae2adf5-1c4b-4d55-98b3-523e65ed84f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "import numpy as np \n",
+ "from tqdm import tqdm\n",
+ "from sklearn.feature_extraction.text import HashingVectorizer\n",
+ "import torch \n",
+ "import torch.nn as nn\n",
+ "import torch.optim as optim \n",
+ "from torch.utils.data import DataLoader, TensorDataset\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "import pickle\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8e0b5764-2450-4ef5-b47f-47833a441c21",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Volumes/VTG/Dev/C_5/Projects/week8/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ "  from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"train.pkl\", \"rb\") as f: \n",
+ "    train_ds = pickle.load(f)\n",
+ "\n",
+ "with open(\"test.pkl\", \"rb\") as f: \n",
+ "    test_ds = pickle.load(f)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "8d4532ef-acd8-4b40-921c-ca27891cab7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Documents for X / Price for y\n",
+ "\n",
+ "y = np.array([float(item.price) for item in train_ds])\n",
+ "documents = [str(item).split(\"= $\")[0].replace(\"<\", \"\").strip() for item in train_ds]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "8bfa2437-6376-4abb-ad75-01d9eef0253a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Use the HashingVectorizer for a Bag of Words model\n",
+ "\n",
+ "np.random.seed(42)\n",
+ "vectorizer = HashingVectorizer(n_features=5000, stop_words=\"english\", binary=True)\n",
+ "X = vectorizer.fit_transform(documents)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "31154065-3973-4afa-a9f9-9f348c105ba1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class NeuralNetwork(nn.Module): \n",
+ "    def __init__(self, input_size): \n",
+ "        super(NeuralNetwork, self).__init__()\n",
+ "        self.layer1 = nn.Linear(input_size, 128)\n",
+ "        self.layer2 = nn.Linear(128, 64)\n",
+ "        self.layer3 = nn.Linear(64, 64)\n",
+ "        self.layer4 = nn.Linear(64, 64)\n",
+ "        self.layer5 = nn.Linear(64, 64) \n",
+ "        self.layer6 = nn.Linear(64, 64)\n",
+ "        self.layer7 = nn.Linear(64, 64)\n",
+ "        self.layer8 = nn.Linear(64, 1)\n",
+ "        self.relu = nn.ReLU()\n",
+ "\n",
+ "    def forward(self, x): \n",
+ "        output1 = self.relu(self.layer1(x))\n",
+ "        output2 = self.relu(self.layer2(output1))\n",
+ "        output3 = self.relu(self.layer3(output2))\n",
+ "        output4 = self.relu(self.layer4(output3))\n",
+ "        output5 = self.relu(self.layer5(output4)) \n",
+ "        output6 = self.relu(self.layer6(output5)) \n",
+ "        output7 = self.relu(self.layer7(output6)) \n",
+ "        output8 = self.layer8(output7)\n",
+ "\n",
+ "        return output8"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "6d538f09-4c01-4e1b-bb6f-661d196ea05b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Convert data to PyTorch tensors\n",
+ "X_train_tensor = torch.FloatTensor(X.toarray())\n",
+ "y_train_tensor = torch.FloatTensor(y).unsqueeze(1)\n",
+ "\n",
+ "### Split the data into training and validation sets\n",
+ "X_train, X_val, y_train, y_val = train_test_split(\n",
+ "    X_train_tensor, \n",
+ "    y_train_tensor, \n",
+ "    test_size=0.01, \n",
+ "    random_state=42\n",
+ ")\n",
+ "\n",
+ "### Create the loader (modified the original batch_size 64)\n",
+ "train_dataset = TensorDataset(X_train, y_train)\n",
+ "train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)\n",
+ "\n",
+ "### Initialize the model\n",
+ "input_size = X_train_tensor.shape[1]\n",
+ "model = NeuralNetwork(input_size)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "b8b2d948-011d-4a84-b49e-f5dba399511a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of trainable parameters: 669,249\n"
+ ]
+ }
+ ],
+ "source": [
+ "trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
+ "\n",
+ "print(f\"Number of trainable parameters: {trainable_params:,}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "bb19432d-dc08-4e7e-a515-834c50608e3d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 197.43it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch [1/3], Train Loss: 39517.398, Val Loss: 19185.752\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 214.18it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch [2/3], Train Loss: 40145.402, Val Loss: 18731.053\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|█████████████████████████████████████████████████████████████████████████| 1161/1161 [00:05<00:00, 225.37it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch [3/3], Train Loss: 12280.417, Val Loss: 17233.447\n"
+ ]
+ }
+ ],
+ "source": [
+ "## Define loss function and optimizer \n",
+ "\n",
+ "loss_function = nn.MSELoss() \n",
+ "optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
+ "\n",
+ "EPOCHS = 3\n",
+ "\n",
+ "for epoch in range(EPOCHS): \n",
+ "    model.train()\n",
+ "    for batch_X, batch_y in tqdm(train_loader): \n",
+ "        optimizer.zero_grad()\n",
+ "\n",
+ "        ### forward pass, loss calculation, backward pass, optimizer\n",
+ "        outputs = model(batch_X)\n",
+ "        loss = loss_function(outputs, batch_y)\n",
+ "        loss.backward()\n",
+ "        optimizer.step()\n",
+ "\n",
+ "    model.eval()\n",
+ "    with torch.no_grad(): \n",
+ "        val_outputs = model(X_val)\n",
+ "        val_loss = loss_function(val_outputs, y_val)\n",
+ "\n",
+ "    print(f'Epoch [{epoch+1}/{EPOCHS}], Train Loss: {loss.item():.3f}, Val Loss: {val_loss.item():.3f}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "7eafa04b-ec1a-4905-b2db-1c37adc2e153",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def neural_network(item): \n",
+ "    model.eval()\n",
+ "    with torch.no_grad(): \n",
+ "        vector = vectorizer.transform([item])\n",
+ "        vector = torch.FloatTensor(vector.toarray())\n",
+ "        result = model(vector)[0].item()\n",
+ "\n",
+ "    return max(0, result)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "982bd690-bf50-4244-bece-66e647a639d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_items = [\n",
+ "    {\n",
+ "        \"item\": str(item).split(\"= $\")[0].replace(\"<\", \"\").strip(), \n",
+ "        \"price\": item.price, \n",
+ "        \"title\": item.title\n",
+ "    }\n",
+ "    for item in test_ds\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8b18a98d-6199-4793-8e10-595c6aad98c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from prototypes.testing_for_neural_network import TesterForNeuralNetwork\n",
+ "\n",
+ "TesterForNeuralNetwork.test(neural_network, test_items)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10e61448-0211-49e0-865b-dd6fc4498a41",
+ "metadata": {},
+ "source": [
+ "## Save the neural network model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "fb70ea49-7df7-4d98-9e72-3efa59a94d21",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['vectorizer.joblib']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "torch.save(model.state_dict(), \"models/neural_network_pricer_model.pt\")\n",
+ "\n",
+ "import joblib\n",
+ "joblib.dump(vectorizer, \"vectorizer.joblib\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "303a5497-e6e7-4f4b-81e5-fc36665073a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<class 'sklearn.feature_extraction.text.HashingVectorizer'>\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(type(vectorizer))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf3e1e1f-7393-4fc2-8147-8f3850a2f3e0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
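
Note: the notebook above saves the weights with torch.save(model.state_dict(), ...) and the vectorizer with joblib.dump(...). The sketch below is a minimal, illustrative way those two artifacts could be reloaded for inference; it assumes the repository root as the working directory, that the class in models/neural_network.py is importable as models.neural_network, that the committed models/vectorizer.joblib is the vectorizer the notebook dumped, and the sample product string is made up.

import joblib
import torch

from models.neural_network import NeuralNetwork  # assumed import path

# Assumed paths: the committed artifacts under models/
vectorizer = joblib.load("models/vectorizer.joblib")   # HashingVectorizer(n_features=5000, stop_words="english", binary=True)
model = NeuralNetwork(input_size=5000)                  # must match n_features used at training time
model.load_state_dict(torch.load("models/neural_network_pricer_model.pt", map_location="cpu"))
model.eval()

def predict_price(description: str) -> float:
    # Vectorize the product text, run a forward pass, and clamp at zero, as in the notebook.
    with torch.no_grad():
        x = torch.FloatTensor(vectorizer.transform([description]).toarray())
        return max(0.0, model(x)[0].item())

print(predict_price("USB-C charging cable, 2 m"))       # illustrative input only
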
models/__pycache__/neural_network.cpython-313.pyc ADDED
Binary file (2.44 kB).
 
models/neural_network.ipynb ADDED
The diff for this file is too large to render.
 
models/neural_network.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+ import torch.nn as nn
+
+ class NeuralNetwork(nn.Module):
+     def __init__(self, input_size):
+         super(NeuralNetwork, self).__init__()
+         self.layer1 = nn.Linear(input_size, 128)
+         self.layer2 = nn.Linear(128, 64)
+         self.layer3 = nn.Linear(64, 64)
+         self.layer4 = nn.Linear(64, 64)
+         self.layer5 = nn.Linear(64, 64)
+         self.layer6 = nn.Linear(64, 64)
+         self.layer7 = nn.Linear(64, 64)
+         self.layer8 = nn.Linear(64, 1)
+         self.relu = nn.ReLU()
+
+     def forward(self, x):
+         output1 = self.relu(self.layer1(x))
+         output2 = self.relu(self.layer2(output1))
+         output3 = self.relu(self.layer3(output2))
+         output4 = self.relu(self.layer4(output3))
+         output5 = self.relu(self.layer5(output4))
+         output6 = self.relu(self.layer6(output5))
+         output7 = self.relu(self.layer7(output6))
+         output8 = self.layer8(output7)
+
+         return output8
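
Note: a quick sanity check for the module above. The 5,000-dimensional input matches the HashingVectorizer(n_features=5000) used in the training notebook, and the parameter count should come out to the 669,249 reported there; the import path is an assumption about how the files are laid out.

import torch

from models.neural_network import NeuralNetwork  # assumed import path

model = NeuralNetwork(input_size=5000)            # 5000 = n_features of the HashingVectorizer
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_params:,} trainable parameters")       # 669,249, matching the notebook output

dummy = torch.zeros(2, 5000)                      # a batch of two bag-of-words vectors
print(model(dummy).shape)                         # torch.Size([2, 1]) -> one predicted price per item
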
models/neural_network_pricer_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3df70692cef20ed4618414252a343d450c2156a9aa8305c30422b900d6fe28db
+ size 2683243
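
Note: because the checkpoint is stored via Git LFS, a clone without `git lfs pull` contains only the three-line pointer shown above rather than the 2.6 MB weights. Below is a small guard one might add before torch.load; it is illustrative and not part of the repository.

from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    # An unresolved pointer is a tiny text file beginning with the LFS spec line shown above.
    head = Path(path).read_bytes()[:64]
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

if is_lfs_pointer("models/neural_network_pricer_model.pt"):
    raise RuntimeError("Checkpoint is still an LFS pointer; run `git lfs pull` first.")
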
models/vectorizer.joblib ADDED
Binary file (424 Bytes).