IAMTFRMZA Michelangiolo committed on
Commit
3e23545
·
0 Parent(s):

Duplicate from goliathaiconsulting/airbnb-search-engine

Browse files

Co-authored-by: Mazzeschi <Michelangiolo@users.noreply.huggingface.co>

Files changed (7) hide show
  1. .gitattributes +34 -0
  2. Airbnb_Open_Data.csv +0 -0
  3. README.md +13 -0
  4. airbnb.ipynb +604 -0
  5. app.py +91 -0
  6. df_encoded.parquet +3 -0
  7. history.ipynb +107 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Airbnb_Open_Data.csv ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Airbnb Search Engine
3
+ emoji: 🐢
4
+ colorFrom: green
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.23.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: goliathaiconsulting/airbnb-search-engine
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
airbnb.ipynb ADDED
@@ -0,0 +1,604 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 3,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "text/plain": [
11
+ "Index(['id', 'NAME', 'host id', 'host name', 'neighbourhood group',\n",
12
+ " 'neighbourhood', 'lat', 'long', 'country', 'country code',\n",
13
+ " 'instant_bookable', 'cancellation_policy', 'room type',\n",
14
+ " 'Construction year', 'price', 'service fee', 'minimum nights',\n",
15
+ " 'number of reviews', 'last review', 'reviews per month',\n",
16
+ " 'review rate number', 'calculated host listings count',\n",
17
+ " 'availability 365', 'house_rules', 'license'],\n",
18
+ " dtype='object')"
19
+ ]
20
+ },
21
+ "execution_count": 3,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "df.columns"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": 71,
33
+ "metadata": {},
34
+ "outputs": [
35
+ {
36
+ "name": "stderr",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "C:\\Users\\ardit\\AppData\\Local\\Temp\\ipykernel_25752\\2207992772.py:4: DtypeWarning: Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
40
+ " df = pd.read_csv('Airbnb_Open_Data.csv')\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "import pandas as pd\n",
46
+ "import random\n",
47
+ "\n",
48
+ "df = pd.read_csv('Airbnb_Open_Data.csv')\n",
49
+ "df = df.drop('host_identity_verified', axis=1)\n",
50
+ "df['description'] = df['NAME']\n",
51
+ "df['price'] = df['price'].dropna().apply(lambda x : int(x[1:].strip().replace(',', '')))\n",
52
+ "df['sq. meters'] = df['price'].apply(lambda x : random.choices([25, 40, 45, 55, 60, 70], weights=[5, 5, 4, 3, 2, 1])[0])\n",
53
+ "df = df[['price', 'sq. meters', 'description', 'neighbourhood group', 'host name', 'cancellation_policy', 'house_rules']]\n",
54
+ "df = df[df['house_rules']!='#NAME?'].dropna().reset_index(drop=True)\n",
55
+ "df = df[0:10000]"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 72,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "100%|██████████| 10000/10000 [17:37<00:00, 9.45it/s]\n"
68
+ ]
69
+ },
70
+ {
71
+ "data": {
72
+ "text/html": [
73
+ "<div>\n",
74
+ "<style scoped>\n",
75
+ " .dataframe tbody tr th:only-of-type {\n",
76
+ " vertical-align: middle;\n",
77
+ " }\n",
78
+ "\n",
79
+ " .dataframe tbody tr th {\n",
80
+ " vertical-align: top;\n",
81
+ " }\n",
82
+ "\n",
83
+ " .dataframe thead th {\n",
84
+ " text-align: right;\n",
85
+ " }\n",
86
+ "</style>\n",
87
+ "<table border=\"1\" class=\"dataframe\">\n",
88
+ " <thead>\n",
89
+ " <tr style=\"text-align: right;\">\n",
90
+ " <th></th>\n",
91
+ " <th>price</th>\n",
92
+ " <th>sq. meters</th>\n",
93
+ " <th>description</th>\n",
94
+ " <th>neighbourhood group</th>\n",
95
+ " <th>host name</th>\n",
96
+ " <th>cancellation_policy</th>\n",
97
+ " <th>house_rules</th>\n",
98
+ " <th>text_vector_</th>\n",
99
+ " </tr>\n",
100
+ " </thead>\n",
101
+ " <tbody>\n",
102
+ " <tr>\n",
103
+ " <th>0</th>\n",
104
+ " <td>966.0</td>\n",
105
+ " <td>25</td>\n",
106
+ " <td>Clean &amp; quiet apt home by the park</td>\n",
107
+ " <td>Brooklyn</td>\n",
108
+ " <td>Madaline</td>\n",
109
+ " <td>strict</td>\n",
110
+ " <td>Clean up and treat the home the way you'd like...</td>\n",
111
+ " <td>[-0.047521110624074936, 0.03044620156288147, 0...</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>1</th>\n",
115
+ " <td>142.0</td>\n",
116
+ " <td>25</td>\n",
117
+ " <td>Skylit Midtown Castle</td>\n",
118
+ " <td>Manhattan</td>\n",
119
+ " <td>Jenna</td>\n",
120
+ " <td>moderate</td>\n",
121
+ " <td>Pet friendly but please confirm with me if the...</td>\n",
122
+ " <td>[-0.04690079391002655, 0.061329323798418045, 0...</td>\n",
123
+ " </tr>\n",
124
+ " <tr>\n",
125
+ " <th>2</th>\n",
126
+ " <td>620.0</td>\n",
127
+ " <td>45</td>\n",
128
+ " <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
129
+ " <td>Manhattan</td>\n",
130
+ " <td>Elise</td>\n",
131
+ " <td>flexible</td>\n",
132
+ " <td>I encourage you to use my kitchen, cooking and...</td>\n",
133
+ " <td>[0.00039011164335533977, 0.018310122191905975,...</td>\n",
134
+ " </tr>\n",
135
+ " <tr>\n",
136
+ " <th>3</th>\n",
137
+ " <td>204.0</td>\n",
138
+ " <td>55</td>\n",
139
+ " <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
140
+ " <td>Manhattan</td>\n",
141
+ " <td>Lyndon</td>\n",
142
+ " <td>moderate</td>\n",
143
+ " <td>Please no smoking in the house, porch or on th...</td>\n",
144
+ " <td>[-0.04602213576436043, 0.015605293214321136, 0...</td>\n",
145
+ " </tr>\n",
146
+ " <tr>\n",
147
+ " <th>4</th>\n",
148
+ " <td>577.0</td>\n",
149
+ " <td>25</td>\n",
150
+ " <td>Large Cozy 1 BR Apartment In Midtown East</td>\n",
151
+ " <td>Manhattan</td>\n",
152
+ " <td>Michelle</td>\n",
153
+ " <td>flexible</td>\n",
154
+ " <td>No smoking, please, and no drugs.</td>\n",
155
+ " <td>[-0.04859349876642227, -0.01263828668743372, 0...</td>\n",
156
+ " </tr>\n",
157
+ " <tr>\n",
158
+ " <th>...</th>\n",
159
+ " <td>...</td>\n",
160
+ " <td>...</td>\n",
161
+ " <td>...</td>\n",
162
+ " <td>...</td>\n",
163
+ " <td>...</td>\n",
164
+ " <td>...</td>\n",
165
+ " <td>...</td>\n",
166
+ " <td>...</td>\n",
167
+ " </tr>\n",
168
+ " <tr>\n",
169
+ " <th>9995</th>\n",
170
+ " <td>745.0</td>\n",
171
+ " <td>60</td>\n",
172
+ " <td>Upper West Side 1BR next to subway/Central Park</td>\n",
173
+ " <td>Manhattan</td>\n",
174
+ " <td>Doreen</td>\n",
175
+ " <td>strict</td>\n",
176
+ " <td>Our Herbivorian House manual with detailed rul...</td>\n",
177
+ " <td>[-0.0346745029091835, -0.005859952419996262, 0...</td>\n",
178
+ " </tr>\n",
179
+ " <tr>\n",
180
+ " <th>9996</th>\n",
181
+ " <td>1135.0</td>\n",
182
+ " <td>45</td>\n",
183
+ " <td>Modern and Bright Studio Apt in Williamsburg</td>\n",
184
+ " <td>Brooklyn</td>\n",
185
+ " <td>Shannon</td>\n",
186
+ " <td>strict</td>\n",
187
+ " <td>No smoking please!</td>\n",
188
+ " <td>[-0.016586357727646828, 0.020517650991678238, ...</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>9997</th>\n",
192
+ " <td>59.0</td>\n",
193
+ " <td>45</td>\n",
194
+ " <td>Holiday in Trendy Williamsburg Apt!</td>\n",
195
+ " <td>Brooklyn</td>\n",
196
+ " <td>Peter</td>\n",
197
+ " <td>strict</td>\n",
198
+ " <td>We suggest you use email or texting contact us...</td>\n",
199
+ " <td>[-0.05095353722572327, 0.08510775864124298, -0...</td>\n",
200
+ " </tr>\n",
201
+ " <tr>\n",
202
+ " <th>9998</th>\n",
203
+ " <td>1055.0</td>\n",
204
+ " <td>25</td>\n",
205
+ " <td>Greenwich Village| Private Queen room</td>\n",
206
+ " <td>Manhattan</td>\n",
207
+ " <td>Kelly</td>\n",
208
+ " <td>flexible</td>\n",
209
+ " <td>Please treat this house as if it is your own. ...</td>\n",
210
+ " <td>[0.00017118529649451375, 0.010939894244074821,...</td>\n",
211
+ " </tr>\n",
212
+ " <tr>\n",
213
+ " <th>9999</th>\n",
214
+ " <td>285.0</td>\n",
215
+ " <td>25</td>\n",
216
+ " <td>Comfortable bedroom in spacious apt</td>\n",
217
+ " <td>Brooklyn</td>\n",
218
+ " <td>Arthur</td>\n",
219
+ " <td>strict</td>\n",
220
+ " <td>Please, No smoking and no pets. We do require ...</td>\n",
221
+ " <td>[-0.01795135624706745, -0.029596544802188873, ...</td>\n",
222
+ " </tr>\n",
223
+ " </tbody>\n",
224
+ "</table>\n",
225
+ "<p>10000 rows × 8 columns</p>\n",
226
+ "</div>"
227
+ ],
228
+ "text/plain": [
229
+ " price sq. meters description \\\n",
230
+ "0 966.0 25 Clean & quiet apt home by the park \n",
231
+ "1 142.0 25 Skylit Midtown Castle \n",
232
+ "2 620.0 45 THE VILLAGE OF HARLEM....NEW YORK ! \n",
233
+ "3 204.0 55 Entire Apt: Spacious Studio/Loft by central park \n",
234
+ "4 577.0 25 Large Cozy 1 BR Apartment In Midtown East \n",
235
+ "... ... ... ... \n",
236
+ "9995 745.0 60 Upper West Side 1BR next to subway/Central Park \n",
237
+ "9996 1135.0 45 Modern and Bright Studio Apt in Williamsburg \n",
238
+ "9997 59.0 45 Holiday in Trendy Williamsburg Apt! \n",
239
+ "9998 1055.0 25 Greenwich Village| Private Queen room \n",
240
+ "9999 285.0 25 Comfortable bedroom in spacious apt \n",
241
+ "\n",
242
+ " neighbourhood group host name cancellation_policy \\\n",
243
+ "0 Brooklyn Madaline strict \n",
244
+ "1 Manhattan Jenna moderate \n",
245
+ "2 Manhattan Elise flexible \n",
246
+ "3 Manhattan Lyndon moderate \n",
247
+ "4 Manhattan Michelle flexible \n",
248
+ "... ... ... ... \n",
249
+ "9995 Manhattan Doreen strict \n",
250
+ "9996 Brooklyn Shannon strict \n",
251
+ "9997 Brooklyn Peter strict \n",
252
+ "9998 Manhattan Kelly flexible \n",
253
+ "9999 Brooklyn Arthur strict \n",
254
+ "\n",
255
+ " house_rules \\\n",
256
+ "0 Clean up and treat the home the way you'd like... \n",
257
+ "1 Pet friendly but please confirm with me if the... \n",
258
+ "2 I encourage you to use my kitchen, cooking and... \n",
259
+ "3 Please no smoking in the house, porch or on th... \n",
260
+ "4 No smoking, please, and no drugs. \n",
261
+ "... ... \n",
262
+ "9995 Our Herbivorian House manual with detailed rul... \n",
263
+ "9996 No smoking please! \n",
264
+ "9997 We suggest you use email or texting contact us... \n",
265
+ "9998 Please treat this house as if it is your own. ... \n",
266
+ "9999 Please, No smoking and no pets. We do require ... \n",
267
+ "\n",
268
+ " text_vector_ \n",
269
+ "0 [-0.047521110624074936, 0.03044620156288147, 0... \n",
270
+ "1 [-0.04690079391002655, 0.061329323798418045, 0... \n",
271
+ "2 [0.00039011164335533977, 0.018310122191905975,... \n",
272
+ "3 [-0.04602213576436043, 0.015605293214321136, 0... \n",
273
+ "4 [-0.04859349876642227, -0.01263828668743372, 0... \n",
274
+ "... ... \n",
275
+ "9995 [-0.0346745029091835, -0.005859952419996262, 0... \n",
276
+ "9996 [-0.016586357727646828, 0.020517650991678238, ... \n",
277
+ "9997 [-0.05095353722572327, 0.08510775864124298, -0... \n",
278
+ "9998 [0.00017118529649451375, 0.010939894244074821,... \n",
279
+ "9999 [-0.01795135624706745, -0.029596544802188873, ... \n",
280
+ "\n",
281
+ "[10000 rows x 8 columns]"
282
+ ]
283
+ },
284
+ "execution_count": 72,
285
+ "metadata": {},
286
+ "output_type": "execute_result"
287
+ }
288
+ ],
289
+ "source": [
290
+ "import pandas as pd\n",
291
+ "from tqdm import tqdm\n",
292
+ "from sentence_transformers import SentenceTransformer\n",
293
+ "tqdm.pandas()\n",
294
+ "\n",
295
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
296
+ "\n",
297
+ "#encode df version: for small dataset only\n",
298
+ "df['text_vector_'] = df['description'].progress_apply(lambda x : model.encode(x).tolist())\n",
299
+ "df"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "df = pd.read_parquet('df_encoded.parquet')\n",
309
+ "df['neighbourhood group'][0:2500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Manhattan')\n",
310
+ "df['neighbourhood group'][2500:5000] = df['neighbourhood group'][0:2500].apply(lambda x : 'Brooklyn')\n",
311
+ "df['neighbourhood group'][5000:7500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Queens')\n",
312
+ "df['neighbourhood group'][7500:] = df['neighbourhood group'][0:2500].apply(lambda x : 'Bronx')\n",
313
+ "df['location'] = df['neighbourhood group']\n",
314
+ "df = df[['price', 'sq. meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]\n",
315
+ "df = df.reset_index(drop=True)\n",
316
+ "df"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 145,
322
+ "metadata": {},
323
+ "outputs": [],
324
+ "source": [
325
+ "from sklearn.neighbors import NearestNeighbors\n",
326
+ "import numpy as np\n",
327
+ "import pandas as pd\n",
328
+ "\n",
329
+ "from sentence_transformers import SentenceTransformer\n",
330
+ "\n",
331
+ "# df = df.read_parquet('df_encoded.parquet')\n",
332
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
333
+ "\n",
334
+ "#prepare model\n",
335
+ "# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": null,
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "import gradio as gr\n",
345
+ "import statistics\n",
346
+ "\n",
347
+ "def closest_number(x):\n",
348
+ " closest_numbers = [10, 20, 30, 40]\n",
349
+ " closest_number = closest_numbers[0]\n",
350
+ " min_distance = abs(x - closest_number)\n",
351
+ " for number in closest_numbers[1:]:\n",
352
+ " distance = abs(x - number)\n",
353
+ " if distance < min_distance:\n",
354
+ " closest_number = number\n",
355
+ " min_distance = distance\n",
356
+ " return closest_number\n",
357
+ "\n",
358
+ "def search(df, query):\n",
359
+ " product = model.encode(query).tolist()\n",
360
+ " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
361
+ "\n",
362
+ " nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
363
+ " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
364
+ "\n",
365
+ " #print out the description of every recommended product\n",
366
+ " df_search = df.iloc[list(indices)[0]].drop(['text_vector_'], axis=1) #.sort_values('avgFeedbackScore', ascending=False)\n",
367
+ "\n",
368
+ " return df_search.sort_values('price', ascending=False)\n",
369
+ "\n",
370
+ "def filter_df(df, column_name, filter_type, filter_value):\n",
371
+ " if filter_type == '==':\n",
372
+ " df_filtered = df[df[column_name]==filter_value]\n",
373
+ " elif filter_type == '>=':\n",
374
+ " df_filtered = df[df[column_name]>=filter_value]\n",
375
+ " elif filter_type == '<=':\n",
376
+ " df_filtered = df[df[column_name]<=filter_value]\n",
377
+ " return df_filtered\n",
378
+ "\n",
379
+ "history = list()\n",
380
+ "def predict(input1, input2, input3, input4):\n",
381
+ " history.append([input1, input2, input3, input4])\n",
382
+ "\n",
383
+ " print(history)\n",
384
+ " df_location = filter_df(df, 'location', '==', input3)\n",
385
+ " df_size = filter_df(df_location, 'sq. meters', '==', input2)\n",
386
+ " df_price = filter_df(df_size, 'price', '<=', input1)\n",
387
+ " df_result = search(df_price, input4)\n",
388
+ "\n",
389
+ " prediction = [\n",
390
+ " round(statistics.mean([x[0] for x in history])), #price\n",
391
+ " closest_number(statistics.mean([x[1] for x in history])), #square room\n",
392
+ " statistics.mode([x[2] for x in history]) #state\n",
393
+ " ]\n",
394
+ "\n",
395
+ " return df_result, prediction\n",
396
+ "\n",
397
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
398
+ " gr.Markdown(\n",
399
+ " \"\"\"\n",
400
+ " # Airbnb Search Engine\n",
401
+ " \"\"\"\n",
402
+ " )\n",
403
+ " input1 = gr.Slider(100, 1200, value=700, step_size=100, label=\"Max Price\")\n",
404
+ " input2 = gr.Radio([25, 40, 45, 55, 60, 70], multiselect=False, label='square meters', value=45)\n",
405
+ " input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], multiselect=False, label='State', value='Queens')\n",
406
+ " input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')\n",
407
+ "\n",
408
+ " btn = gr.Button(value=\"Search for a Room\")\n",
409
+ " output1 = gr.Dataframe()\n",
410
+ " output2 = gr.Textbox(label='prediction for the next search')\n",
411
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
412
+ " btn.click(predict, [input1, input2, input3, input4], [output1, output2])\n",
413
+ "demo.launch(share=False)"
414
+ ]
415
+ },
416
+ {
417
+ "cell_type": "code",
418
+ "execution_count": null,
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "import os\n",
423
+ "os.system('pip install openpyxl')\n",
424
+ "os.system('pip install sentence-transformers')\n",
425
+ "import pandas as pd\n",
426
+ "import gradio as gr\n",
427
+ "import statistics\n",
428
+ "from sklearn.neighbors import NearestNeighbors\n",
429
+ "from sentence_transformers import SentenceTransformer\n",
430
+ "\n",
431
+ "df = pd.read_parquet('df_encoded.parquet')\n",
432
+ "df['neighbourhood group'][0:2500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Manhattan')\n",
433
+ "df['neighbourhood group'][2500:5000] = df['neighbourhood group'][0:2500].apply(lambda x : 'Brooklyn')\n",
434
+ "df['neighbourhood group'][5000:7500] = df['neighbourhood group'][0:2500].apply(lambda x : 'Queens')\n",
435
+ "df['neighbourhood group'][7500:] = df['neighbourhood group'][0:2500].apply(lambda x : 'Bronx')\n",
436
+ "df['location'] = df['neighbourhood group']\n",
437
+ "df = df[['price', 'sq. meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]\n",
438
+ "df = df.reset_index(drop=True)\n",
439
+ "df\n",
440
+ "\n",
441
+ "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
442
+ "\n",
443
+ "#prepare model #we run it anew in the search function every time, after the initial filtering\n",
444
+ "# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
445
+ "\n",
446
+ "def closest_number(x):\n",
447
+ " closest_numbers = [25, 40, 45, 55, 60, 70]\n",
448
+ " closest_number = closest_numbers[0]\n",
449
+ " min_distance = abs(x - closest_number)\n",
450
+ " for number in closest_numbers[1:]:\n",
451
+ " distance = abs(x - number)\n",
452
+ " if distance < min_distance:\n",
453
+ " closest_number = number\n",
454
+ " min_distance = distance\n",
455
+ " return closest_number\n",
456
+ "\n",
457
+ "def search(df, query):\n",
458
+ " product = model.encode(query).tolist()\n",
459
+ " # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
460
+ "\n",
461
+ " nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())\n",
462
+ " distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
463
+ "\n",
464
+ " #print out the description of every recommended product\n",
465
+ " df_search = df.iloc[list(indices)[0]].drop(['text_vector_'], axis=1) #.sort_values('avgFeedbackScore', ascending=False)\n",
466
+ "\n",
467
+ " return df_search.sort_values('price', ascending=False)\n",
468
+ "\n",
469
+ "def filter_df(df, column_name, filter_type, filter_value):\n",
470
+ " if filter_type == '==':\n",
471
+ " df_filtered = df[df[column_name]==filter_value]\n",
472
+ " elif filter_type == '>=':\n",
473
+ " df_filtered = df[df[column_name]>=filter_value]\n",
474
+ " elif filter_type == '<=':\n",
475
+ " df_filtered = df[df[column_name]<=filter_value]\n",
476
+ " return df_filtered"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": 7,
482
+ "metadata": {},
483
+ "outputs": [],
484
+ "source": [
485
+ "def predict(history, input1, input2, input3, input4):\n",
486
+ " history.append([input1, input2, input3, input4])\n",
487
+ "\n",
488
+ " print(history)\n",
489
+ " df_location = filter_df(df, 'location', '==', input3)\n",
490
+ " df_size = filter_df(df_location, 'sq. meters', '==', input2)\n",
491
+ " df_price = filter_df(df_size, 'price', '<=', input1)\n",
492
+ " df_result = search(df_price, input4)\n",
493
+ "\n",
494
+ " prediction = [\n",
495
+ " round(statistics.mean([x[0] for x in history])), #price\n",
496
+ " closest_number(statistics.mean([x[1] for x in history])), #square meters\n",
497
+ " statistics.mode([x[2] for x in history]) #state\n",
498
+ " ]\n",
499
+ "\n",
500
+ " print(history)\n",
501
+ "\n",
502
+ " return df_result, prediction"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": 8,
508
+ "metadata": {},
509
+ "outputs": [
510
+ {
511
+ "name": "stderr",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Slider, please remove them: {'step_size': 100}\n",
515
+ " warnings.warn(\n",
516
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
517
+ " warnings.warn(\n"
518
+ ]
519
+ },
520
+ {
521
+ "name": "stdout",
522
+ "output_type": "stream",
523
+ "text": [
524
+ "Running on local URL: http://127.0.0.1:7863\n",
525
+ "\n",
526
+ "To create a public link, set `share=True` in `launch()`.\n"
527
+ ]
528
+ },
529
+ {
530
+ "data": {
531
+ "text/html": [
532
+ "<div><iframe src=\"http://127.0.0.1:7863/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
533
+ ],
534
+ "text/plain": [
535
+ "<IPython.core.display.HTML object>"
536
+ ]
537
+ },
538
+ "metadata": {},
539
+ "output_type": "display_data"
540
+ },
541
+ {
542
+ "data": {
543
+ "text/plain": []
544
+ },
545
+ "execution_count": 8,
546
+ "metadata": {},
547
+ "output_type": "execute_result"
548
+ },
549
+ {
550
+ "name": "stdout",
551
+ "output_type": "stream",
552
+ "text": [
553
+ "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n",
554
+ "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n",
555
+ "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!'], [700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n",
556
+ "[[700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!'], [700, 45, 'Brooklyn', 'I want to take a break from work 😴!!!']]\n"
557
+ ]
558
+ }
559
+ ],
560
+ "source": [
561
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
562
+ " history = gr.Variable(value=[]) #beginning\n",
563
+ " gr.Markdown(\n",
564
+ " \"\"\"\n",
565
+ " # Airbnb Search Engine\n",
566
+ " \"\"\"\n",
567
+ " )\n",
568
+ " input1 = gr.Slider(100, 1200, value=700, step_size=100, label=\"Max Price\")\n",
569
+ " input2 = gr.Radio([25, 40, 45, 55, 60, 70], multiselect=False, label='square meters', value=45)\n",
570
+ " input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], multiselect=False, label='State', value='Brooklyn')\n",
571
+ " input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')\n",
572
+ "\n",
573
+ " btn = gr.Button(value=\"Search for a Room\")\n",
574
+ " output1 = gr.Dataframe()\n",
575
+ " output2 = gr.Textbox(label='prediction for the next search')\n",
576
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
577
+ " btn.click(predict, [history, input1, input2, input3, input4], [output1, output2])\n",
578
+ "demo.launch(share=False)"
579
+ ]
580
+ }
581
+ ],
582
+ "metadata": {
583
+ "kernelspec": {
584
+ "display_name": "Python 3",
585
+ "language": "python",
586
+ "name": "python3"
587
+ },
588
+ "language_info": {
589
+ "codemirror_mode": {
590
+ "name": "ipython",
591
+ "version": 3
592
+ },
593
+ "file_extension": ".py",
594
+ "mimetype": "text/x-python",
595
+ "name": "python",
596
+ "nbconvert_exporter": "python",
597
+ "pygments_lexer": "ipython3",
598
+ "version": "3.9.13"
599
+ },
600
+ "orig_nbformat": 4
601
+ },
602
+ "nbformat": 4,
603
+ "nbformat_minor": 2
604
+ }
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.system('pip install openpyxl')
3
+ os.system('pip install sentence-transformers')
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import statistics
7
+ from sklearn.neighbors import NearestNeighbors
8
+ from sentence_transformers import SentenceTransformer
9
+
10
# Load the pre-encoded listings (one sentence-embedding per row in 'text_vector_').
df = pd.read_parquet('df_encoded.parquet')

# Demo data: overwrite the borough column in fixed bands of 2,500 rows so that
# every option offered by the UI has listings behind it.
# The write goes through a single positional `.iloc` call per band instead of
# chained indexing (`df[col][a:b] = ...`), which triggers SettingWithCopyWarning
# and silently stops updating `df` under pandas Copy-on-Write.
borough_col = df.columns.get_loc('neighbourhood group')
borough_bands = [
    (0, 2500, 'Manhattan'),
    (2500, 5000, 'Brooklyn'),
    (5000, 7500, 'Queens'),
    (7500, None, 'Bronx'),  # None = through the last row
]
for band_start, band_stop, borough in borough_bands:
    df.iloc[band_start:band_stop, borough_col] = borough

# The UI filters on 'location'; keep only the columns the app displays or searches.
df['location'] = df['neighbourhood group']
df = df[['price', 'sq. meters', 'description', 'location', 'host name', 'cancellation_policy', 'house_rules', 'text_vector_']]
df = df.reset_index(drop=True)

# Sentence encoder used to embed the user's query at search time.
model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2

# The nearest-neighbour index is not built here: search() fits it anew on every
# call, on whatever rows survive the filters.
# nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
24
+
25
def closest_number(x):
    """Snap ``x`` to the nearest of the selectable square-meter sizes.

    Args:
        x: Number to snap (e.g. the mean of previously chosen sizes).

    Returns:
        The entry of ``[25, 40, 45, 55, 60, 70]`` with the smallest absolute
        difference to ``x``. When two entries are equally close, the one
        listed first (the smaller) is returned.
    """
    allowed_sizes = [25, 40, 45, 55, 60, 70]
    # min() keeps the first of several equally-small keys, so ties resolve
    # toward the earlier list entry.
    return min(allowed_sizes, key=lambda size: abs(x - size))
35
+
36
def search(df, query):
    """Return the listings whose embedding is closest to a free-text query.

    Args:
        df: DataFrame of candidate listings. Must contain a 'text_vector_'
            column (one embedding per row) and a 'price' column.
        query: Free-text search string; it is embedded with the module-level
            ``model``.

    Returns:
        A DataFrame of up to 3 nearest listings, without the 'text_vector_'
        column, sorted by 'price' in descending order. If ``df`` has fewer
        than 3 rows, all of them are returned; if it is empty, an empty
        DataFrame with the same columns is returned.
    """
    listings = df.drop(['text_vector_'], axis=1)

    # NearestNeighbors raises if asked for more neighbours than there are
    # fitted samples, which is a normal situation after the UI filters have
    # been applied. Cap k at the number of candidates instead.
    n_neighbors = min(3, len(df))
    if n_neighbors == 0:
        # Nothing survived the filters: nothing to rank, no need to encode.
        return listings

    query_vector = model.encode(query).tolist()

    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
    # kneighbors returns one row of results per query vector; there is one query.
    _, indices = nbrs.kneighbors([query_vector])

    return listings.iloc[indices[0]].sort_values('price', ascending=False)
47
+
48
def filter_df(df, column_name, filter_type, filter_value):
    """Keep the rows of ``df`` whose ``column_name`` satisfies a comparison.

    Args:
        df: DataFrame to filter.
        column_name: Name of the column to compare.
        filter_type: Comparison to apply, one of '==', '>=' or '<='.
        filter_value: Value the column is compared against.

    Returns:
        A DataFrame containing only the matching rows.

    Raises:
        ValueError: If ``filter_type`` is not one of the supported operators.
            (Previously this fell through to an UnboundLocalError.)
    """
    column = df[column_name]
    if filter_type == '==':
        return df[column == filter_value]
    if filter_type == '>=':
        return df[column >= filter_value]
    if filter_type == '<=':
        return df[column <= filter_value]
    raise ValueError(
        f"Unsupported filter_type {filter_type!r}; expected '==', '>=' or '<='"
    )
56
+
57
def predict(history, input1, input2, input3, input4):
    """Run one search and guess the settings of the user's next search.

    Args:
        history: List of previous searches for this session; the current
            search is appended to it in place as
            ``[input1, input2, input3, input4]``.
        input1: Maximum price.
        input2: Square meters (matched exactly).
        input3: Location (matched exactly).
        input4: Free-text query passed to ``search``.

    Returns:
        A tuple ``(df_result, prediction)`` where ``df_result`` is the output
        of ``search`` on the filtered listings and ``prediction`` is
        ``[mean price (rounded), mean size snapped by closest_number,
        most frequent location]`` computed over ``history``.
    """
    history.append([input1, input2, input3, input4])

    print(history)

    # Narrow the module-level listings step by step: location, then size,
    # then price ceiling.
    candidates = df
    for column, comparison, wanted in (
        ('location', '==', input3),
        ('sq. meters', '==', input2),
        ('price', '<=', input1),
    ):
        candidates = filter_df(candidates, column, comparison, wanted)
    df_result = search(candidates, input4)

    # Summarise everything searched so far, including the current request.
    average_price = round(statistics.mean([entry[0] for entry in history]))  # price
    typical_size = closest_number(statistics.mean([entry[1] for entry in history]))  # square meters
    usual_location = statistics.mode([entry[2] for entry in history])  # state
    prediction = [average_price, typical_size, usual_location]

    return df_result, prediction
73
+
74
# Build and launch the search UI.
with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:
    # Per-session list of past searches, handed to predict() on every click.
    # gr.State is the current name of the component formerly called gr.Variable.
    history = gr.State(value=[]) #beginning
    gr.Markdown(
        """
        # Airbnb Search Engine
        """
    )
    # `step` is the Slider argument for the increment; the previous `step_size`
    # keyword was ignored by Gradio (it only produced an "unused kwarg" warning),
    # so the slider never actually moved in steps of 100.
    input1 = gr.Slider(100, 1200, value=700, step=100, label="Max Price")
    # Radio is single-choice by definition; the ignored `multiselect=False`
    # keyword has been removed.
    input2 = gr.Radio([25, 40, 45, 55, 60, 70], label='square meters', value=45)
    input3 = gr.Radio(['Manhattan', 'Brooklyn', 'Queens', 'Bronx'], label='State', value='Brooklyn')
    input4 = gr.Textbox(label='Query', value='I want to take a break from work 😴!!!')

    btn = gr.Button(value="Search for a Room")
    output1 = gr.Dataframe()
    output2 = gr.Textbox(label='prediction for the next search')
    # predict() returns (results table, prediction), mapped to the two outputs in order.
    btn.click(predict, [history, input1, input2, input3, input4], [output1, output2])
demo.launch(share=False)
df_encoded.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe09f27cabb790b1de79ba1483bceded0499ef48627bde47756b1905dd72a91
3
+ size 48169491
history.ipynb ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "c:\\Users\\ardit\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\gradio\\deprecation.py:43: UserWarning: You have unused kwarg parameters in Radio, please remove them: {'multiselect': False}\n",
13
+ " warnings.warn(\n"
14
+ ]
15
+ },
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "Running on local URL: http://127.0.0.1:7861\n",
21
+ "\n",
22
+ "To create a public link, set `share=True` in `launch()`.\n"
23
+ ]
24
+ },
25
+ {
26
+ "data": {
27
+ "text/html": [
28
+ "<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
29
+ ],
30
+ "text/plain": [
31
+ "<IPython.core.display.HTML object>"
32
+ ]
33
+ },
34
+ "metadata": {},
35
+ "output_type": "display_data"
36
+ },
37
+ {
38
+ "data": {
39
+ "text/plain": []
40
+ },
41
+ "execution_count": 2,
42
+ "metadata": {},
43
+ "output_type": "execute_result"
44
+ },
45
+ {
46
+ "name": "stdout",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "[40]\n",
50
+ "[40, 30]\n",
51
+ "[40, 30, 10]\n",
52
+ "[40, 30, 10, 10]\n",
53
+ "[40, 30, 10, 10, 10]\n"
54
+ ]
55
+ }
56
+ ],
57
+ "source": [
58
+ "import gradio as gr\n",
59
+ "import statistics\n",
60
+ "\n",
61
+ "def predict(history, input1):\n",
62
+ " history.append(input1)\n",
63
+ "\n",
64
+ " print(history)\n",
65
+ " total = statistics.mean(history)\n",
66
+ "\n",
67
+ " return total\n",
68
+ "\n",
69
+ "with gr.Blocks(theme=gr.themes.Soft(primary_hue='amber', secondary_hue='gray', neutral_hue='amber')) as demo:\n",
70
+ " gr.Markdown(\n",
71
+ " \"\"\"\n",
72
+ " # Gradio with History\n",
73
+ " \"\"\"\n",
74
+ " )\n",
75
+ " history = gr.Variable(value=[]) #beginning\n",
76
+ " input1 = gr.Radio([10, 20, 30, 40, 50], multiselect=False, label='value')\n",
77
+ " btn = gr.Button(value=\"Search for a Room\")\n",
78
+ " output1 = gr.Textbox(label='value')\n",
79
+ " # btn.click(greet, inputs='text', outputs=['dataframe'])\n",
80
+ " btn.click(predict, [history, input1], [output1])\n",
81
+ "demo.launch(share=False)"
82
+ ]
83
+ }
84
+ ],
85
+ "metadata": {
86
+ "kernelspec": {
87
+ "display_name": "Python 3",
88
+ "language": "python",
89
+ "name": "python3"
90
+ },
91
+ "language_info": {
92
+ "codemirror_mode": {
93
+ "name": "ipython",
94
+ "version": 3
95
+ },
96
+ "file_extension": ".py",
97
+ "mimetype": "text/x-python",
98
+ "name": "python",
99
+ "nbconvert_exporter": "python",
100
+ "pygments_lexer": "ipython3",
101
+ "version": "3.9.13"
102
+ },
103
+ "orig_nbformat": 4
104
+ },
105
+ "nbformat": 4,
106
+ "nbformat_minor": 2
107
+ }