Spaces:
Runtime error
Runtime error
Upload 23 files
Browse files- .gitattributes +1 -0
- Dockerfile +20 -0
- Hotel New York Combined.csv +0 -0
- README.md +1 -13
- Untitled.ipynb +90 -0
- app.py +297 -0
- app.yaml +3 -0
- basic.py +166 -0
- combined_paris.csv +0 -0
- corpus_embeddings_bi_encoder.pickle +3 -0
- corpus_embeddings_bi_encoder.pickle 2 +0 -0
- df_combined.csv +0 -0
- df_combined_paris.csv +0 -0
- embeddings.npy +3 -0
- embeddings_h_r.npy +3 -0
- embeddings_review.npy +3 -0
- en_core_web_sm-3.2.0-py3-none-any.whl +3 -0
- paris-newer.py +295 -0
- paris.py +298 -0
- paris_clean_newer.csv +0 -0
- query_generator.ipynb +0 -0
- requirements.txt +14 -0
- summary.ipynb +654 -0
- tokenized_corpus.pickle +3 -0
.gitattributes
CHANGED
|
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 31 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 32 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 31 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 32 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
en_core_web_sm-3.2.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#Base Image to use
|
| 2 |
+
FROM python:3.7.9
|
| 3 |
+
|
| 4 |
+
#Expose port 8080
|
| 5 |
+
EXPOSE 8080
|
| 6 |
+
|
| 7 |
+
#Copy Requirements.txt file into app directory
|
| 8 |
+
COPY requirements.txt app/requirements.txt
|
| 9 |
+
|
| 10 |
+
#install all requirements in requirements.txt
|
| 11 |
+
RUN pip3 install -r app/requirements.txt
|
| 12 |
+
|
| 13 |
+
#Copy all files in current directory into app directory
|
| 14 |
+
COPY . /app
|
| 15 |
+
|
| 16 |
+
#Change Working Directory to app directory
|
| 17 |
+
WORKDIR /app
|
| 18 |
+
|
| 19 |
+
#Run the application on port 8080
|
| 20 |
+
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
|
Hotel New York Combined.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
README.md
CHANGED
|
@@ -1,13 +1 @@
|
|
| 1 |
-
|
| 2 |
-
title: Parishotel
|
| 3 |
-
emoji: 🐠
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: streamlit
|
| 7 |
-
sdk_version: 1.10.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: apache-2.0
|
| 11 |
-
---
|
| 12 |
-
|
| 13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
assignment3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Untitled.ipynb
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "611a3e0e",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"Init Plugin\n",
|
| 14 |
+
"Init Graph Optimizer\n",
|
| 15 |
+
"Init Kernel\n",
|
| 16 |
+
"Collecting en-core-web-sm==3.2.0\n",
|
| 17 |
+
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
|
| 18 |
+
" |████████████████████████████████| 13.9 MB 463 kB/s \n",
|
| 19 |
+
"\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from en-core-web-sm==3.2.0) (3.2.1)\n",
|
| 20 |
+
"Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)\n",
|
| 21 |
+
"Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)\n",
|
| 22 |
+
"Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
|
| 23 |
+
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.62.3)\n",
|
| 24 |
+
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
|
| 25 |
+
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
|
| 26 |
+
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
|
| 27 |
+
"Requirement already satisfied: packaging>=20.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.0)\n",
|
| 28 |
+
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
|
| 29 |
+
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)\n",
|
| 30 |
+
"Requirement already satisfied: jinja2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.1)\n",
|
| 31 |
+
"Requirement already satisfied: numpy>=1.15.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.21.4)\n",
|
| 32 |
+
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
|
| 33 |
+
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
|
| 34 |
+
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
|
| 35 |
+
"Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.8.2)\n",
|
| 36 |
+
"Requirement already satisfied: pathy>=0.3.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
|
| 37 |
+
"Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
|
| 38 |
+
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.26.0)\n",
|
| 39 |
+
"Requirement already satisfied: setuptools in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (59.0.1)\n",
|
| 40 |
+
"Requirement already satisfied: pyparsing>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.7)\n",
|
| 41 |
+
"Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
|
| 42 |
+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
|
| 43 |
+
"Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.2)\n",
|
| 44 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.5.30)\n",
|
| 45 |
+
"Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.4)\n",
|
| 46 |
+
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.6)\n",
|
| 47 |
+
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
|
| 48 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.1)\n",
|
| 49 |
+
"\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
|
| 50 |
+
"You should consider upgrading via the '/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/bin/python -m pip install --upgrade pip' command.\u001b[0m\n",
|
| 51 |
+
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
|
| 52 |
+
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
| 53 |
+
]
|
| 54 |
+
}
|
| 55 |
+
],
|
| 56 |
+
"source": [
|
| 57 |
+
"!python -m spacy download en_core_web_sm"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": null,
|
| 63 |
+
"id": "51a414e5",
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"outputs": [],
|
| 66 |
+
"source": []
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"metadata": {
|
| 70 |
+
"kernelspec": {
|
| 71 |
+
"display_name": "Python 3.9.5 64-bit ('tensorflow': conda)",
|
| 72 |
+
"language": "python",
|
| 73 |
+
"name": "python395jvsc74a57bd04bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
|
| 74 |
+
},
|
| 75 |
+
"language_info": {
|
| 76 |
+
"codemirror_mode": {
|
| 77 |
+
"name": "ipython",
|
| 78 |
+
"version": 3
|
| 79 |
+
},
|
| 80 |
+
"file_extension": ".py",
|
| 81 |
+
"mimetype": "text/x-python",
|
| 82 |
+
"name": "python",
|
| 83 |
+
"nbconvert_exporter": "python",
|
| 84 |
+
"pygments_lexer": "ipython3",
|
| 85 |
+
"version": "3.9.5"
|
| 86 |
+
}
|
| 87 |
+
},
|
| 88 |
+
"nbformat": 4,
|
| 89 |
+
"nbformat_minor": 5
|
| 90 |
+
}
|
app.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@author: Hamza Farooq
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import spacy
|
| 10 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
| 11 |
+
from string import punctuation
|
| 12 |
+
from collections import Counter
|
| 13 |
+
from heapq import nlargest
|
| 14 |
+
import os
|
| 15 |
+
nlp = spacy.load("en_core_web_sm")
|
| 16 |
+
from spacy import displacy
|
| 17 |
+
import streamlit as st
|
| 18 |
+
import matplotlib.pyplot as plt
|
| 19 |
+
from wordcloud import WordCloud
|
| 20 |
+
from matplotlib import pyplot as plt
|
| 21 |
+
import nltk
|
| 22 |
+
nltk.download('stopwords')
|
| 23 |
+
import geonamescache
|
| 24 |
+
|
| 25 |
+
import os
|
| 26 |
+
import streamlit as st
|
| 27 |
+
import utils as utl
|
| 28 |
+
from PIL import Image
|
| 29 |
+
import time
|
| 30 |
+
import torch
|
| 31 |
+
import transformers
|
| 32 |
+
from transformers import BartTokenizer, BartForConditionalGeneration
|
| 33 |
+
tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
|
| 34 |
+
mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
|
| 35 |
+
# 'gpu' is not a valid torch device string (valid types include 'cpu' and 'cuda').
# The BART model above is loaded without an explicit device and is never moved in this file,
# so inputs are kept on the CPU to match it. NOTE(review): move both model and inputs if a GPU is intended.
torch_device = 'cpu'
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def main():
|
| 39 |
+
# Settings
|
| 40 |
+
st.set_page_config(layout="wide", page_title='New York Hotels')
|
| 41 |
+
def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
|
| 42 |
+
|
| 43 |
+
text = text.replace('\n','')
|
| 44 |
+
text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
|
| 45 |
+
summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
|
| 46 |
+
summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
|
| 47 |
+
return summary_txt
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
gc = geonamescache.GeonamesCache()
|
| 51 |
+
|
| 52 |
+
# gets nested dictionary for countries
|
| 53 |
+
countries = gc.get_countries()
|
| 54 |
+
|
| 55 |
+
# gets nested dictionary for cities
|
| 56 |
+
cities = gc.get_cities()
|
| 57 |
+
# def gen_dict_extract(var, key):
|
| 58 |
+
# if isinstance(var, dict):
|
| 59 |
+
# for k, v in var.items():
|
| 60 |
+
# if k == key:
|
| 61 |
+
# yield v
|
| 62 |
+
# if isinstance(v, (dict, list)):
|
| 63 |
+
# yield from gen_dict_extract(v, key)
|
| 64 |
+
# elif isinstance(var, list):
|
| 65 |
+
# for d in var:
|
| 66 |
+
# yield from gen_dict_extract(d, key)
|
| 67 |
+
#
|
| 68 |
+
# cities = [*gen_dict_extract(cities, 'name')]
|
| 69 |
+
# countries = [*gen_dict_extract(countries, 'name')]
|
| 70 |
+
#
|
| 71 |
+
# cities.append('New York')
|
| 72 |
+
|
| 73 |
+
from nltk.corpus import stopwords
|
| 74 |
+
|
| 75 |
+
stopwords = set(stopwords.words('english'))
|
| 76 |
+
#mask = np.array(Image.open('upvote.png'))
|
| 77 |
+
|
| 78 |
+
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
|
| 79 |
+
import matplotlib.pyplot as plt
|
| 80 |
+
#original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
|
| 81 |
+
st.title("New York Hotel Finder")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
stopwords=list(STOP_WORDS)
|
| 85 |
+
stopwords.extend(['hotel','room','rooms'])
|
| 86 |
+
from string import punctuation
|
| 87 |
+
punctuation=punctuation+ '\n'
|
| 88 |
+
|
| 89 |
+
import pandas as pd
|
| 90 |
+
from sentence_transformers import SentenceTransformer
|
| 91 |
+
import scipy.spatial
|
| 92 |
+
import pickle as pkl
|
| 93 |
+
from sentence_transformers import SentenceTransformer, util
|
| 94 |
+
import torch
|
| 95 |
+
#import os
|
| 96 |
+
|
| 97 |
+
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 98 |
+
|
| 99 |
+
df_all = pd.read_csv('Hotel New York Combined.csv')
|
| 100 |
+
|
| 101 |
+
df_all = df_all[['hotel_name','review_body']]
|
| 102 |
+
#
|
| 103 |
+
# df['hotel_name'].drop_duplicates()
|
| 104 |
+
|
| 105 |
+
# df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).review_body.apply(''.join).reset_index(name='all_review')
|
| 106 |
+
|
| 107 |
+
import re
|
| 108 |
+
|
| 109 |
+
df_combined = pd.read_csv('df_combined.csv')
|
| 110 |
+
|
| 111 |
+
# df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
|
| 112 |
+
#
|
| 113 |
+
# def lower_case(input_str):
|
| 114 |
+
# input_str = input_str.lower()
|
| 115 |
+
# return input_str
|
| 116 |
+
#
|
| 117 |
+
# df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
|
| 118 |
+
|
| 119 |
+
df = df_combined
|
| 120 |
+
|
| 121 |
+
df_sentences = df_combined.set_index("all_review")
|
| 122 |
+
|
| 123 |
+
df_sentences = df_sentences["hotel_name"].to_dict()
|
| 124 |
+
df_sentences_list = list(df_sentences.keys())
|
| 125 |
+
|
| 126 |
+
import pandas as pd
|
| 127 |
+
from tqdm import tqdm
|
| 128 |
+
from sentence_transformers import SentenceTransformer, util
|
| 129 |
+
|
| 130 |
+
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
|
| 131 |
+
#
|
| 132 |
+
corpus = df_sentences_list
|
| 133 |
+
corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
|
| 134 |
+
#
|
| 135 |
+
# model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 136 |
+
# paraphrases = util.paraphrase_mining(model, corpus)
|
| 137 |
+
|
| 138 |
+
#queries = ['Hotel close to Central Park',
|
| 139 |
+
# 'Hotel with breakfast'
|
| 140 |
+
# ]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
# from transformers import AutoTokenizer, AutoModel
|
| 144 |
+
# import torch
|
| 145 |
+
# import torch.nn.functional as F
|
| 146 |
+
#
|
| 147 |
+
# #Mean Pooling - Take attention mask into account for correct averaging
|
| 148 |
+
# def mean_pooling(model_output, attention_mask):
|
| 149 |
+
# token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
| 150 |
+
# input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
| 151 |
+
# return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
| 152 |
+
#
|
| 153 |
+
#
|
| 154 |
+
# # Sentences we want sentence embeddings for
|
| 155 |
+
# sentences = corpus
|
| 156 |
+
#
|
| 157 |
+
# # Load model from HuggingFace Hub
|
| 158 |
+
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
|
| 159 |
+
# model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
|
| 160 |
+
#
|
| 161 |
+
# # Tokenize sentences
|
| 162 |
+
# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
| 163 |
+
#
|
| 164 |
+
# # Compute token embeddings
|
| 165 |
+
# with torch.no_grad():
|
| 166 |
+
# model_output = model(**encoded_input)
|
| 167 |
+
#
|
| 168 |
+
# # Perform pooling
|
| 169 |
+
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
| 170 |
+
#
|
| 171 |
+
# # Normalize embeddings
|
| 172 |
+
# sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
| 173 |
+
#
|
| 174 |
+
# st.text("Sentence embeddings:")
|
| 175 |
+
# st.text(sentence_embeddings)
|
| 176 |
+
#
|
| 177 |
+
#
|
| 178 |
+
|
| 179 |
+
#corpus_embeddings = sentence_embeddings
|
| 180 |
+
# Query sentences
|
| 181 |
+
|
| 182 |
+
def plot_cloud(wordcloud):
|
| 183 |
+
# Set figure size
|
| 184 |
+
st.pyplot.figure(figsize=(40, 30))
|
| 185 |
+
# Display image
|
| 186 |
+
st.pyplot(wordcloud)
|
| 187 |
+
# No axis details
|
| 188 |
+
#st.pyplot.axis("off");
|
| 189 |
+
userinput = st.text_input('Tell us what are you looking in your hotel?')
|
| 190 |
+
if not userinput:
|
| 191 |
+
st.write("Please enter a query to get results")
|
| 192 |
+
else:
|
| 193 |
+
query = [str(userinput)]
|
| 194 |
+
doc = nlp(str(userinput))
|
| 195 |
+
for ent in doc.ents:
|
| 196 |
+
if ent.label_ == 'GPE':
|
| 197 |
+
if ent.text in countries:
|
| 198 |
+
st.write(f"Country : {ent.text}")
|
| 199 |
+
elif ent.text in cities:
|
| 200 |
+
st.write("city")
|
| 201 |
+
st.write(ent.text)
|
| 202 |
+
st.write(f"City : {ent.text}")
|
| 203 |
+
else:
|
| 204 |
+
print(f"Other GPE : {ent.text}")
|
| 205 |
+
# query_embeddings = embedder.encode(queries,show_progress_bar=True)
|
| 206 |
+
top_k = min(5, len(corpus))
|
| 207 |
+
|
| 208 |
+
query_embedding = embedder.encode(query, convert_to_tensor=True)
|
| 209 |
+
|
| 210 |
+
# We use cosine-similarity and torch.topk to find the highest 5 scores
|
| 211 |
+
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
|
| 212 |
+
top_results = torch.topk(cos_scores, k=top_k)
|
| 213 |
+
|
| 214 |
+
# st.write("\n\n======================\n\n")
|
| 215 |
+
# st.write("Query:", query)
|
| 216 |
+
# # doc = nlp(query)
|
| 217 |
+
sentence_spans = list(doc.sents)
|
| 218 |
+
ent_html = displacy.render(doc, style="ent", jupyter=False)
|
| 219 |
+
# Display the entity visualization in the browser:
|
| 220 |
+
st.markdown(ent_html, unsafe_allow_html=True)
|
| 221 |
+
|
| 222 |
+
#displacy.render(doc, jupyter = True, style="ent")
|
| 223 |
+
st.write("##")
|
| 224 |
+
st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
|
| 225 |
+
st.write("\n\n======================\n\n")
|
| 226 |
+
|
| 227 |
+
for score, idx in zip(top_results[0], top_results[1]):
|
| 228 |
+
|
| 229 |
+
row_dict = df.loc[df['all_review']== corpus[idx]]
|
| 230 |
+
st.subheader(row_dict['hotel_name'].values[0])
|
| 231 |
+
hotel_subset = df_all.loc[df_all['hotel_name']==row_dict['hotel_name'].values[0]]
|
| 232 |
+
st.caption("Review Summary:")
|
| 233 |
+
st.write(row_dict['summary'].values[0])
|
| 234 |
+
st.caption("Relevancy: {:.4f}".format(score))
|
| 235 |
+
st.caption("Relevant reviews:")
|
| 236 |
+
|
| 237 |
+
df_sentences_h = hotel_subset.set_index("review_body")
|
| 238 |
+
|
| 239 |
+
df_sentences_h = df_sentences_h["hotel_name"].to_dict()
|
| 240 |
+
df_sentences_list_h = list(df_sentences_h.keys())
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
|
| 245 |
+
#
|
| 246 |
+
corpus_h = df_sentences_list_h
|
| 247 |
+
corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
|
| 248 |
+
cos_scores_h = util.pytorch_cos_sim(query_embedding, corpus_embeddings_h)[0]
|
| 249 |
+
# top_k was sized from the hotel-level corpus; a single hotel may have fewer reviews,
# and torch.topk raises when k exceeds the number of scores, so clamp k to this hotel's review count.
top_results_h = torch.topk(cos_scores_h, k=min(top_k, len(corpus_h)))
|
| 250 |
+
|
| 251 |
+
for score, idx in zip(top_results_h[0], top_results_h[1]):
|
| 252 |
+
st.write(corpus_h[idx])
|
| 253 |
+
|
| 254 |
+
# st.table(hotel_subset.head())
|
| 255 |
+
|
| 256 |
+
# st.write("#")
|
| 257 |
+
#wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
|
| 258 |
+
# wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
|
| 259 |
+
# fig, ax = plt.subplots()
|
| 260 |
+
# plt.imshow(wordcloud, interpolation='bilinear')
|
| 261 |
+
# plt.axis("off")
|
| 262 |
+
# plt.show()
|
| 263 |
+
# st.pyplot(fig)
|
| 264 |
+
# st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
if __name__ == '__main__':
|
| 268 |
+
main()
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
# cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
|
| 272 |
+
# top_results = torch.topk(cos_scores, k=top_k)
|
| 273 |
+
|
| 274 |
+
# st.write("\n\n======================\n\n")
|
| 275 |
+
# st.write("Query:", query)
|
| 276 |
+
# st.write("\nTop 5 most similar sentences in corpus using sentence embedding:")
|
| 277 |
+
#
|
| 278 |
+
# for score, idx in zip(top_results[0], top_results[1]):
|
| 279 |
+
# st.write("(Score: {:.4f})".format(score))
|
| 280 |
+
# row_dict = df.loc[df['all_review']== corpus[idx]]
|
| 281 |
+
# st.write("paper_id: " , row_dict['hotel_name'] , "\n")
|
| 282 |
+
# #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
|
| 283 |
+
# wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
|
| 284 |
+
# fig, ax = plt.subplots()
|
| 285 |
+
# plt.imshow(wordcloud, interpolation='bilinear')
|
| 286 |
+
# plt.axis("off")
|
| 287 |
+
# plt.show()
|
| 288 |
+
# st.pyplot(fig)
|
| 289 |
+
# st.set_option('deprecation.showPyplotGlobalUse', False)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 293 |
+
#
|
| 294 |
+
# corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
|
app.yaml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
service: default
|
| 2 |
+
runtime: custom
|
| 3 |
+
env: flex
|
basic.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
from datetime import datetime
|
| 3 |
+
from time import time
|
| 4 |
+
from lxml import html,etree
|
| 5 |
+
from reviews_final import scrape, write_in_csv
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import requests,re
|
| 8 |
+
import os,sys
|
| 9 |
+
import unicodecsv as csv
|
| 10 |
+
import argparse
|
| 11 |
+
import numpy as np
|
| 12 |
+
import json
|
| 13 |
+
def clean(text):
|
| 14 |
+
if text:
|
| 15 |
+
# Removing \n \r and \t
|
| 16 |
+
return ' '.join(''.join(text).split()).strip()
|
| 17 |
+
return None
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse(locality,checkin_date,checkout_date,sort):
|
| 23 |
+
checkIn = checkin_date.strftime("%Y/%m/%d")
|
| 24 |
+
checkOut = checkout_date.strftime("%Y/%m/%d")
|
| 25 |
+
print ("Scraper Inititated for Locality:%s"%locality)
|
| 26 |
+
header = {
|
| 27 |
+
|
| 28 |
+
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
|
| 29 |
+
}
|
| 30 |
+
# TA rendering the autocomplete list using this API
|
| 31 |
+
print ("Finding search result page URL")
|
| 32 |
+
geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='+locality
|
| 33 |
+
api_response = requests.get(geo_url,headers=header, timeout=120).json()
|
| 34 |
+
#getting the TA url for the query from the autocomplete response
|
| 35 |
+
url_from_autocomplete = "http://www.tripadvisor.com"+api_response['results'][0]['url']
|
| 36 |
+
print ('URL found %s'%url_from_autocomplete)
|
| 37 |
+
geo = api_response['results'][0]['value']
|
| 38 |
+
#Formatting date for writing to file
|
| 39 |
+
a=url_from_autocomplete
|
| 40 |
+
b=a.split("-")
|
| 41 |
+
s="-"
|
| 42 |
+
c=s.join([b[0],b[1],"oa30",b[2],b[3]])
|
| 43 |
+
d=s.join([b[0],b[1],"oa60",b[2],b[3]])
|
| 44 |
+
e=s.join([b[0],b[1],"oa90",b[2],b[3]])
|
| 45 |
+
f=s.join([b[0],b[1],"oa120",b[2],b[3]])
|
| 46 |
+
urllist = [a,c,d,e,f]
|
| 47 |
+
|
| 48 |
+
date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
|
| 49 |
+
#form data to get the hotels list from TA for the selected date
|
| 50 |
+
form_data = {'changeSet': 'TRAVEL_INFO',
|
| 51 |
+
'showSnippets': 'false',
|
| 52 |
+
'staydates':date,
|
| 53 |
+
'uguests': '2',
|
| 54 |
+
'sortOrder':sort
|
| 55 |
+
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
json_arr = []
|
| 61 |
+
for url_from_autocomplete in urllist:
|
| 62 |
+
print(url_from_autocomplete)
|
| 63 |
+
|
| 64 |
+
headers = {
|
| 65 |
+
'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
|
| 66 |
+
'Accept-Encoding': 'gzip,deflate',
|
| 67 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 68 |
+
'Cache-Control': 'no-cache',
|
| 69 |
+
'Connection': 'keep-alive',
|
| 70 |
+
'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
|
| 71 |
+
'Host': 'www.tripadvisor.com',
|
| 72 |
+
'Pragma': 'no-cache',
|
| 73 |
+
'Referer': url_from_autocomplete,
|
| 74 |
+
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
|
| 75 |
+
'X-Requested-With': 'XMLHttpRequest'
|
| 76 |
+
}
|
| 77 |
+
cookies= {"SetCurrency":"USD"}
|
| 78 |
+
print ("Downloading search results page")
|
| 79 |
+
# Keep TLS certificate verification enabled (the requests default) instead of verify=False,
# and bound the wait with the same 120 s timeout used for the autocomplete request above.
page_response = requests.post(url = url_from_autocomplete,data=form_data,headers = headers, cookies = cookies, timeout=120)
|
| 80 |
+
print ("Parsing results ")
|
| 81 |
+
parser = html.fromstring(page_response.text)
|
| 82 |
+
hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
|
| 83 |
+
hotel_data = []
|
| 84 |
+
if not hotel_lists:
|
| 85 |
+
hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')
|
| 86 |
+
|
| 87 |
+
for hotel in hotel_lists:
|
| 88 |
+
XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
|
| 89 |
+
XPATH_REVIEWS = './/a[@class="review_count"]//text()'
|
| 90 |
+
XPATH_RANK = './/div[@class="popindex"]//text()'
|
| 91 |
+
XPATH_RATING = './/span[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
|
| 92 |
+
XPATH_RATING_2 = './/a[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
|
| 93 |
+
XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
|
| 94 |
+
# Select the feature list items by the div's class attribute (the previous '@casls' typo could never match).
XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
|
| 95 |
+
XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
|
| 96 |
+
XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
|
| 97 |
+
XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()' #<span class="dekGp Ci _R S4 H3 MD">#74 of 319 hotels in Lisbon</span><span class="dekGp Ci _R S4 H3 MD">#6 of 319 hotels in Lisbon</span>
|
| 98 |
+
XPATH_RATING_ORDER = './/span[contains(@class,"dekGp Ci _R S4 H3 MD")]//text()'
|
| 99 |
+
XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
|
| 103 |
+
raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
|
| 104 |
+
raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
|
| 105 |
+
raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
|
| 106 |
+
raw_rank = hotel.xpath(XPATH_RANK)
|
| 107 |
+
raw_rating = hotel.xpath(XPATH_RATING_2)
|
| 108 |
+
raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
|
| 109 |
+
raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
|
| 110 |
+
raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)
|
| 111 |
+
raw_rank_order = hotel.xpath(XPATH_RATING_ORDER)
|
| 112 |
+
raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
|
| 113 |
+
|
| 114 |
+
url = 'http://www.tripadvisor.com'+raw_hotel_link[0] if raw_hotel_link else None
|
| 115 |
+
reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","") if raw_no_of_reviews else 0
|
| 116 |
+
rank = ''.join(raw_rank) if raw_rank else None
|
| 117 |
+
rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
|
| 118 |
+
name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
|
| 119 |
+
hotel_features = ','.join(raw_hotel_features)
|
| 120 |
+
#price_per_night = ''.join(raw_hotel_price_per_night).encode('utf-8').replace('\n','') if raw_hotel_price_per_night else None
|
| 121 |
+
price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None
|
| 122 |
+
rank_order = ''.join(raw_rank_order) if raw_rank_order else None
|
| 123 |
+
no_of_deals = re.findall("all\s+?(\d+)\s+?",''.join(raw_no_of_deals))
|
| 124 |
+
booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None
|
| 125 |
+
official_description = clean(raw_official_description)
|
| 126 |
+
|
| 127 |
+
if no_of_deals:
|
| 128 |
+
no_of_deals = no_of_deals[0]
|
| 129 |
+
else:
|
| 130 |
+
no_of_deals = 0
|
| 131 |
+
|
| 132 |
+
data = {
|
| 133 |
+
'hotel_name':name,
|
| 134 |
+
'url':url,
|
| 135 |
+
'locality':locality,
|
| 136 |
+
'reviews':reviews,
|
| 137 |
+
'rank':rank,
|
| 138 |
+
'tripadvisor_rating':rating,
|
| 139 |
+
'checkOut':checkOut,
|
| 140 |
+
'checkIn':checkIn,
|
| 141 |
+
'hotel_features':hotel_features,
|
| 142 |
+
'price_per_night':price_per_night,
|
| 143 |
+
'no_of_deals':no_of_deals,
|
| 144 |
+
'booking_provider':booking_provider,
|
| 145 |
+
'raw_rank': rank_order,
|
| 146 |
+
'desc':official_description
|
| 147 |
+
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if data:
|
| 152 |
+
print("Writing scraped data")
|
| 153 |
+
json_arr.append(data)
|
| 154 |
+
with open('data_file.json', 'w') as outfile:
|
| 155 |
+
json.dump(json_arr, outfile)
|
| 156 |
+
# hotel_data.append(data)
|
| 157 |
+
# all_hotel.append(data)
|
| 158 |
+
# #Referrer is necessary to get the correct response from TA if not provided they will redirect to home page
|
| 159 |
+
# my_df=pd.DataFrame(all_hotel)
|
| 160 |
+
# print(my_df['hotel_name'])
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
return urllist
|
combined_paris.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
corpus_embeddings_bi_encoder.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
|
| 3 |
+
size 64918
|
corpus_embeddings_bi_encoder.pickle 2
ADDED
|
Binary file (64.9 kB). View file
|
|
|
df_combined.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
df_combined_paris.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
embeddings.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
|
| 3 |
+
size 64640
|
embeddings_h_r.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76ae4840488129cd4c6917830018219292cca514e62c69ea9e507b185d219aa7
|
| 3 |
+
size 4391552
|
embeddings_review.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:96cee6d14a75d19eccbe9decb501dd3c5de6c1fe401d3803a82611f075a8a6a8
|
| 3 |
+
size 144512
|
en_core_web_sm-3.2.0-py3-none-any.whl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e02939fb7fbae6dbcc9c5a1355f5e4e02939b649a1f0846ee844ac1d479bbeb
|
| 3 |
+
size 13900196
|
paris-newer.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@author: Hamza Farooq
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import spacy
|
| 10 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
| 11 |
+
from string import punctuation
|
| 12 |
+
from collections import Counter
|
| 13 |
+
from heapq import nlargest
|
| 14 |
+
import os
|
| 15 |
+
nlp = spacy.load("en_core_web_sm")
|
| 16 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
| 17 |
+
import datetime
|
| 18 |
+
|
| 19 |
+
from spacy import displacy
|
| 20 |
+
import streamlit as st
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
from wordcloud import WordCloud
|
| 23 |
+
from matplotlib import pyplot as plt
|
| 24 |
+
|
| 25 |
+
import nltk
|
| 26 |
+
from rank_bm25 import BM25Okapi
|
| 27 |
+
from sklearn.feature_extraction import _stop_words
|
| 28 |
+
import string
|
| 29 |
+
from tqdm.autonotebook import tqdm
|
| 30 |
+
import numpy as np
|
| 31 |
+
import pandas as pd
|
| 32 |
+
from sentence_transformers import SentenceTransformer
|
| 33 |
+
import scipy.spatial
|
| 34 |
+
import pickle
|
| 35 |
+
from sentence_transformers import SentenceTransformer, util
|
| 36 |
+
import torch
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# import utils as utl
|
| 43 |
+
|
| 44 |
+
import time
|
| 45 |
+
import torch
|
| 46 |
+
import transformers
|
| 47 |
+
from transformers import BartTokenizer, BartForConditionalGeneration
|
| 48 |
+
from string import punctuation
|
| 49 |
+
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
|
| 50 |
+
|
| 51 |
+
import numpy as np
|
| 52 |
+
import pandas as pd
|
| 53 |
+
from sentence_transformers import SentenceTransformer
|
| 54 |
+
import scipy.spatial
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
from sentence_transformers import SentenceTransformer, util
|
| 58 |
+
import torch
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def main():
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Settings
|
| 68 |
+
st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
|
| 69 |
+
from string import punctuation
|
| 70 |
+
punctuation=punctuation+ '\n'
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
from sentence_transformers import SentenceTransformer, util
|
| 74 |
+
import torch
|
| 75 |
+
import numpy as np
|
| 76 |
+
import pandas as pd
|
| 77 |
+
from sentence_transformers import SentenceTransformer
|
| 78 |
+
import scipy.spatial
|
| 79 |
+
|
| 80 |
+
from sentence_transformers import SentenceTransformer, util
|
| 81 |
+
import torch
|
| 82 |
+
#import os
|
| 83 |
+
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the three models used by the search page.

    Returns:
        A 3-tuple, in this order: the 'all-MiniLM-L6-v2' sentence embedder,
        the 'multi-qa-MiniLM-L6-cos-v1' bi-encoder, and the
        'cross-encoder/ms-marco-MiniLM-L-6-v2' cross-encoder.
    """
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    retrieval_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_embedder, retrieval_encoder, reranker
|
| 86 |
+
embedder,bi_encoder,cross_encoder = load_model()
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
#original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
|
| 92 |
+
st.title("Parisian Hotel Finder")
|
| 93 |
+
with st.expander("ℹ️ - About this app", expanded=True):
|
| 94 |
+
|
| 95 |
+
st.write(
|
| 96 |
+
"""
|
| 97 |
+
- This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
|
| 98 |
+
- It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗.
|
| 99 |
+
"""
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
punctuation=punctuation+ '\n'
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
#import os
|
| 107 |
+
|
| 108 |
+
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def lower_case(input_str):
    """Return ``input_str`` converted to lowercase."""
    return input_str.lower()
|
| 115 |
+
|
| 116 |
+
df_all = pd.read_csv('paris_clean_newer.csv')
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).text.apply(''.join).reset_index(name='all_review')
|
| 120 |
+
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
|
| 121 |
+
df_combined_paris_summary = df_combined_paris_summary[['Hotel','summary']]
|
| 122 |
+
|
| 123 |
+
import re
|
| 124 |
+
|
| 125 |
+
# df_combined = pd.read_csv('df_combined.csv')
|
| 126 |
+
|
| 127 |
+
# Strip everything except ASCII letters, digits and whitespace from the combined reviews.
# The upper-case range must be A-Z: the previous "A-z" range also covers the characters
# [ \ ] ^ _ ` that sit between 'Z' and 'a', so those were silently kept. A raw string is
# used so "\s" reaches the regex engine unchanged.
df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x))
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
|
| 131 |
+
df_basic = df_all[['Hotel','description','price_per_night']].drop_duplicates()
|
| 132 |
+
df_basic = df_basic.merge(df_combined_paris_summary,how='left')
|
| 133 |
+
df_combined_e = df_combined.merge(df_basic)
|
| 134 |
+
df_combined_e['all_review'] =df_combined_e['description']+ df_combined_e['all_review'] + df_combined_e['price_per_night']
|
| 135 |
+
|
| 136 |
+
df = df_combined_e.copy()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
df_sentences = df_combined_e.set_index("all_review")
|
| 140 |
+
|
| 141 |
+
df_sentences = df_sentences["Hotel"].to_dict()
|
| 142 |
+
df_sentences_list = list(df_sentences.keys())
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
import pandas as pd
|
| 147 |
+
from tqdm import tqdm
|
| 148 |
+
from sentence_transformers import SentenceTransformer, util
|
| 149 |
+
|
| 150 |
+
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
|
| 151 |
+
#
|
| 152 |
+
corpus = df_sentences_list
|
| 153 |
+
# corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
|
| 154 |
+
corpus_embeddings = np.load('embeddings.npy')
|
| 155 |
+
|
| 156 |
+
bi_encoder.max_seq_length = 512 #Truncate long passages to 512 tokens
|
| 157 |
+
top_k = 32 #Number of passages we want to retrieve with the bi-encoder
|
| 158 |
+
|
| 159 |
+
#The bi-encoder will retrieve top_k (32) documents. We use a cross-encoder to re-rank the results list to improve the quality
|
| 160 |
+
|
| 161 |
+
# corpus_embeddings_h = np.load('embeddings_h_r.npy')
|
| 162 |
+
|
| 163 |
+
with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
|
| 164 |
+
doc_embedding = pickle.load(pkl)
|
| 165 |
+
|
| 166 |
+
with open('tokenized_corpus.pickle', 'rb') as pkl:
|
| 167 |
+
tokenized_corpus = pickle.load(pkl)
|
| 168 |
+
|
| 169 |
+
bm25 = BM25Okapi(tokenized_corpus)
|
| 170 |
+
passages = corpus
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# We lower case our text and remove stop-words from indexing
|
| 176 |
+
def bm25_tokenizer(text):
    """Turn ``text`` into the token list used for BM25 scoring.

    The text is lower-cased and split on whitespace; each piece has leading and
    trailing punctuation stripped. Pieces that end up empty, or that appear in
    ``_stop_words.ENGLISH_STOP_WORDS``, are dropped.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if len(word) > 0 and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def search(query):
    """Run the three searches for ``query`` and render each top-5 list in Streamlit.

    The page shows, in order: the named entities spaCy finds in the query, the
    BM25 (lexical) hits, the bi-encoder (semantic) hits, and the same semantic
    hits re-ordered by cross-encoder score.

    Args:
        query: Free-text search string.

    Returns:
        None. All output is written to the Streamlit page.
    """

    def _render_hit(hit):
        # Show hotel name, nightly price and description for one search hit.
        passage = corpus[hit['corpus_id']]
        hotel_name = df.loc[df['all_review'] == passage]['Hotel'].values[0]
        de = df_basic.loc[df_basic.Hotel == hotel_name]
        st.subheader(hotel_name)
        st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
        # The description goes inside the expander. Passing it as the expander's
        # label (as before) produced an empty, un-expandable block titled with
        # the whole description text.
        with st.expander('Description:', expanded=False):
            st.write(de.description.values[0])

    # Entity highlighting uses the function's own argument rather than reaching
    # for the enclosing scope's `userinput`.
    doc = nlp(str(query))
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    # Display the entity visualization in the browser:
    st.markdown(ent_html, unsafe_allow_html=True)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # np.argpartition raises when asked for more positions than the array has,
    # so never request more than the corpus holds.
    n_lexical = min(5, len(bm25_scores))
    if n_lexical:
        top_n = np.argpartition(bm25_scores, -n_lexical)[-n_lexical:]
    else:
        top_n = []
    bm25_hits = sorted(
        ({'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n),
        key=lambda x: x['score'],
        reverse=True,
    )

    st.title("Top-5 lexical search (BM25) hits")
    for hit in bm25_hits:
        _render_hit(hit)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # NOTE(review): corpus_embeddings is loaded from 'embeddings.npy' while the
    # bi-encoder pickle (doc_embedding) is loaded but never used — confirm the
    # .npy vectors were produced by the same model that encodes the query here.
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-5 hits from bi-encoder
    st.write("\n-------------------------\n")
    st.title("Top-5 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[0:5]:
        _render_hit(hit)

    # Output of top-5 hits from re-ranker
    st.write("\n-------------------------\n")
    st.title("Top-5 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[0:5]:
        _render_hit(hit)
|
| 265 |
+
# try:
|
| 266 |
+
# st.write('Summary')
|
| 267 |
+
# st.expander(de.summary.values[0],expanded=False)
|
| 268 |
+
# except:
|
| 269 |
+
# None
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
|
| 275 |
+
userinput = st.text_input('Tell us what are you looking in your hotel?','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
|
| 276 |
+
da = st.date_input(
|
| 277 |
+
"Date Check-in",
|
| 278 |
+
datetime.date(2022, 10, 5))
|
| 279 |
+
|
| 280 |
+
dst = st.date_input(
|
| 281 |
+
"Date Check-out",
|
| 282 |
+
datetime.date(2022, 10, 8))
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
if not userinput or userinput == sampletext:
|
| 286 |
+
st.write("Please enter a query to get results")
|
| 287 |
+
else:
|
| 288 |
+
query = [str(userinput)]
|
| 289 |
+
doc = nlp(str(userinput))
|
| 290 |
+
search(str(userinput))
|
| 291 |
+
|
| 292 |
+
# We use cosine-similarity and torch.topk to find the highest 5 scores
|
| 293 |
+
|
| 294 |
+
if __name__ == '__main__':
|
| 295 |
+
main()
|
paris.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@author: Hamza Farooq
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import spacy
|
| 10 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
| 11 |
+
from string import punctuation
|
| 12 |
+
from collections import Counter
|
| 13 |
+
from heapq import nlargest
|
| 14 |
+
import os
|
| 15 |
+
nlp = spacy.load("en_core_web_sm")
|
| 16 |
+
|
| 17 |
+
from spacy import displacy
|
| 18 |
+
import streamlit as st
|
| 19 |
+
import matplotlib.pyplot as plt
|
| 20 |
+
from wordcloud import WordCloud
|
| 21 |
+
from matplotlib import pyplot as plt
|
| 22 |
+
|
| 23 |
+
import nltk
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# import utils as utl
|
| 30 |
+
|
| 31 |
+
import time
|
| 32 |
+
import torch
|
| 33 |
+
import transformers
|
| 34 |
+
from transformers import BartTokenizer, BartForConditionalGeneration
|
| 35 |
+
from string import punctuation
|
| 36 |
+
# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
|
| 37 |
+
|
| 38 |
+
import numpy as np
|
| 39 |
+
import pandas as pd
|
| 40 |
+
from sentence_transformers import SentenceTransformer
|
| 41 |
+
import scipy.spatial
|
| 42 |
+
import pickle as pkl
|
| 43 |
+
from sentence_transformers import SentenceTransformer, util
|
| 44 |
+
import torch
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def main():
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Settings
|
| 54 |
+
st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
|
| 55 |
+
from string import punctuation
|
| 56 |
+
punctuation=punctuation+ '\n'
|
| 57 |
+
|
| 58 |
+
# def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
|
| 59 |
+
#
|
| 60 |
+
# text = text.replace('\n','')
|
| 61 |
+
# text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
|
| 62 |
+
# summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
|
| 63 |
+
# summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
|
| 64 |
+
# return summary_txt
|
| 65 |
+
|
| 66 |
+
from sentence_transformers import SentenceTransformer, util
|
| 67 |
+
import torch
|
| 68 |
+
import numpy as np
|
| 69 |
+
import pandas as pd
|
| 70 |
+
from sentence_transformers import SentenceTransformer
|
| 71 |
+
import scipy.spatial
|
| 72 |
+
import pickle as pkl
|
| 73 |
+
from sentence_transformers import SentenceTransformer, util
|
| 74 |
+
import torch
|
| 75 |
+
#import os
|
| 76 |
+
@st.cache(allow_output_mutation=True)
def load_model():
    """Return the 'all-MiniLM-L6-v2' SentenceTransformer used to embed queries."""
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return sentence_embedder
|
| 79 |
+
embedder = load_model()
|
| 80 |
+
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 81 |
+
|
| 82 |
+
# gc = geonamescache.GeonamesCache()
|
| 83 |
+
#
|
| 84 |
+
# # gets nested dictionary for countries
|
| 85 |
+
# countries = gc.get_countries()
|
| 86 |
+
#
|
| 87 |
+
# # gets nested dictionary for cities
|
| 88 |
+
# cities = gc.get_cities()
|
| 89 |
+
# def gen_dict_extract(var, key):
|
| 90 |
+
# if isinstance(var, dict):
|
| 91 |
+
# for k, v in var.items():
|
| 92 |
+
# if k == key:
|
| 93 |
+
# yield v
|
| 94 |
+
# if isinstance(v, (dict, list)):
|
| 95 |
+
# yield from gen_dict_extract(v, key)
|
| 96 |
+
# elif isinstance(var, list):
|
| 97 |
+
# for d in var:
|
| 98 |
+
# yield from gen_dict_extract(d, key)
|
| 99 |
+
#
|
| 100 |
+
# cities = [*gen_dict_extract(cities, 'name')]
|
| 101 |
+
# countries = [*gen_dict_extract(countries, 'name')]
|
| 102 |
+
#
|
| 103 |
+
# cities.append('New York')
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# mask = np.array(Image.open('upvote.png'))
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
#original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
|
| 112 |
+
st.title("Parisian Hotel Finder")
|
| 113 |
+
with st.expander("ℹ️ - About this app", expanded=True):
|
| 114 |
+
|
| 115 |
+
st.write(
|
| 116 |
+
"""
|
| 117 |
+
- This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
|
| 118 |
+
- It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗.
|
| 119 |
+
"""
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
punctuation=punctuation+ '\n'
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
#import os
|
| 127 |
+
|
| 128 |
+
# embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 129 |
+
|
| 130 |
+
df_all = pd.read_csv('combined_paris.csv')
|
| 131 |
+
|
| 132 |
+
df_all = df_all[['Hotel','review']]
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
df_all = df_all.drop_duplicates()
|
| 136 |
+
df_all = df_all.reset_index(drop=True)
|
| 137 |
+
summary_hotel = pd.read_csv('df_combined_paris.csv')
|
| 138 |
+
#
|
| 139 |
+
# df['hotel_name'].drop_duplicates()
|
| 140 |
+
|
| 141 |
+
df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')
|
| 142 |
+
|
| 143 |
+
import re
|
| 144 |
+
|
| 145 |
+
# df_combined = pd.read_csv('df_combined.csv')
|
| 146 |
+
|
| 147 |
+
# Strip everything except ASCII letters, digits and whitespace from the combined reviews.
# The upper-case range must be A-Z: the previous "A-z" range also covers the characters
# [ \ ] ^ _ ` that sit between 'Z' and 'a', so those were silently kept. A raw string is
# used so "\s" reaches the regex engine unchanged.
df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]','',x))
|
| 148 |
+
|
| 149 |
+
def lower_case(input_str):
    """Lower-case ``input_str`` and return the result."""
    lowered = input_str.lower()
    return lowered
|
| 152 |
+
|
| 153 |
+
df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
|
| 154 |
+
|
| 155 |
+
df = df_combined
|
| 156 |
+
|
| 157 |
+
df_sentences = df_combined.set_index("all_review")
|
| 158 |
+
|
| 159 |
+
df_sentences = df_sentences["Hotel"].to_dict()
|
| 160 |
+
df_sentences_list = list(df_sentences.keys())
|
| 161 |
+
|
| 162 |
+
import pandas as pd
|
| 163 |
+
from tqdm import tqdm
|
| 164 |
+
from sentence_transformers import SentenceTransformer, util
|
| 165 |
+
|
| 166 |
+
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
|
| 167 |
+
#
|
| 168 |
+
corpus = df_sentences_list
|
| 169 |
+
# corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
|
| 170 |
+
corpus_embeddings = np.load('embeddings_review.npy')
|
| 171 |
+
corpus_embeddings_h = np.load('embeddings_h_r.npy')
|
| 172 |
+
#
|
| 173 |
+
# model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 174 |
+
# paraphrases = util.paraphrase_mining(model, corpus)
|
| 175 |
+
|
| 176 |
+
#queries = ['Hotel close to Central Park',
|
| 177 |
+
# 'Hotel with breakfast'
|
| 178 |
+
# ]
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# from transformers import AutoTokenizer, AutoModel
|
| 182 |
+
# import torch
|
| 183 |
+
# import torch.nn.functional as F
|
| 184 |
+
#
|
| 185 |
+
# #Mean Pooling - Take attention mask into account for correct averaging
|
| 186 |
+
# def mean_pooling(model_output, attention_mask):
|
| 187 |
+
# token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
| 188 |
+
# input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
| 189 |
+
# return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
| 190 |
+
#
|
| 191 |
+
#
|
| 192 |
+
# # Sentences we want sentence embeddings for
|
| 193 |
+
# sentences = corpus
|
| 194 |
+
#
|
| 195 |
+
# # Load model from HuggingFace Hub
|
| 196 |
+
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
|
| 197 |
+
# model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
|
| 198 |
+
#
|
| 199 |
+
# # Tokenize sentences
|
| 200 |
+
# encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
| 201 |
+
#
|
| 202 |
+
# # Compute token embeddings
|
| 203 |
+
# with torch.no_grad():
|
| 204 |
+
# model_output = model(**encoded_input)
|
| 205 |
+
#
|
| 206 |
+
# # Perform pooling
|
| 207 |
+
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
| 208 |
+
#
|
| 209 |
+
# # Normalize embeddings
|
| 210 |
+
# sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
| 211 |
+
#
|
| 212 |
+
# st.text("Sentence embeddings:")
|
| 213 |
+
# st.text(sentence_embeddings)
|
| 214 |
+
#
|
| 215 |
+
#
|
| 216 |
+
|
| 217 |
+
#corpus_embeddings = sentence_embeddings
|
| 218 |
+
# Query sentences
|
| 219 |
+
|
| 220 |
+
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` on a 20x10 matplotlib figure and show it in Streamlit.

    Args:
        wordcloud: Image-like object accepted by ``plt.imshow`` — presumably a
            ``wordcloud.WordCloud`` instance; confirm against the caller.
    """
    # st.pyplot is a function with no ``figure`` attribute, and it expects a
    # matplotlib figure rather than the word cloud itself, so the figure is
    # built with matplotlib and then handed to Streamlit.
    # Set figure size
    fig = plt.figure(figsize=(20, 10))
    # Display image
    plt.imshow(wordcloud)
    # No axis details
    plt.axis("off")
    st.pyplot(fig)
|
| 227 |
+
sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
|
| 228 |
+
userinput = st.text_input('Tell us what are you looking in your hotel?','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
|
| 229 |
+
if not userinput or userinput == sampletext:
|
| 230 |
+
st.write("Please enter a query to get results")
|
| 231 |
+
else:
|
| 232 |
+
query = [str(userinput)]
|
| 233 |
+
doc = nlp(str(userinput))
|
| 234 |
+
# for ent in doc.ents:
|
| 235 |
+
# if ent.label_ == 'GPE':
|
| 236 |
+
# if ent.text in countries:
|
| 237 |
+
# st.write(f"Country : {ent.text}")
|
| 238 |
+
# elif ent.text in cities:
|
| 239 |
+
# st.write("city")
|
| 240 |
+
# st.write(ent.text)
|
| 241 |
+
# st.write(f"City : {ent.text}")
|
| 242 |
+
# else:
|
| 243 |
+
# print(f"Other GPE : {ent.text}")
|
| 244 |
+
# query_embeddings = embedder.encode(queries,show_progress_bar=True)
|
| 245 |
+
top_k = min(5, len(corpus))
|
| 246 |
+
|
| 247 |
+
query_embedding = embedder.encode(query, convert_to_tensor=True)
|
| 248 |
+
|
| 249 |
+
# We use cosine-similarity and torch.topk to find the highest 5 scores
|
| 250 |
+
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
|
| 251 |
+
top_results = torch.topk(cos_scores, k=top_k)
|
| 252 |
+
|
| 253 |
+
# st.write("\n\n======================\n\n")
|
| 254 |
+
# st.write("Query:", query)
|
| 255 |
+
# # doc = nlp(query)
|
| 256 |
+
sentence_spans = list(doc.sents)
|
| 257 |
+
ent_html = displacy.render(doc, style="ent", jupyter=False)
|
| 258 |
+
# Display the entity visualization in the browser:
|
| 259 |
+
st.markdown(ent_html, unsafe_allow_html=True)
|
| 260 |
+
|
| 261 |
+
#displacy.render(doc, jupyter = True, style="ent")
|
| 262 |
+
st.write("##")
|
| 263 |
+
st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
|
| 264 |
+
st.write("\n\n======================\n\n")
|
| 265 |
+
|
| 266 |
+
for score, idx in zip(top_results[0], top_results[1]):
|
| 267 |
+
|
| 268 |
+
row_dict = df.loc[df['all_review']== corpus[idx]]
|
| 269 |
+
st.subheader(row_dict['Hotel'].values[0])
|
| 270 |
+
|
| 271 |
+
hotel_subset = df_all.loc[df_all['Hotel']==row_dict['Hotel'].values[0]]
|
| 272 |
+
hotel_sub = summary_hotel.loc[summary_hotel['Hotel']==row_dict['Hotel'].values[0]]
|
| 273 |
+
st.caption("Review Summary:")
|
| 274 |
+
st.write(hotel_sub['summary'].values[0])
|
| 275 |
+
st.caption("Relevancy: {:.4f}".format(score))
|
| 276 |
+
st.caption("Relevant reviews:")
|
| 277 |
+
|
| 278 |
+
df_sentences_h = hotel_subset.set_index("review")
|
| 279 |
+
|
| 280 |
+
df_sentences_h = df_sentences_h["Hotel"].to_dict()
|
| 281 |
+
df_sentences_list_h = list(df_sentences_h.keys())
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
|
| 286 |
+
#
|
| 287 |
+
corpus_h = df_sentences_list_h
|
| 288 |
+
# corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
|
| 289 |
+
sublist = [element for i, element in enumerate(corpus_embeddings_h) if i in (df_all[df_all['Hotel'] == row_dict['Hotel'].values[0]].index.values)]
|
| 290 |
+
cos_scores_h = util.pytorch_cos_sim(query_embedding, sublist)[0]
|
| 291 |
+
top_results_h = torch.topk(cos_scores_h, k=top_k)
|
| 292 |
+
|
| 293 |
+
for score, idx in zip(top_results_h[0], top_results_h[1]):
|
| 294 |
+
st.write(corpus_h[idx])
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
if __name__ == '__main__':
|
| 298 |
+
main()
|
paris_clean_newer.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
query_generator.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
streamlit==1.1.0
|
| 3 |
+
regex==2021.8.3
|
| 4 |
+
scikit-learn
|
| 5 |
+
sentence_transformers
|
| 6 |
+
scipy
|
| 7 |
+
tqdm
|
| 8 |
+
gensim
|
| 9 |
+
plotly
|
| 10 |
+
wordcloud
|
| 11 |
+
matplotlib
|
| 12 |
+
spacy
|
| 13 |
+
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
|
| 14 |
+
rank-bm25
|
summary.ipynb
ADDED
|
@@ -0,0 +1,654 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import torch\n",
|
| 10 |
+
"import transformers\n",
|
| 11 |
+
"from transformers import BartTokenizer, BartForConditionalGeneration\n",
|
| 12 |
+
"tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
|
| 13 |
+
"mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')\n",
|
| 14 |
+
"torch_device = 'cpu'\n"
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": 3,
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [],
|
| 22 |
+
"source": [
|
| 23 |
+
"def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):\n",
|
| 24 |
+
"\n",
|
| 25 |
+
" text = text.replace('\\n','')\n",
|
| 26 |
+
" text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024, truncation=True)['input_ids'].to(torch_device)\n",
|
| 27 |
+
" summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))\n",
|
| 28 |
+
" summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)\n",
|
| 29 |
+
" return summary_txt"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "code",
|
| 34 |
+
"execution_count": 4,
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": [
|
| 38 |
+
"import pandas as pd\n",
|
| 39 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 40 |
+
"import scipy.spatial\n",
|
| 41 |
+
"import pickle as pkl\n",
|
| 42 |
+
"from sentence_transformers import SentenceTransformer, util\n",
|
| 43 |
+
"import torch\n",
|
| 44 |
+
"#import os\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"df = pd.read_csv('combined_paris.csv')\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"\n",
|
| 50 |
+
"df_combined = df.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"import re\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-Z0-9\\s]','',x))\n",
|
| 55 |
+
"def lower_case(input_str):\n",
|
| 56 |
+
" input_str = input_str.lower()\n",
|
| 57 |
+
" return input_str"
|
| 58 |
+
]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"cell_type": "code",
|
| 62 |
+
"execution_count": 5,
|
| 63 |
+
"metadata": {},
|
| 64 |
+
"outputs": [],
|
| 65 |
+
"source": [
|
| 66 |
+
"df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"df = df_combined\n",
|
| 69 |
+
"\n",
|
| 70 |
+
"df_sentences = df_combined.set_index(\"all_review\")\n",
|
| 71 |
+
"\n",
|
| 72 |
+
"df_sentences = df_sentences[\"Hotel\"].to_dict()\n",
|
| 73 |
+
"df_sentences_list = list(df_sentences.keys())\n"
|
| 74 |
+
]
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"cell_type": "code",
|
| 78 |
+
"execution_count": 6,
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"outputs": [
|
| 81 |
+
{
|
| 82 |
+
"data": {
|
| 83 |
+
"text/html": [
|
| 84 |
+
"<div>\n",
|
| 85 |
+
"<style scoped>\n",
|
| 86 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 87 |
+
" vertical-align: middle;\n",
|
| 88 |
+
" }\n",
|
| 89 |
+
"\n",
|
| 90 |
+
" .dataframe tbody tr th {\n",
|
| 91 |
+
" vertical-align: top;\n",
|
| 92 |
+
" }\n",
|
| 93 |
+
"\n",
|
| 94 |
+
" .dataframe thead th {\n",
|
| 95 |
+
" text-align: right;\n",
|
| 96 |
+
" }\n",
|
| 97 |
+
"</style>\n",
|
| 98 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 99 |
+
" <thead>\n",
|
| 100 |
+
" <tr style=\"text-align: right;\">\n",
|
| 101 |
+
" <th></th>\n",
|
| 102 |
+
" <th>0</th>\n",
|
| 103 |
+
" <th>1</th>\n",
|
| 104 |
+
" <th>2</th>\n",
|
| 105 |
+
" <th>3</th>\n",
|
| 106 |
+
" <th>4</th>\n",
|
| 107 |
+
" </tr>\n",
|
| 108 |
+
" </thead>\n",
|
| 109 |
+
" <tbody>\n",
|
| 110 |
+
" <tr>\n",
|
| 111 |
+
" <th>Hotel</th>\n",
|
| 112 |
+
" <td>25hours Hotel Terminus Nord</td>\n",
|
| 113 |
+
" <td>Acacias Etoile Hotel</td>\n",
|
| 114 |
+
" <td>COQ Hotel Paris</td>\n",
|
| 115 |
+
" <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
|
| 116 |
+
" <td>Cler Hotel</td>\n",
|
| 117 |
+
" </tr>\n",
|
| 118 |
+
" <tr>\n",
|
| 119 |
+
" <th>all_review</th>\n",
|
| 120 |
+
" <td>weve spent lots of time in paris and this was ...</td>\n",
|
| 121 |
+
" <td>the hotel is great for value the breakfast sel...</td>\n",
|
| 122 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 123 |
+
" <td>room was very clean transportation is very ne...</td>\n",
|
| 124 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 125 |
+
" </tr>\n",
|
| 126 |
+
" </tbody>\n",
|
| 127 |
+
"</table>\n",
|
| 128 |
+
"</div>"
|
| 129 |
+
],
|
| 130 |
+
"text/plain": [
|
| 131 |
+
" 0 \\\n",
|
| 132 |
+
"Hotel 25hours Hotel Terminus Nord \n",
|
| 133 |
+
"all_review weve spent lots of time in paris and this was ... \n",
|
| 134 |
+
"\n",
|
| 135 |
+
" 1 \\\n",
|
| 136 |
+
"Hotel Acacias Etoile Hotel \n",
|
| 137 |
+
"all_review the hotel is great for value the breakfast sel... \n",
|
| 138 |
+
"\n",
|
| 139 |
+
" 2 \\\n",
|
| 140 |
+
"Hotel COQ Hotel Paris \n",
|
| 141 |
+
"all_review stayed for a short city break the hotel is a ... \n",
|
| 142 |
+
"\n",
|
| 143 |
+
" 3 \\\n",
|
| 144 |
+
"Hotel Campanile Paris 14 - Maine Montparnasse \n",
|
| 145 |
+
"all_review room was very clean transportation is very ne... \n",
|
| 146 |
+
"\n",
|
| 147 |
+
" 4 \n",
|
| 148 |
+
"Hotel Cler Hotel \n",
|
| 149 |
+
"all_review we had the best stay at cler hotel the locati... "
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
"execution_count": 6,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"output_type": "execute_result"
|
| 155 |
+
}
|
| 156 |
+
],
|
| 157 |
+
"source": [
|
| 158 |
+
"df_combined.head().T"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": 7,
|
| 164 |
+
"metadata": {},
|
| 165 |
+
"outputs": [
|
| 166 |
+
{
|
| 167 |
+
"name": "stderr",
|
| 168 |
+
"output_type": "stream",
|
| 169 |
+
"text": [
|
| 170 |
+
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
|
| 171 |
+
]
|
| 172 |
+
}
|
| 173 |
+
],
|
| 174 |
+
"source": [
|
| 175 |
+
"long_summary = []\n",
|
| 176 |
+
"\n",
|
| 177 |
+
"for i in range(len(df_combined)):\n",
|
| 178 |
+
" t = bart_summarize(df_combined['all_review'][i])\n",
|
| 179 |
+
" long_summary.append(t)"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "code",
|
| 184 |
+
"execution_count": 8,
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [],
|
| 187 |
+
"source": [
|
| 188 |
+
"df_combined['summary'] = long_summary"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": 9,
|
| 194 |
+
"metadata": {},
|
| 195 |
+
"outputs": [],
|
| 196 |
+
"source": [
|
| 197 |
+
"df_combined.to_csv('df_combined_paris.csv',index=False)"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"cell_type": "code",
|
| 202 |
+
"execution_count": 10,
|
| 203 |
+
"metadata": {},
|
| 204 |
+
"outputs": [
|
| 205 |
+
{
|
| 206 |
+
"data": {
|
| 207 |
+
"text/html": [
|
| 208 |
+
"<div>\n",
|
| 209 |
+
"<style scoped>\n",
|
| 210 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 211 |
+
" vertical-align: middle;\n",
|
| 212 |
+
" }\n",
|
| 213 |
+
"\n",
|
| 214 |
+
" .dataframe tbody tr th {\n",
|
| 215 |
+
" vertical-align: top;\n",
|
| 216 |
+
" }\n",
|
| 217 |
+
"\n",
|
| 218 |
+
" .dataframe thead th {\n",
|
| 219 |
+
" text-align: right;\n",
|
| 220 |
+
" }\n",
|
| 221 |
+
"</style>\n",
|
| 222 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 223 |
+
" <thead>\n",
|
| 224 |
+
" <tr style=\"text-align: right;\">\n",
|
| 225 |
+
" <th></th>\n",
|
| 226 |
+
" <th>Hotel</th>\n",
|
| 227 |
+
" <th>all_review</th>\n",
|
| 228 |
+
" <th>summary</th>\n",
|
| 229 |
+
" </tr>\n",
|
| 230 |
+
" </thead>\n",
|
| 231 |
+
" <tbody>\n",
|
| 232 |
+
" <tr>\n",
|
| 233 |
+
" <th>0</th>\n",
|
| 234 |
+
" <td>25hours Hotel Terminus Nord</td>\n",
|
| 235 |
+
" <td>weve spent lots of time in paris and this was ...</td>\n",
|
| 236 |
+
" <td>we were blown away by this excellent hotel we ...</td>\n",
|
| 237 |
+
" </tr>\n",
|
| 238 |
+
" <tr>\n",
|
| 239 |
+
" <th>1</th>\n",
|
| 240 |
+
" <td>Acacias Etoile Hotel</td>\n",
|
| 241 |
+
" <td>the hotel is great for value the breakfast sel...</td>\n",
|
| 242 |
+
" <td>The hotel is great for value the breakfast sel...</td>\n",
|
| 243 |
+
" </tr>\n",
|
| 244 |
+
" <tr>\n",
|
| 245 |
+
" <th>2</th>\n",
|
| 246 |
+
" <td>COQ Hotel Paris</td>\n",
|
| 247 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 248 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 249 |
+
" </tr>\n",
|
| 250 |
+
" <tr>\n",
|
| 251 |
+
" <th>3</th>\n",
|
| 252 |
+
" <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
|
| 253 |
+
" <td>room was very clean transportation is very ne...</td>\n",
|
| 254 |
+
" <td>hotel turned out to be perfect for our short ...</td>\n",
|
| 255 |
+
" </tr>\n",
|
| 256 |
+
" <tr>\n",
|
| 257 |
+
" <th>4</th>\n",
|
| 258 |
+
" <td>Cler Hotel</td>\n",
|
| 259 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 260 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 261 |
+
" </tr>\n",
|
| 262 |
+
" </tbody>\n",
|
| 263 |
+
"</table>\n",
|
| 264 |
+
"</div>"
|
| 265 |
+
],
|
| 266 |
+
"text/plain": [
|
| 267 |
+
" Hotel \\\n",
|
| 268 |
+
"0 25hours Hotel Terminus Nord \n",
|
| 269 |
+
"1 Acacias Etoile Hotel \n",
|
| 270 |
+
"2 COQ Hotel Paris \n",
|
| 271 |
+
"3 Campanile Paris 14 - Maine Montparnasse \n",
|
| 272 |
+
"4 Cler Hotel \n",
|
| 273 |
+
"\n",
|
| 274 |
+
" all_review \\\n",
|
| 275 |
+
"0 weve spent lots of time in paris and this was ... \n",
|
| 276 |
+
"1 the hotel is great for value the breakfast sel... \n",
|
| 277 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 278 |
+
"3 room was very clean transportation is very ne... \n",
|
| 279 |
+
"4 we had the best stay at cler hotel the locati... \n",
|
| 280 |
+
"\n",
|
| 281 |
+
" summary \n",
|
| 282 |
+
"0 we were blown away by this excellent hotel we ... \n",
|
| 283 |
+
"1 The hotel is great for value the breakfast sel... \n",
|
| 284 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 285 |
+
"3 hotel turned out to be perfect for our short ... \n",
|
| 286 |
+
"4 we had the best stay at cler hotel the locati... "
|
| 287 |
+
]
|
| 288 |
+
},
|
| 289 |
+
"execution_count": 10,
|
| 290 |
+
"metadata": {},
|
| 291 |
+
"output_type": "execute_result"
|
| 292 |
+
}
|
| 293 |
+
],
|
| 294 |
+
"source": [
|
| 295 |
+
"df_combined.head()"
|
| 296 |
+
]
|
| 297 |
+
},
|
| 298 |
+
{
|
| 299 |
+
"cell_type": "code",
|
| 300 |
+
"execution_count": null,
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"outputs": [],
|
| 303 |
+
"source": []
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"cell_type": "code",
|
| 307 |
+
"execution_count": 3,
|
| 308 |
+
"metadata": {},
|
| 309 |
+
"outputs": [
|
| 310 |
+
{
|
| 311 |
+
"name": "stdout",
|
| 312 |
+
"output_type": "stream",
|
| 313 |
+
"text": [
|
| 314 |
+
"Dockerfile df_combined.csv\n",
|
| 315 |
+
"Hotel New York Combined.csv en_core_web_sm-3.2.0-py3-none-any.whl\n",
|
| 316 |
+
"README.md query_generator.ipynb\n",
|
| 317 |
+
"Untitled.ipynb requirements.txt\n",
|
| 318 |
+
"app.py summary.ipynb\n",
|
| 319 |
+
"app.yaml\n"
|
| 320 |
+
]
|
| 321 |
+
}
|
| 322 |
+
],
|
| 323 |
+
"source": [
|
| 324 |
+
"!ls"
|
| 325 |
+
]
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"cell_type": "code",
|
| 329 |
+
"execution_count": 3,
|
| 330 |
+
"metadata": {},
|
| 331 |
+
"outputs": [
|
| 332 |
+
{
|
| 333 |
+
"name": "stderr",
|
| 334 |
+
"output_type": "stream",
|
| 335 |
+
"text": [
|
| 336 |
+
"/Users/aimzlicious/miniforge3/envs/tf_m1/lib/python3.8/site-packages/huggingface_hub/snapshot_download.py:6: FutureWarning: snapshot_download.py has been made private and will no longer be available from version 0.11. Please use `from huggingface_hub import snapshot_download` to import the only public function in this module. Other members of the file may be changed without a deprecation notice.\n",
|
| 337 |
+
" warnings.warn(\n"
|
| 338 |
+
]
|
| 339 |
+
}
|
| 340 |
+
],
|
| 341 |
+
"source": [
|
| 342 |
+
"import pandas as pd\n",
|
| 343 |
+
"from sentence_transformers import SentenceTransformer\n",
|
| 344 |
+
"import scipy.spatial\n",
|
| 345 |
+
"import pickle as pkl\n",
|
| 346 |
+
"from sentence_transformers import SentenceTransformer, util\n",
|
| 347 |
+
"import torch\n",
|
| 348 |
+
"df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')"
|
| 349 |
+
]
|
| 350 |
+
},
|
| 351 |
+
{
|
| 352 |
+
"cell_type": "code",
|
| 353 |
+
"execution_count": 4,
|
| 354 |
+
"metadata": {},
|
| 355 |
+
"outputs": [
|
| 356 |
+
{
|
| 357 |
+
"data": {
|
| 358 |
+
"text/html": [
|
| 359 |
+
"<div>\n",
|
| 360 |
+
"<style scoped>\n",
|
| 361 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 362 |
+
" vertical-align: middle;\n",
|
| 363 |
+
" }\n",
|
| 364 |
+
"\n",
|
| 365 |
+
" .dataframe tbody tr th {\n",
|
| 366 |
+
" vertical-align: top;\n",
|
| 367 |
+
" }\n",
|
| 368 |
+
"\n",
|
| 369 |
+
" .dataframe thead th {\n",
|
| 370 |
+
" text-align: right;\n",
|
| 371 |
+
" }\n",
|
| 372 |
+
"</style>\n",
|
| 373 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 374 |
+
" <thead>\n",
|
| 375 |
+
" <tr style=\"text-align: right;\">\n",
|
| 376 |
+
" <th></th>\n",
|
| 377 |
+
" <th>Hotel</th>\n",
|
| 378 |
+
" <th>all_review</th>\n",
|
| 379 |
+
" <th>summary</th>\n",
|
| 380 |
+
" </tr>\n",
|
| 381 |
+
" </thead>\n",
|
| 382 |
+
" <tbody>\n",
|
| 383 |
+
" <tr>\n",
|
| 384 |
+
" <th>0</th>\n",
|
| 385 |
+
" <td>25hours Hotel Terminus Nord</td>\n",
|
| 386 |
+
" <td>weve spent lots of time in paris and this was ...</td>\n",
|
| 387 |
+
" <td>we were blown away by this excellent hotel we ...</td>\n",
|
| 388 |
+
" </tr>\n",
|
| 389 |
+
" <tr>\n",
|
| 390 |
+
" <th>1</th>\n",
|
| 391 |
+
" <td>Acacias Etoile Hotel</td>\n",
|
| 392 |
+
" <td>the hotel is great for value the breakfast sel...</td>\n",
|
| 393 |
+
" <td>The hotel is great for value the breakfast sel...</td>\n",
|
| 394 |
+
" </tr>\n",
|
| 395 |
+
" <tr>\n",
|
| 396 |
+
" <th>2</th>\n",
|
| 397 |
+
" <td>COQ Hotel Paris</td>\n",
|
| 398 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 399 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 400 |
+
" </tr>\n",
|
| 401 |
+
" <tr>\n",
|
| 402 |
+
" <th>3</th>\n",
|
| 403 |
+
" <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
|
| 404 |
+
" <td>room was very clean transportation is very ne...</td>\n",
|
| 405 |
+
" <td>hotel turned out to be perfect for our short ...</td>\n",
|
| 406 |
+
" </tr>\n",
|
| 407 |
+
" <tr>\n",
|
| 408 |
+
" <th>4</th>\n",
|
| 409 |
+
" <td>Cler Hotel</td>\n",
|
| 410 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 411 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 412 |
+
" </tr>\n",
|
| 413 |
+
" </tbody>\n",
|
| 414 |
+
"</table>\n",
|
| 415 |
+
"</div>"
|
| 416 |
+
],
|
| 417 |
+
"text/plain": [
|
| 418 |
+
" Hotel \\\n",
|
| 419 |
+
"0 25hours Hotel Terminus Nord \n",
|
| 420 |
+
"1 Acacias Etoile Hotel \n",
|
| 421 |
+
"2 COQ Hotel Paris \n",
|
| 422 |
+
"3 Campanile Paris 14 - Maine Montparnasse \n",
|
| 423 |
+
"4 Cler Hotel \n",
|
| 424 |
+
"\n",
|
| 425 |
+
" all_review \\\n",
|
| 426 |
+
"0 weve spent lots of time in paris and this was ... \n",
|
| 427 |
+
"1 the hotel is great for value the breakfast sel... \n",
|
| 428 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 429 |
+
"3 room was very clean transportation is very ne... \n",
|
| 430 |
+
"4 we had the best stay at cler hotel the locati... \n",
|
| 431 |
+
"\n",
|
| 432 |
+
" summary \n",
|
| 433 |
+
"0 we were blown away by this excellent hotel we ... \n",
|
| 434 |
+
"1 The hotel is great for value the breakfast sel... \n",
|
| 435 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 436 |
+
"3 hotel turned out to be perfect for our short ... \n",
|
| 437 |
+
"4 we had the best stay at cler hotel the locati... "
|
| 438 |
+
]
|
| 439 |
+
},
|
| 440 |
+
"execution_count": 4,
|
| 441 |
+
"metadata": {},
|
| 442 |
+
"output_type": "execute_result"
|
| 443 |
+
}
|
| 444 |
+
],
|
| 445 |
+
"source": [
|
| 446 |
+
"df_combined_paris_summary.head()"
|
| 447 |
+
]
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"cell_type": "code",
|
| 451 |
+
"execution_count": 5,
|
| 452 |
+
"metadata": {},
|
| 453 |
+
"outputs": [],
|
| 454 |
+
"source": [
|
| 455 |
+
"df_paris = pd.read_csv('paris_clean_newer.csv')"
|
| 456 |
+
]
|
| 457 |
+
},
|
| 458 |
+
{
|
| 459 |
+
"cell_type": "code",
|
| 460 |
+
"execution_count": 9,
|
| 461 |
+
"metadata": {},
|
| 462 |
+
"outputs": [],
|
| 463 |
+
"source": [
|
| 464 |
+
"hotel=pd.DataFrame(df_paris['Hotel'].drop_duplicates())"
|
| 465 |
+
]
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"cell_type": "code",
|
| 469 |
+
"execution_count": 11,
|
| 470 |
+
"metadata": {},
|
| 471 |
+
"outputs": [
|
| 472 |
+
{
|
| 473 |
+
"data": {
|
| 474 |
+
"text/html": [
|
| 475 |
+
"<div>\n",
|
| 476 |
+
"<style scoped>\n",
|
| 477 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 478 |
+
" vertical-align: middle;\n",
|
| 479 |
+
" }\n",
|
| 480 |
+
"\n",
|
| 481 |
+
" .dataframe tbody tr th {\n",
|
| 482 |
+
" vertical-align: top;\n",
|
| 483 |
+
" }\n",
|
| 484 |
+
"\n",
|
| 485 |
+
" .dataframe thead th {\n",
|
| 486 |
+
" text-align: right;\n",
|
| 487 |
+
" }\n",
|
| 488 |
+
"</style>\n",
|
| 489 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 490 |
+
" <thead>\n",
|
| 491 |
+
" <tr style=\"text-align: right;\">\n",
|
| 492 |
+
" <th></th>\n",
|
| 493 |
+
" <th>Hotel</th>\n",
|
| 494 |
+
" <th>all_review</th>\n",
|
| 495 |
+
" <th>summary</th>\n",
|
| 496 |
+
" </tr>\n",
|
| 497 |
+
" </thead>\n",
|
| 498 |
+
" <tbody>\n",
|
| 499 |
+
" <tr>\n",
|
| 500 |
+
" <th>0</th>\n",
|
| 501 |
+
" <td>25hours Hotel Terminus Nord</td>\n",
|
| 502 |
+
" <td>weve spent lots of time in paris and this was ...</td>\n",
|
| 503 |
+
" <td>we were blown away by this excellent hotel we ...</td>\n",
|
| 504 |
+
" </tr>\n",
|
| 505 |
+
" <tr>\n",
|
| 506 |
+
" <th>1</th>\n",
|
| 507 |
+
" <td>Acacias Etoile Hotel</td>\n",
|
| 508 |
+
" <td>the hotel is great for value the breakfast sel...</td>\n",
|
| 509 |
+
" <td>The hotel is great for value the breakfast sel...</td>\n",
|
| 510 |
+
" </tr>\n",
|
| 511 |
+
" <tr>\n",
|
| 512 |
+
" <th>2</th>\n",
|
| 513 |
+
" <td>COQ Hotel Paris</td>\n",
|
| 514 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 515 |
+
" <td>stayed for a short city break the hotel is a ...</td>\n",
|
| 516 |
+
" </tr>\n",
|
| 517 |
+
" <tr>\n",
|
| 518 |
+
" <th>3</th>\n",
|
| 519 |
+
" <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
|
| 520 |
+
" <td>room was very clean transportation is very ne...</td>\n",
|
| 521 |
+
" <td>hotel turned out to be perfect for our short ...</td>\n",
|
| 522 |
+
" </tr>\n",
|
| 523 |
+
" <tr>\n",
|
| 524 |
+
" <th>4</th>\n",
|
| 525 |
+
" <td>Cler Hotel</td>\n",
|
| 526 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 527 |
+
" <td>we had the best stay at cler hotel the locati...</td>\n",
|
| 528 |
+
" </tr>\n",
|
| 529 |
+
" <tr>\n",
|
| 530 |
+
" <th>...</th>\n",
|
| 531 |
+
" <td>...</td>\n",
|
| 532 |
+
" <td>...</td>\n",
|
| 533 |
+
" <td>...</td>\n",
|
| 534 |
+
" </tr>\n",
|
| 535 |
+
" <tr>\n",
|
| 536 |
+
" <th>89</th>\n",
|
| 537 |
+
" <td>Sofitel Paris Le Faubourg</td>\n",
|
| 538 |
+
" <td>4 years ago i was the last time at sofitel le ...</td>\n",
|
| 539 |
+
" <td>4 years ago i was the last time at sofitel le ...</td>\n",
|
| 540 |
+
" </tr>\n",
|
| 541 |
+
" <tr>\n",
|
| 542 |
+
" <th>90</th>\n",
|
| 543 |
+
" <td>St Christopher's Gare du Nord Paris</td>\n",
|
| 544 |
+
" <td>when arriving to the area it felt a little dan...</td>\n",
|
| 545 |
+
" <td>Barry is the best bartender in paris cheers gr...</td>\n",
|
| 546 |
+
" </tr>\n",
|
| 547 |
+
" <tr>\n",
|
| 548 |
+
" <th>91</th>\n",
|
| 549 |
+
" <td>St Christopher's Inn Canal Paris</td>\n",
|
| 550 |
+
" <td>ive stayed at st christopher inn canal in pari...</td>\n",
|
| 551 |
+
" <td>ive stayed at st christopher inn canal in pari...</td>\n",
|
| 552 |
+
" </tr>\n",
|
| 553 |
+
" <tr>\n",
|
| 554 |
+
" <th>92</th>\n",
|
| 555 |
+
" <td>Touring Hotel</td>\n",
|
| 556 |
+
" <td>hotel is in a great location minutes walk fro...</td>\n",
|
| 557 |
+
" <td>Hotel is in a great location minutes walk fro...</td>\n",
|
| 558 |
+
" </tr>\n",
|
| 559 |
+
" <tr>\n",
|
| 560 |
+
" <th>93</th>\n",
|
| 561 |
+
" <td>Warwick Paris</td>\n",
|
| 562 |
+
" <td>if i know of anybody heading to paris i will r...</td>\n",
|
| 563 |
+
" <td>warwick hotel in paris is a good hotel to stay...</td>\n",
|
| 564 |
+
" </tr>\n",
|
| 565 |
+
" </tbody>\n",
|
| 566 |
+
"</table>\n",
|
| 567 |
+
"<p>94 rows × 3 columns</p>\n",
|
| 568 |
+
"</div>"
|
| 569 |
+
],
|
| 570 |
+
"text/plain": [
|
| 571 |
+
" Hotel \\\n",
|
| 572 |
+
"0 25hours Hotel Terminus Nord \n",
|
| 573 |
+
"1 Acacias Etoile Hotel \n",
|
| 574 |
+
"2 COQ Hotel Paris \n",
|
| 575 |
+
"3 Campanile Paris 14 - Maine Montparnasse \n",
|
| 576 |
+
"4 Cler Hotel \n",
|
| 577 |
+
".. ... \n",
|
| 578 |
+
"89 Sofitel Paris Le Faubourg \n",
|
| 579 |
+
"90 St Christopher's Gare du Nord Paris \n",
|
| 580 |
+
"91 St Christopher's Inn Canal Paris \n",
|
| 581 |
+
"92 Touring Hotel \n",
|
| 582 |
+
"93 Warwick Paris \n",
|
| 583 |
+
"\n",
|
| 584 |
+
" all_review \\\n",
|
| 585 |
+
"0 weve spent lots of time in paris and this was ... \n",
|
| 586 |
+
"1 the hotel is great for value the breakfast sel... \n",
|
| 587 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 588 |
+
"3 room was very clean transportation is very ne... \n",
|
| 589 |
+
"4 we had the best stay at cler hotel the locati... \n",
|
| 590 |
+
".. ... \n",
|
| 591 |
+
"89 4 years ago i was the last time at sofitel le ... \n",
|
| 592 |
+
"90 when arriving to the area it felt a little dan... \n",
|
| 593 |
+
"91 ive stayed at st christopher inn canal in pari... \n",
|
| 594 |
+
"92 hotel is in a great location minutes walk fro... \n",
|
| 595 |
+
"93 if i know of anybody heading to paris i will r... \n",
|
| 596 |
+
"\n",
|
| 597 |
+
" summary \n",
|
| 598 |
+
"0 we were blown away by this excellent hotel we ... \n",
|
| 599 |
+
"1 The hotel is great for value the breakfast sel... \n",
|
| 600 |
+
"2 stayed for a short city break the hotel is a ... \n",
|
| 601 |
+
"3 hotel turned out to be perfect for our short ... \n",
|
| 602 |
+
"4 we had the best stay at cler hotel the locati... \n",
|
| 603 |
+
".. ... \n",
|
| 604 |
+
"89 4 years ago i was the last time at sofitel le ... \n",
|
| 605 |
+
"90 Barry is the best bartender in paris cheers gr... \n",
|
| 606 |
+
"91 ive stayed at st christopher inn canal in pari... \n",
|
| 607 |
+
"92 Hotel is in a great location minutes walk fro... \n",
|
| 608 |
+
"93 warwick hotel in paris is a good hotel to stay... \n",
|
| 609 |
+
"\n",
|
| 610 |
+
"[94 rows x 3 columns]"
|
| 611 |
+
]
|
| 612 |
+
},
|
| 613 |
+
"execution_count": 11,
|
| 614 |
+
"metadata": {},
|
| 615 |
+
"output_type": "execute_result"
|
| 616 |
+
}
|
| 617 |
+
],
|
| 618 |
+
"source": [
|
| 619 |
+
"df_combined_paris_summary.merge(hotel,how='left')"
|
| 620 |
+
]
|
| 621 |
+
},
|
| 622 |
+
{
|
| 623 |
+
"cell_type": "code",
|
| 624 |
+
"execution_count": null,
|
| 625 |
+
"metadata": {},
|
| 626 |
+
"outputs": [],
|
| 627 |
+
"source": []
|
| 628 |
+
}
|
| 629 |
+
],
|
| 630 |
+
"metadata": {
|
| 631 |
+
"interpreter": {
|
| 632 |
+
"hash": "4bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
|
| 633 |
+
},
|
| 634 |
+
"kernelspec": {
|
| 635 |
+
"display_name": "Python 3 (ipykernel)",
|
| 636 |
+
"language": "python",
|
| 637 |
+
"name": "python3"
|
| 638 |
+
},
|
| 639 |
+
"language_info": {
|
| 640 |
+
"codemirror_mode": {
|
| 641 |
+
"name": "ipython",
|
| 642 |
+
"version": 3
|
| 643 |
+
},
|
| 644 |
+
"file_extension": ".py",
|
| 645 |
+
"mimetype": "text/x-python",
|
| 646 |
+
"name": "python",
|
| 647 |
+
"nbconvert_exporter": "python",
|
| 648 |
+
"pygments_lexer": "ipython3",
|
| 649 |
+
"version": "3.8.12"
|
| 650 |
+
}
|
| 651 |
+
},
|
| 652 |
+
"nbformat": 4,
|
| 653 |
+
"nbformat_minor": 4
|
| 654 |
+
}
|
tokenized_corpus.pickle
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
|
| 3 |
+
size 1261235
|