Spaces:

goliathaiconsulting
/

ecommerce-platform

Runtime error

App Files Files Community

Michelangiolo committed on Mar 26, 2023

Commit

cf172ac

1 Parent(s): 77a74de

changes

Browse files

Files changed (4) hide show

1_data_processing.ipynb +215 -0
2_gradio.ipynb +145 -0
app.py +36 -0
df_encoded.parquet +3 -0

1_data_processing.ipynb ADDED Viewed

	@@ -0,0 +1,215 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install sentence-transformers==2.0.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1. Load dataset with pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Description</th>\n",
+       "      <th>UnitPrice</th>\n",
+       "      <th>Country</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>WHITE HANGING HEART T-LIGHT HOLDER</td>\n",
+       "      <td>2.55</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>WHITE METAL LANTERN</td>\n",
+       "      <td>3.39</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>CREAM CUPID HEARTS COAT HANGER</td>\n",
+       "      <td>2.75</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>KNITTED UNION FLAG HOT WATER BOTTLE</td>\n",
+       "      <td>3.39</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>RED WOOLLY HOTTIE WHITE HEART.</td>\n",
+       "      <td>3.39</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>535327</th>\n",
+       "      <td>????damages????</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>535329</th>\n",
+       "      <td>mixed up</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>535335</th>\n",
+       "      <td>lost</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>537621</th>\n",
+       "      <td>CREAM HANGING HEART T-LIGHT HOLDER</td>\n",
+       "      <td>2.95</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>540421</th>\n",
+       "      <td>PAPER CRAFT , LITTLE BIRDIE</td>\n",
+       "      <td>2.08</td>\n",
+       "      <td>United Kingdom</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>4223 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                Description  UnitPrice         Country\n",
+       "0        WHITE HANGING HEART T-LIGHT HOLDER       2.55  United Kingdom\n",
+       "1                       WHITE METAL LANTERN       3.39  United Kingdom\n",
+       "2            CREAM CUPID HEARTS COAT HANGER       2.75  United Kingdom\n",
+       "3       KNITTED UNION FLAG HOT WATER BOTTLE       3.39  United Kingdom\n",
+       "4            RED WOOLLY HOTTIE WHITE HEART.       3.39  United Kingdom\n",
+       "...                                     ...        ...             ...\n",
+       "535327                      ????damages????       0.00  United Kingdom\n",
+       "535329                             mixed up       0.00  United Kingdom\n",
+       "535335                                 lost       0.00  United Kingdom\n",
+       "537621   CREAM HANGING HEART T-LIGHT HOLDER       2.95  United Kingdom\n",
+       "540421          PAPER CRAFT , LITTLE BIRDIE       2.08  United Kingdom\n",
+       "\n",
+       "[4223 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = pd.read_csv('products.csv')\n",
+    "df = df[['Description', 'UnitPrice', 'Country']]\n",
+    "df = df.dropna().drop_duplicates(subset=['Description'])\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2. Encode every product description into a vector (1 column with product text, 1 column with vectors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "tqdm.pandas()\n",
+    "\n",
+    "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
+    "\n",
+    "#encode df version: for small dataset only\n",
+    "df['text_vector_'] = df['Description'].progress_apply(lambda x : model.encode(x).tolist())\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_parquet('df_encoded.parquet', index=None)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.0 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "fdf377d643bc1cb065454f0ad2ceac75d834452ecf289e7ba92c6b3f59a7cee1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

2_gradio.ipynb ADDED Viewed

	@@ -0,0 +1,145 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install sentence-transformers==2.0.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "df = pd.read_parquet('df_encoded.parquet')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.neighbors import NearestNeighbors\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "\n",
+    "# model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2\n",
+    "\n",
+    "#prepare model\n",
+    "nbrs = NearestNeighbors(n_neighbors=8, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def search(query):\n",
+    "    product = model.encode(query).tolist()\n",
+    "    # product = df.iloc[0]['text_vector_'] #use one of the products as sample\n",
+    "\n",
+    "    distances, indices = nbrs.kneighbors([product]) #input the vector of the reference object\n",
+    "\n",
+    "    #print out the description of every recommended product\n",
+    "    return df.iloc[list(indices)[0]][['Description', 'UnitPrice', 'Country']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7860\n",
+      "\n",
+      "To create a public link, set `share=True` in `launch()`.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "import os\n",
+    "\n",
+    "#the first module becomes text1, the second module file1\n",
+    "def greet(text1): \n",
+    "    return search(text1)\n",
+    "\n",
+    "iface = gr.Interface(fn=greet, inputs=['text'], outputs=[\"dataframe\"])\n",
+    "iface.launch(share=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.0 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "fdf377d643bc1cb065454f0ad2ceac75d834452ecf289e7ba92c6b3f59a7cee1"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

app.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import os
+# Install dependencies at startup; this has to run before the imports below,
+# which need sentence-transformers to be present.
+# NOTE(review): the exit status returned by os.system is ignored, so a failed
+# install only surfaces later as an ImportError. openpyxl is not imported
+# anywhere in the visible code -- confirm it is still required.
+os.system('pip install openpyxl')
+os.system('pip install sentence-transformers')
+import pandas as pd
+import gradio as gr
+from sentence_transformers import SentenceTransformer
+from sklearn.neighbors import NearestNeighbors
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+# Embedding model used to encode search queries. 1_data_processing.ipynb builds
+# the stored `text_vector_` column with the same model name, so query vectors
+# and product vectors come from the same encoder.
+model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
+# Product table (Description, UnitPrice, Country) plus the precomputed
+# `text_vector_` embeddings written by 1_data_processing.ipynb.
+df = pd.read_parquet('df_encoded.parquet')
+# Fit the nearest-neighbour index once at import time on the stored vectors;
+# n_neighbors=8 is the number of rows each kneighbors() call returns by default.
+nbrs = NearestNeighbors(n_neighbors=8, algorithm='ball_tree').fit(df['text_vector_'].values.tolist())
+def search(df, query):
+    """Return the rows of `df` whose stored vectors are closest to `query`.
+
+    The query text is encoded with the module-level `model`, looked up in the
+    module-level `nbrs` index, and the neighbour positions are used to select
+    rows from `df` by position.
+
+    Args:
+        df: DataFrame to pick result rows from. Presumably the same frame the
+            `nbrs` index was fitted on, since positions are applied with iloc.
+        query: Free-text search string.
+
+    Returns:
+        A DataFrame with the 'Description', 'UnitPrice' and 'Country' columns
+        of the nearest rows.
+    """
+    query_vector = model.encode(query).tolist()
+    # kneighbors expects a batch of vectors, so the single query is wrapped in
+    # a list; only the neighbour positions are needed, not the distances.
+    _, neighbor_positions = nbrs.kneighbors([query_vector])
+    nearest_rows = df.iloc[neighbor_positions[0]]
+    return nearest_rows[['Description', 'UnitPrice', 'Country']]
+import gradio as gr
+import os
+# Gradio callback. The single 'text' input declared on the Interface below is
+# passed in as `text1`; this app has no second (file) input.
+def greet(text1):
+    # Delegate to search() using the module-level product DataFrame.
+    return search(df, text1)
+# Wire greet() to a UI with one text input; the result is shown as a dataframe.
+iface = gr.Interface(fn=greet, inputs=['text'], outputs=["dataframe"])
+# share=False: do not request a public share link when launching.
+iface.launch(share=False)

df_encoded.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46c74a19104ae10b2c173f39825f3e08174e0f5f213c2e2392d95ca364e49c60
+size 20362183