Upload 4 files
Browse files- app.py +77 -0
- cities.csv +0 -0
- df_processed.csv +0 -0
- requirements.txt +4 -0
app.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load data.
# NOTE(review): rows of df_processed.csv are assumed to align 1:1 with rows of
# cities.csv (row i = feature vector of city i) -- TODO confirm against the
# preprocessing step that exported it.
df = pd.read_csv("cities.csv")
df_preprocessed = np.loadtxt("df_processed.csv", delimiter=',')  # exported after preprocessing step in lecture example

st.title("๐ City Similarity Recommender")

# City selection
selected_places = st.multiselect("Select one or more cities you like:", options=df['place'].tolist())

# Optional sub-region filter
subregion_options = ['All'] + sorted(df['sub-region'].unique().tolist())
selected_subregion = st.selectbox("Optionally filter recommended city by sub-region:", options=subregion_options)

if selected_places:
    # Positional indices of the selected cities. read_csv yields a default
    # RangeIndex, so label index == row position here.
    selected_indices = df[df['place'].isin(selected_places)].index.tolist()

    # Average the selected cities' feature vectors into a single query point.
    avg_vector = df_preprocessed[selected_indices].mean(axis=0).reshape(1, -1)

    # Build the candidate pool: optionally restricted to one sub-region, and
    # always excluding the cities the user already picked.
    if selected_subregion != 'All':
        candidate_mask = (df['sub-region'] == selected_subregion).to_numpy()
    else:
        candidate_mask = np.ones(len(df), dtype=bool)
    candidate_mask[selected_indices] = False

    candidate_indices = np.where(candidate_mask)[0]
    candidate_vectors = df_preprocessed[candidate_indices]

    if len(candidate_indices) == 0:
        # e.g. the chosen sub-region contains only the already-selected cities
        st.warning("No candidate cities match the current filter.")
    else:
        # Use NearestNeighbors to find the closest candidates. n_neighbors is
        # bounded by the pool size: kneighbors() raises a ValueError if asked
        # for more neighbors than fitted samples (small sub-regions trigger this).
        n_results = min(3, len(candidate_indices))
        nn = NearestNeighbors(n_neighbors=n_results, metric='euclidean')
        nn.fit(candidate_vectors)
        distances, indices = nn.kneighbors(avg_vector)

        # Map positions inside the candidate pool back to dataframe row numbers.
        top_indices = [candidate_indices[i] for i in indices[0]]

        st.subheader("๐ Top 3 Similar Cities")
        for rank, idx in enumerate(top_indices, 1):
            city = df.iloc[idx]
            st.markdown(f"### {rank}. {city['place']}")
            st.markdown(f"Country: {city['alpha-2']}")
            st.markdown(f"Region: {city['region']}")
            st.markdown(f"Sub-region: {city['sub-region']}")
            st.markdown(f"Similarity score (Euclidean distance): {round(distances[0][rank - 1], 4)}")
            st.markdown("---")
else:
    st.info("Please select at least one city to get a recommendation.")

# --- Explanatory Sections ---

with st.expander("๐ง Technique Used"):
    st.markdown("""
This recommender uses **unsupervised learning techniques** to find the most similar cities based on selected examples. Here's how it works:

- The app uses **scaled numerical features** that describe cities in terms of cost, safety, infrastructure, and lifestyle.
- When you select cities you like, their feature vectors are averaged.
- We then use **k-Nearest Neighbors (k-NN)** to find the top 3 most similar cities based on **Euclidean distance**.
- Optionally, results can be limited to a **specific sub-region**.
""")

with st.expander("๐ Why this is Unsupervised Learning"):
    st.markdown("""
This approach is a classic example of **unsupervised learning (UML)**:

- There are **no labels or targets** provided โ we donโt tell the model what to predict.
- Instead, we explore the **structure of the data** to find **similarities** and groupings.
- The recommendation is based purely on feature-based proximity โ no training labels are used.

Techniques like clustering and similarity search (like k-NN) are key tools in the UML toolbox โ making this system a real-life application of UML.
""")
|
cities.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
df_processed.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pandas
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|