|
|
import streamlit as st
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
from sklearn.neighbors import NearestNeighbors
|
|
|
|
|
|
|
|
|
# City metadata table. Columns 'place', 'alpha-2', 'region' and 'sub-region'
# are read from it further down in this script.
# NOTE(review): pandas parses the literal text "NA" as a missing value by
# default, so an 'alpha-2' code of "NA" would load as NaN - confirm against
# the data file.
CITY_TABLE_PATH = "cities.csv"

# Comma-separated numeric feature matrix.
# NOTE(review): presumably one row per city, in the same order as the rows of
# the city table (rows are looked up by DataFrame position later) - confirm.
FEATURE_MATRIX_PATH = "df_processed.csv"

df = pd.read_csv(CITY_TABLE_PATH)
df_preprocessed = np.loadtxt(FEATURE_MATRIX_PATH, delimiter=",")
|
|
|
|
|
|
# Page header.
# NOTE(review): the leading "π" looks like a mis-decoded emoji; the intended
# character cannot be recovered from this file, so the text is left untouched.
st.title("π City Similarity Recommender")

# Cities the user likes; the recommendation query is built from these.
selected_places = st.multiselect(
    "Select one or more cities you like:",
    options=df['place'].tolist(),
)

# Sub-region choices, with 'All' meaning "no filter". Missing values are
# dropped before sorting: NaN is a float and cannot be ordered against the
# string names, so sorted() would raise TypeError on a column with gaps.
subregion_names = df['sub-region'].dropna().unique().tolist()
subregion_options = ['All'] + sorted(subregion_names)

selected_subregion = st.selectbox(
    "Optionally filter recommended city by sub-region:",
    options=subregion_options,
)
|
|
|
|
|
|
if selected_places:
    # Boolean row mask of the chosen cities, then their row positions.
    # Positions (rather than index labels) are used because the same numbers
    # index the rows of the `df_preprocessed` array.
    # NOTE(review): assumes `df_preprocessed` rows line up with `df` rows - confirm.
    is_selected = df['place'].isin(selected_places).to_numpy()
    selected_indices = np.flatnonzero(is_selected).tolist()

    # Query point: mean feature vector of the chosen cities, shaped
    # (1, n_features) because kneighbors() takes a 2-D array of queries.
    avg_vector = df_preprocessed[selected_indices].mean(axis=0).reshape(1, -1)

    # Candidates are all cities the user did not pick, optionally narrowed to
    # one sub-region. The mask is a plain NumPy boolean array in both cases.
    candidate_mask = ~is_selected
    if selected_subregion != 'All':
        candidate_mask &= (df['sub-region'] == selected_subregion).to_numpy()

    candidate_indices = np.flatnonzero(candidate_mask)

    # Never ask for more neighbours than there are candidates: kneighbors()
    # raises ValueError otherwise (e.g. a sub-region with one or two cities).
    n_results = min(3, len(candidate_indices))

    if n_results == 0:
        # Fitting on an empty array would raise as well, so report it instead.
        st.warning("No other cities are available to recommend for this selection.")
    else:
        candidate_vectors = df_preprocessed[candidate_indices]

        nn = NearestNeighbors(n_neighbors=n_results, metric='euclidean')
        nn.fit(candidate_vectors)
        distances, indices = nn.kneighbors(avg_vector)

        # kneighbors() returns positions within `candidate_vectors`; map them
        # back to row positions in the full city table.
        top_indices = candidate_indices[indices[0]]

        # NOTE(review): the leading "π" looks like a mis-decoded emoji; left as-is.
        st.subheader(f"π Top {n_results} Similar Cities")

        for rank, (idx, distance) in enumerate(zip(top_indices, distances[0]), 1):
            city = df.iloc[idx]
            st.markdown(f"### {rank}. {city['place']}")
            st.markdown(f"Country: {city['alpha-2']}")
            st.markdown(f"Region: {city['region']}")
            st.markdown(f"Sub-region: {city['sub-region']}")
            st.markdown(f"Similarity score (Euclidean distance): {round(distance, 4)}")
            st.markdown("---")
else:
    st.info("Please select at least one city to get a recommendation.")
|
|
|
|
|
|
|
|
|
|
|
|
# Markdown shown inside the "Technique Used" expander: a short description of
# how the recommendation above is produced.
technique_notes = """
This recommender uses **unsupervised learning techniques** to find the most similar cities based on selected examples. Here's how it works:

- The app uses **scaled numerical features** that describe cities in terms of cost, safety, infrastructure, and lifestyle.
- When you select cities you like, their feature vectors are averaged.
- We then use **k-Nearest Neighbors (k-NN)** to find the top 3 most similar cities based on **Euclidean distance**.
- Optionally, results can be limited to a **specific sub-region**.
"""

# NOTE(review): the "π§" prefix looks like a mis-decoded emoji; left as-is.
with st.expander("π§ Technique Used"):
    st.markdown(technique_notes)
|
|
|
|
|
|
# Markdown shown inside the "Why this is Unsupervised Learning" expander.
# The punctuation in this text had been mis-decoded ("β" in place of an
# apostrophe and of dashes); it is restored here so it renders correctly.
unsupervised_notes = """
This approach is a classic example of **unsupervised learning (UML)**:

- There are **no labels or targets** provided — we don't tell the model what to predict.
- Instead, we explore the **structure of the data** to find **similarities** and groupings.
- The recommendation is based purely on feature-based proximity — no training labels are used.

Techniques like clustering and similarity search (like k-NN) are key tools in the UML toolbox — making this system a real-life application of UML.
"""

# NOTE(review): the leading "π" in the label looks like a mis-decoded emoji;
# the intended character cannot be recovered from this file, so it is kept.
with st.expander("π Why this is Unsupervised Learning"):
    st.markdown(unsupervised_notes)
|
|
|
|