"""Streamlit app: recommend cities similar to a user-selected set.

Averages the preprocessed feature vectors of the chosen cities and uses
k-nearest neighbors (Euclidean distance) to surface the closest matches,
optionally restricted to a single sub-region.
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load data.
df = pd.read_csv("cities.csv")
# Exported after the preprocessing step in the lecture example; assumed to be
# row-aligned with df (row i of the matrix describes df.iloc[i]) -- TODO confirm.
df_preprocessed = np.loadtxt("df_processed.csv", delimiter=',')

st.title("🌍 City Similarity Recommender")

# City selection.
selected_places = st.multiselect(
    "Select one or more cities you like:",
    options=df['place'].tolist(),
)

# Optional sub-region filter.
subregion_options = ['All'] + sorted(df['sub-region'].unique().tolist())
selected_subregion = st.selectbox(
    "Optionally filter recommended city by sub-region:",
    options=subregion_options,
)

if selected_places:
    # Positional indices of the selected cities (df comes straight from
    # read_csv, so its RangeIndex labels coincide with positions).
    selected_indices = df[df['place'].isin(selected_places)].index.tolist()

    # Average feature vector of the liked cities; reshaped to the (1, n_features)
    # sample shape kneighbors expects.
    avg_vector = df_preprocessed[selected_indices].mean(axis=0).reshape(1, -1)

    # Build a boolean candidate mask as a NumPy array so indexing is positional,
    # then always exclude the already-selected cities (a city should not be
    # recommended to someone who picked it).
    if selected_subregion != 'All':
        candidate_mask = (df['sub-region'] == selected_subregion).to_numpy()
    else:
        candidate_mask = np.ones(len(df), dtype=bool)
    candidate_mask[selected_indices] = False

    candidate_indices = np.where(candidate_mask)[0]

    if len(candidate_indices) == 0:
        # E.g. the chosen sub-region contains only the cities already selected.
        st.warning("No candidate cities match the current filter.")
    else:
        candidate_vectors = df_preprocessed[candidate_indices]

        # NearestNeighbors raises ValueError when n_neighbors exceeds the number
        # of fitted samples, so clamp to the candidate count.
        n_neighbors = min(3, len(candidate_indices))
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
        nn.fit(candidate_vectors)
        distances, indices = nn.kneighbors(avg_vector)

        # Map neighbor positions (relative to candidate_vectors) back to df rows.
        top_indices = [candidate_indices[i] for i in indices[0]]

        st.subheader("🔍 Top 3 Similar Cities")
        for rank, idx in enumerate(top_indices, 1):
            city = df.iloc[idx]
            st.markdown(f"### {rank}. {city['place']}")
            st.markdown(f"Country: {city['alpha-2']}")
            st.markdown(f"Region: {city['region']}")
            st.markdown(f"Sub-region: {city['sub-region']}")
            st.markdown(f"Similarity score (Euclidean distance): {round(distances[0][rank - 1], 4)}")
            st.markdown("---")
else:
    st.info("Please select at least one city to get a recommendation.")

# --- Explanatory Sections ---
with st.expander("🧠 Technique Used"):
    st.markdown("""
This recommender uses **unsupervised learning techniques** to find the most similar cities based on selected examples. Here's how it works:

- The app uses **scaled numerical features** that describe cities in terms of cost, safety, infrastructure, and lifestyle.
- When you select cities you like, their feature vectors are averaged.
- We then use **k-Nearest Neighbors (k-NN)** to find the top 3 most similar cities based on **Euclidean distance**.
- Optionally, results can be limited to a **specific sub-region**.
""")

with st.expander("📚 Why this is Unsupervised Learning"):
    st.markdown("""
This approach is a classic example of **unsupervised learning (UML)**:

- There are **no labels or targets** provided — we don’t tell the model what to predict.
- Instead, we explore the **structure of the data** to find **similarities** and groupings.
- The recommendation is based purely on feature-based proximity — no training labels are used.

Techniques like clustering and similarity search (like k-NN) are key tools in the UML toolbox — making this system a real-life application of UML.
""")