saoter commited on
Commit
f20ae83
·
verified ·
1 Parent(s): e8f76cf

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +77 -0
  2. cities.csv +0 -0
  3. df_processed.csv +0 -0
  4. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# --- Data loading ---
# cities.csv holds the raw city metadata (place, alpha-2, region, sub-region, ...).
# df_processed.csv is the scaled numeric feature matrix exported after the
# preprocessing step in the lecture example; its row order must match cities.csv.
df = pd.read_csv("cities.csv")
df_preprocessed = np.loadtxt("df_processed.csv", delimiter=',')  # exported after preprocessing step in lecture example

# NOTE(review): several UI strings below appear to contain mojibake (UTF-8
# emoji/punctuation decoded with the wrong codepage, e.g. "๐ŸŒ", "โ€”").
# They are kept byte-identical here; confirm the intended characters and
# re-save the file as UTF-8.
st.title("๐ŸŒ City Similarity Recommender")

# City selection: the user picks one or more cities they already like.
selected_places = st.multiselect("Select one or more cities you like:", options=df['place'].tolist())

# Optional sub-region filter for the recommendations.
subregion_options = ['All'] + sorted(df['sub-region'].unique().tolist())
selected_subregion = st.selectbox("Optionally filter recommended city by sub-region:", options=subregion_options)

if selected_places:
    # Positional indices of the selected cities (df has a default RangeIndex,
    # so label and position coincide); used for averaging and for exclusion.
    selected_indices = df[df['place'].isin(selected_places)].index.tolist()

    # The "taste profile": average feature vector of the selected cities,
    # shaped (1, n_features) for kneighbors().
    avg_vector = df_preprocessed[selected_indices].mean(axis=0).reshape(1, -1)

    # Build the candidate pool, optionally restricted to one sub-region.
    if selected_subregion != 'All':
        candidate_mask = (df['sub-region'] == selected_subregion).to_numpy()
    else:
        candidate_mask = np.ones(len(df), dtype=bool)
    # Never recommend a city the user already selected.
    candidate_mask[selected_indices] = False

    candidate_indices = np.where(candidate_mask)[0]

    if len(candidate_indices) == 0:
        # Guard: the sub-region filter (or exclusions) left no candidates;
        # nn.fit() would otherwise crash on an empty matrix.
        st.warning("No candidate cities match the selected filter. Try a different sub-region.")
    else:
        candidate_vectors = df_preprocessed[candidate_indices]

        # Use NearestNeighbors to find the closest candidates.
        # Fix: cap n_neighbors at the candidate count — a hard-coded 3 raises
        # ValueError when a narrow sub-region filter leaves fewer than 3 cities.
        n_neighbors = min(3, len(candidate_indices))
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
        nn.fit(candidate_vectors)
        distances, indices = nn.kneighbors(avg_vector)

        # Map neighbor positions (within the candidate subset) back to df rows.
        top_indices = [candidate_indices[i] for i in indices[0]]

        st.subheader("๐Ÿ” Top 3 Similar Cities")
        for rank, idx in enumerate(top_indices, 1):
            city = df.iloc[idx]
            st.markdown(f"### {rank}. {city['place']}")
            st.markdown(f"Country: {city['alpha-2']}")
            st.markdown(f"Region: {city['region']}")
            st.markdown(f"Sub-region: {city['sub-region']}")
            st.markdown(f"Similarity score (Euclidean distance): {round(distances[0][rank - 1], 4)}")
            st.markdown("---")
else:
    st.info("Please select at least one city to get a recommendation.")

# --- Explanatory Sections ---

with st.expander("๐Ÿง  Technique Used"):
    st.markdown("""
This recommender uses **unsupervised learning techniques** to find the most similar cities based on selected examples. Here's how it works:

- The app uses **scaled numerical features** that describe cities in terms of cost, safety, infrastructure, and lifestyle.
- When you select cities you like, their feature vectors are averaged.
- We then use **k-Nearest Neighbors (k-NN)** to find the top 3 most similar cities based on **Euclidean distance**.
- Optionally, results can be limited to a **specific sub-region**.
""")

with st.expander("๐Ÿ“š Why this is Unsupervised Learning"):
    st.markdown("""
This approach is a classic example of **unsupervised learning (UML)**:

- There are **no labels or targets** provided โ€” we donโ€™t tell the model what to predict.
- Instead, we explore the **structure of the data** to find **similarities** and groupings.
- The recommendation is based purely on feature-based proximity โ€” no training labels are used.

Techniques like clustering and similarity search (like k-NN) are key tools in the UML toolbox โ€” making this system a real-life application of UML.
""")
cities.csv ADDED
The diff for this file is too large to render. See raw diff
 
df_processed.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn