"""Streamlit app: recommend cities similar to a user-selected set.

Averages the preprocessed feature vectors of the chosen cities and uses
k-nearest neighbors (Euclidean distance) to surface the closest matches,
optionally restricted to a single sub-region.
"""
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Load data.
df = pd.read_csv("cities.csv")
# Exported after the preprocessing step in the lecture example; assumed to be
# row-aligned with df (row i of the matrix describes df.iloc[i]) -- TODO confirm.
df_preprocessed = np.loadtxt("df_processed.csv", delimiter=',')

st.title("🌍 City Similarity Recommender")

# City selection.
selected_places = st.multiselect(
    "Select one or more cities you like:",
    options=df['place'].tolist(),
)

# Optional sub-region filter.
subregion_options = ['All'] + sorted(df['sub-region'].unique().tolist())
selected_subregion = st.selectbox(
    "Optionally filter recommended city by sub-region:",
    options=subregion_options,
)

if selected_places:
    # Positional indices of the selected cities (df comes straight from
    # read_csv, so its RangeIndex labels coincide with positions).
    selected_indices = df[df['place'].isin(selected_places)].index.tolist()

    # Average feature vector of the liked cities; reshaped to the (1, n_features)
    # sample shape kneighbors expects.
    avg_vector = df_preprocessed[selected_indices].mean(axis=0).reshape(1, -1)

    # Build a boolean candidate mask as a NumPy array so indexing is positional,
    # then always exclude the already-selected cities (a city should not be
    # recommended to someone who picked it).
    if selected_subregion != 'All':
        candidate_mask = (df['sub-region'] == selected_subregion).to_numpy()
    else:
        candidate_mask = np.ones(len(df), dtype=bool)
    candidate_mask[selected_indices] = False

    candidate_indices = np.where(candidate_mask)[0]

    if len(candidate_indices) == 0:
        # E.g. the chosen sub-region contains only the cities already selected.
        st.warning("No candidate cities match the current filter.")
    else:
        candidate_vectors = df_preprocessed[candidate_indices]

        # NearestNeighbors raises ValueError when n_neighbors exceeds the number
        # of fitted samples, so clamp to the candidate count.
        n_neighbors = min(3, len(candidate_indices))
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
        nn.fit(candidate_vectors)
        distances, indices = nn.kneighbors(avg_vector)

        # Map neighbor positions (relative to candidate_vectors) back to df rows.
        top_indices = [candidate_indices[i] for i in indices[0]]

        st.subheader("🔍 Top 3 Similar Cities")
        for rank, idx in enumerate(top_indices, 1):
            city = df.iloc[idx]
            st.markdown(f"### {rank}. {city['place']}")
            st.markdown(f"Country: {city['alpha-2']}")
            st.markdown(f"Region: {city['region']}")
            st.markdown(f"Sub-region: {city['sub-region']}")
            st.markdown(f"Similarity score (Euclidean distance): {round(distances[0][rank - 1], 4)}")
            st.markdown("---")
else:
    st.info("Please select at least one city to get a recommendation.")

# --- Explanatory Sections ---
with st.expander("🧠 Technique Used"):
    st.markdown("""
This recommender uses **unsupervised learning techniques** to find the most similar cities based on selected examples. Here's how it works:

- The app uses **scaled numerical features** that describe cities in terms of cost, safety, infrastructure, and lifestyle.
- When you select cities you like, their feature vectors are averaged.
- We then use **k-Nearest Neighbors (k-NN)** to find the top 3 most similar cities based on **Euclidean distance**.
- Optionally, results can be limited to a **specific sub-region**.
""")

with st.expander("📚 Why this is Unsupervised Learning"):
    st.markdown("""
This approach is a classic example of **unsupervised learning (UML)**:

- There are **no labels or targets** provided — we don’t tell the model what to predict.
- Instead, we explore the **structure of the data** to find **similarities** and groupings.
- The recommendation is based purely on feature-based proximity — no training labels are used.

Techniques like clustering and similarity search (like k-NN) are key tools in the UML toolbox — making this system a real-life application of UML.
""")