"""Aalborg Rentals — rapid POC.

Streamlit app combining three techniques on Boligportal listing data:

* **NLP** — TF-IDF + cosine similarity to match a free-text preference
  description against listing descriptions.
* **ML** — a linear model estimating a "fair rent" to surface listings that
  look under- or over-priced.
* **Network analysis** — a co-occurrence graph of amenity flags.

Run with ``streamlit run app.py``; expects ``apt.csv`` next to this file.
"""
from collections import Counter

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt

st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")


@st.cache_data
def load_data(path: str) -> pd.DataFrame:
    """Load the listings CSV and apply light cleaning.

    Steps:
      * fall back to ``rent_raw`` when ``rent_dkk`` is absent;
      * coerce the expected numeric columns (junk becomes NaN);
      * normalise boolean-ish flag columns to 0/1 ints (NaN -> 0);
      * derive ``rent_per_m2`` where missing;
      * drop rows with non-positive rent/size or non-finite rent_per_m2;
      * heuristically extract ``neighborhood`` from the last comma-separated
        part of the address, defaulting to "Aalborg";
      * guarantee a ``description`` column exists.
    """
    df = pd.read_csv(path)

    # Some scrapes only carry the raw rent column.
    if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
        df["rent_dkk"] = df["rent_raw"]

    numeric_candidates = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor",
                          "deposit_dkk", "move_in_price", "lat", "lon"]
    for c in numeric_candidates:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Flag columns: missing is treated as "no/unknown" -> 0.
    for b in ["furnished", "shared_friendly", "pets_allowed", "elevator"]:
        if b in df.columns:
            df[b] = df[b].astype("float").fillna(0.0).astype(int)

    # NOTE: rent_per_m2 is derived from the target rent — it must never be
    # fed back into the fair-price model (see tab_fair below).
    if "rent_per_m2" in df.columns:
        df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
    else:
        df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]

    # Drop rows that cannot be priced sensibly.
    df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]

    # Quick heuristic: neighborhood = text after the last comma of the address.
    if "address" in df.columns:
        df["neighborhood"] = (
            df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
        )
    else:
        df["neighborhood"] = "Aalborg"

    if "description" not in df.columns:
        df["description"] = ""
    return df


df = load_data("apt.csv")

st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")

tab_overview, tab_search, tab_fair, tab_network = st.tabs(
    ["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)

with tab_overview:
    st.subheader("Dataset Snapshot")
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Listings", len(df))
    c2.metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
    c3.metric("Median Size (m²)", int(df["size_m2"].median()))
    c4.metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
    # st.map needs 'latitude'/'longitude' columns; skip if coordinates absent.
    if {"lat", "lon"}.issubset(df.columns):
        st.map(df.rename(columns={"lat": "latitude", "lon": "longitude"})[
            ["latitude", "longitude"]].dropna())
    st.dataframe(df[["title", "address", "type", "rooms", "size_m2",
                     "rent_dkk", "rent_per_m2"]].head(30))

with tab_search:
    st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
    pref = st.text_area(
        "Describe your ideal place (English or Danish):", height=100,
        value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed",
    )
    max_rent = st.slider("Max monthly rent (DKK)", 2000,
                         int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
    min_size = st.slider("Minimum size (m²)", 10,
                         int(df["size_m2"].quantile(0.99)), 50, step=5)
    # FIX: int(np.nanmax(...)) raises when "rooms" is present but all-NaN;
    # only use the column max when at least one value exists.
    if "rooms" in df.columns and df["rooms"].notna().any():
        rooms_cap = int(np.nanmax(df["rooms"]))
    else:
        rooms_cap = 5
    rooms = st.slider("Min rooms", 1, rooms_cap, 2, step=1)

    filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
    if "rooms" in filter_df.columns:
        filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]

    corpus = filter_df["description"].fillna("").tolist()
    # FIX: TfidfVectorizer.fit_transform raises ValueError on an empty corpus
    # or an all-blank one ("empty vocabulary") — show a message instead of
    # crashing the app when the filters exclude everything.
    if not corpus or not any(doc.strip() for doc in corpus):
        st.warning("No listings with descriptions match the current filters.")
    else:
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        X = vectorizer.fit_transform(corpus)
        q = vectorizer.transform([pref])
        sims = cosine_similarity(q, X).ravel()
        filter_df = (filter_df.assign(similarity=sims)
                     .sort_values("similarity", ascending=False).head(50))
        st.write("Top matches:")
        st.dataframe(filter_df[["title", "address", "rent_dkk", "size_m2",
                                "rooms", "similarity"]])
        if {"lat", "lon"}.issubset(filter_df.columns):
            st.map(filter_df.rename(columns={"lat": "latitude", "lon": "longitude"})[
                ["latitude", "longitude"]].dropna())

with tab_fair:
    st.subheader("Estimate a 'fair rent' and spot outliers")
    # FIX: the original feature list included rent_per_m2, which is derived
    # directly from the target (rent_dkk / size_m2 in load_data) — textbook
    # target leakage that lets the model trivially reproduce the actual rent
    # and makes the under/over-priced deltas meaningless.
    features = ["size_m2", "rooms", "floor", "furnished", "pets_allowed", "elevator"]
    cat = ["type", "neighborhood"]
    num_cols = [c for c in features if c in df.columns]
    cat_cols = [c for c in cat if c in df.columns]
    data = df[num_cols + cat_cols + ["rent_dkk"]].dropna()
    X = data.drop(columns=["rent_dkk"])
    y = data["rent_dkk"]

    # Scale numerics, one-hot the categoricals; unseen categories at predict
    # time are ignored rather than raising.
    pre = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ])
    model = Pipeline([("pre", pre), ("lin", LinearRegression())])

    if len(data) > 50:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(Xtr, ytr)
        preds = model.predict(Xte)
        mae = np.mean(np.abs(preds - yte))
        st.metric("Validation MAE (DKK)", int(mae))

        # FIX: drop NaN on *all* model inputs — the original only dropped the
        # numeric ones, so a NaN category could break the one-hot transform.
        df_pred = df.dropna(subset=num_cols + cat_cols).copy()
        df_pred["pred_rent"] = model.predict(df_pred[num_cols + cat_cols])
        df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
        show_cols = ["title", "address", "rent_dkk", "pred_rent", "delta",
                     "size_m2", "rooms"]
        st.write("Potentially underpriced (negative delta):")
        st.dataframe(df_pred.sort_values("delta").head(20)[show_cols])
        st.write("Potentially overpriced (positive delta):")
        st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[show_cols])
    else:
        st.info("Not enough data to train a model.")

with tab_network:
    st.subheader("Amenity Co‑occurrence Network")
    # Treat the known flag columns plus any other {0,1}-valued column
    # (excluding rooms/floor) as amenities.
    amenity_cols = []
    for c in df.columns:
        if c.lower() in ["furnished", "shared_friendly", "pets_allowed", "elevator"]:
            amenity_cols.append(c)
        else:
            vals = df[c].dropna()
            # FIX: .isin([0,1]).all() is vacuously True on an all-NaN column,
            # so the original misclassified empty columns as amenities.
            if (len(vals) > 0 and vals.isin([0, 1]).all()
                    and df[c].nunique() <= 3 and c not in ["rooms", "floor"]):
                amenity_cols.append(c)
    amenity_cols = list(dict.fromkeys(amenity_cols))  # unique, preserve order
    st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")

    if amenity_cols:
        # Count how often each (ordered) pair of amenities is active on the
        # same listing.
        edge_counter = Counter()
        for _, row in df[amenity_cols].fillna(0).iterrows():
            active = [amenity_cols[i] for i, v in enumerate(row.values) if v == 1]
            for i in range(len(active)):
                for j in range(i + 1, len(active)):
                    edge_counter[(active[i], active[j])] += 1

        G = nx.Graph()
        for (u, v), w in edge_counter.items():
            if w >= 5:  # only keep frequent co-occurrences
                G.add_edge(u, v, weight=w)

        if G.number_of_edges() == 0:
            st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
        else:
            pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
            fig = plt.figure(figsize=(7, 5))
            weights = [G[u][v]["weight"] for u, v in G.edges()]
            nx.draw(G, pos, with_labels=True, node_size=1200,
                    width=[w / 200 for w in weights])
            st.pyplot(fig)
            st.write("Edges weighted by how often amenities appear together in listings.")
    else:
        st.info("No amenity-like columns detected.")

st.sidebar.header("About this POC")
st.sidebar.markdown(
    """
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** + **Network analysis (amenity co‑occurrence)** can help **renters** discover good deals and **landlords** benchmark prices.

**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)