File size: 8,190 Bytes
74bf8e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt

# Page configuration: must be the first Streamlit call in the script.
st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")

@st.cache_data
def load_data(path):
    """Load and clean the Boligportal listings CSV.

    Parameters
    ----------
    path : str
        Path to the listings CSV file.

    Returns
    -------
    pd.DataFrame
        Cleaned frame: numeric rent/size columns, 0/1 integer amenity
        flags, a derived ``rent_per_m2`` column, rows with non-positive
        rent/size removed, and a ``neighborhood`` column heuristically
        extracted from the address.
    """
    df = pd.read_csv(path)
    # Some exports name the rent column differently; normalize it.
    if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
        df["rent_dkk"] = df["rent_raw"]
    # Coerce the expected numeric columns; unparsable cells become NaN.
    num_cols = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor", "deposit_dkk", "move_in_price", "lat", "lon"]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    # Amenity flags -> 0/1 ints. pd.to_numeric(errors="coerce") is safer than
    # astype("float"), which raises on string cells like "yes"/"no";
    # unparsable values are treated as "amenity absent" (0).
    boolish = ["furnished", "shared_friendly", "pets_allowed", "elevator"]
    for b in boolish:
        if b in df.columns:
            df[b] = pd.to_numeric(df[b], errors="coerce").fillna(0.0).astype(int)
    # Derive rent_per_m2 where missing (size 0 yields inf, dropped below).
    if "rent_per_m2" in df.columns:
        df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
    else:
        df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]
    # Drop rows with non-positive rent/size or a non-finite derived ratio.
    df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]
    # Heuristic: last comma-separated token of the address is the neighborhood.
    if "address" in df.columns:
        df["neighborhood"] = df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
    else:
        df["neighborhood"] = "Aalborg"
    # Guarantee a text column for the downstream TF-IDF tab.
    if "description" not in df.columns:
        df["description"] = ""
    return df

# Load the cleaned dataset once (result is cached by Streamlit).
df = load_data("apt.csv")

st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")

# One tab per analysis: overview stats, NLP search, price model, amenity graph.
_TAB_LABELS = ["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
tab_overview, tab_search, tab_fair, tab_network = st.tabs(_TAB_LABELS)

with tab_overview:
    # Headline medians, a location map, and a sample of the raw table.
    st.subheader("Dataset Snapshot")
    median_rent = int(df["rent_dkk"].median())
    median_size = int(df["size_m2"].median())
    median_ppm2 = round(df["rent_per_m2"].median(), 1)
    col_a, col_b, col_c, col_d = st.columns(4)
    col_a.metric("Listings", len(df))
    col_b.metric("Median Rent (DKK)", median_rent)
    col_c.metric("Median Size (m²)", median_size)
    col_d.metric("Median Rent/m²", median_ppm2)
    # st.map expects latitude/longitude column names.
    geo = df.rename(columns={"lat": "latitude", "lon": "longitude"})
    st.map(geo[["latitude", "longitude"]].dropna())
    preview_cols = ["title", "address", "type", "rooms", "size_m2", "rent_dkk", "rent_per_m2"]
    st.dataframe(df[preview_cols].head(30))

with tab_search:
    # Rank listings by TF-IDF cosine similarity between the user's free-text
    # preference and each listing description, after hard filters on rent,
    # size, and room count.
    st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
    pref = st.text_area("Describe your ideal place (English or Danish):", height=100,
                        value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed")
    max_rent = st.slider("Max monthly rent (DKK)", 2000, int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
    min_size = st.slider("Minimum size (m²)", 10, int(df["size_m2"].quantile(0.99)), 50, step=5)
    # Guard: np.nanmax raises on an all-NaN (or missing) rooms column.
    if "rooms" in df.columns and df["rooms"].notna().any():
        max_rooms = int(np.nanmax(df["rooms"]))
    else:
        max_rooms = 5
    rooms = st.slider("Min rooms", 1, max_rooms, 2, step=1)
    filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
    if "rooms" in filter_df.columns:
        filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]
    if filter_df.empty:
        st.info("No listings match the current filters.")
    else:
        # TF-IDF over descriptions of the filtered listings.
        corpus = filter_df["description"].fillna("").tolist()
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        try:
            X = vectorizer.fit_transform(corpus)
            q = vectorizer.transform([pref])
            sims = cosine_similarity(q, X).ravel()
        except ValueError:
            # All descriptions empty / only stop words -> "empty vocabulary";
            # fall back to zero similarity instead of crashing the tab.
            sims = np.zeros(len(filter_df))
        filter_df = filter_df.assign(similarity=sims).sort_values("similarity", ascending=False).head(50)
        st.write("Top matches:")
        st.dataframe(filter_df[["title","address","rent_dkk","size_m2","rooms","similarity"]])
        st.map(filter_df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())

with tab_fair:
    # Train a linear model on listing attributes, then flag listings whose
    # actual rent deviates most from the model's prediction.
    st.subheader("Estimate a 'fair rent' and spot outliers")
    # NOTE: rent_per_m2 is deliberately excluded — it equals rent_dkk/size_m2,
    # i.e. it is derived from the target and leaks the answer into the features.
    features = ["size_m2","rooms","floor","furnished","pets_allowed","elevator"]
    cat = ["type","neighborhood"]
    use_cols = [c for c in features if c in df.columns] + [c for c in cat if c in df.columns] + ["rent_dkk"]
    data = df[use_cols].dropna()
    X = data.drop(columns=["rent_dkk"])
    y = data["rent_dkk"]
    # Preprocess: scale numerics, one-hot categoricals (unknown categories at
    # predict time are ignored rather than raising).
    num_cols = [c for c in X.columns if c not in cat]
    cat_cols = [c for c in cat if c in X.columns]
    pre = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ])
    model = Pipeline([("pre", pre), ("lin", LinearRegression())])
    if len(data) > 50:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(Xtr, ytr)
        preds = model.predict(Xte)
        mae = np.mean(np.abs(preds - yte))
        st.metric("Validation MAE (DKK)", int(mae))
        # Score all listings. Drop rows missing ANY model input — numeric or
        # categorical — so the encoder never receives NaN.
        df_pred = df.dropna(subset=num_cols + cat_cols).copy()
        df_pred["pred_rent"] = model.predict(df_pred[num_cols + cat_cols])
        df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
        st.write("Potentially underpriced (negative delta):")
        st.dataframe(df_pred.sort_values("delta").head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
        st.write("Potentially overpriced (positive delta):")
        st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
    else:
        st.info("Not enough data to train a model.")
    
with tab_network:
    # Graph whose nodes are amenities and whose edges count how often two
    # amenities appear together in the same listing.
    st.subheader("Amenity Co‑occurrence Network")
    # Detect amenity columns: known flag names plus any 0/1-valued column.
    amenity_cols = []
    for c in df.columns:
        if c.lower() in ["furnished","shared_friendly","pets_allowed","elevator"]:
            amenity_cols.append(c)
        else:
            vals = df[c].dropna()
            # Require at least one non-null value: Series.all() on an empty
            # Series is vacuously True, which would misclassify an all-NaN
            # column as an amenity flag.
            if len(vals) > 0 and vals.isin([0, 1]).all() and df[c].nunique() <= 3 and c not in ["rooms","floor"]:
                amenity_cols.append(c)
    amenity_cols = list(dict.fromkeys(amenity_cols))  # unique preserve order
    st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")
    if amenity_cols:
        from collections import Counter
        from itertools import combinations
        # Count co-occurring amenity pairs per listing. (v == 1 also matches
        # True, since True == 1 in Python.)
        edge_counter = Counter()
        for _, row in df[amenity_cols].fillna(0).iterrows():
            active = [amenity_cols[i] for i, v in enumerate(row.values) if v == 1]
            for a, b in combinations(active, 2):
                edge_counter[(a, b)] += 1
        G = nx.Graph()
        for (u, v), w in edge_counter.items():
            if w >= 5:  # only keep frequent co-occurrences
                G.add_edge(u, v, weight=w)
        if G.number_of_edges() == 0:
            st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
        else:
            pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
            plt.figure(figsize=(7,5))
            weights = [G[u][v]['weight'] for u, v in G.edges()]
            nx.draw(G, pos, with_labels=True, node_size=1200, width=[w/200 for w in weights])
            st.pyplot(plt.gcf())
            st.write("Edges weighted by how often amenities appear together in listings.")
    else:
        st.info("No amenity-like columns detected.")
    
# Static sidebar copy: what the POC demonstrates and how to run it locally.
# (Markdown string is user-facing text — do not reword casually.)
st.sidebar.header("About this POC")
st.sidebar.markdown(
    """
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** + 
**Network analysis (amenity co‑occurrence)** can help **renters** discover good deals 
and **landlords** benchmark prices.

**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)