|
|
|
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import OneHotEncoder, StandardScaler |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.linear_model import LinearRegression |
|
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
|
import networkx as nx |
|
|
import matplotlib.pyplot as plt |
|
|
|
|
|
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")
|
|
|
|
|
@st.cache_data
def load_data(path):
    """Load and clean the listings CSV.

    Parameters
    ----------
    path : str
        Path to the CSV file of scraped Boligportal listings.

    Returns
    -------
    pd.DataFrame
        Cleaned frame with numeric price/size columns, 0/1 amenity flags,
        a derived ``rent_per_m2`` column, a ``neighborhood`` column and a
        guaranteed ``description`` column.  Rows with non-positive rent or
        size, or a non-finite rent/m2, are dropped.

    Raises
    ------
    ValueError
        If the CSV provides neither ``rent_dkk``/``rent_raw`` nor ``size_m2``.
    """
    df = pd.read_csv(path)

    # Fall back to the raw scrape column when the cleaned one is absent.
    if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
        df["rent_dkk"] = df["rent_raw"]

    # Fail fast with a clear message instead of an opaque KeyError below.
    missing = [c for c in ("rent_dkk", "size_m2") if c not in df.columns]
    if missing:
        raise ValueError(f"Input CSV is missing required column(s): {missing}")

    # Coerce numeric columns; malformed cells become NaN rather than raising.
    num_cols = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor", "deposit_dkk", "move_in_price", "lat", "lon"]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Normalize boolean-ish amenity flags to clean 0/1 ints (NaN -> 0).
    boolish = ["furnished", "shared_friendly", "pets_allowed", "elevator"]
    for b in boolish:
        if b in df.columns:
            df[b] = df[b].astype("float").fillna(0.0).astype(int)

    # Derive rent per square meter where it is missing or absent entirely.
    if "rent_per_m2" in df.columns:
        df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
    else:
        df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]

    # Keep only rows that can be priced sensibly.
    df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]

    # Neighborhood = last comma-separated token of the address, else "Aalborg".
    if "address" in df.columns:
        df["neighborhood"] = df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
    else:
        df["neighborhood"] = "Aalborg"

    # Downstream NLP code expects a description column to always exist.
    if "description" not in df.columns:
        df["description"] = ""
    return df
|
|
|
|
|
# Load the cleaned dataset once (cached by Streamlit across reruns).
df = load_data("apt.csv")

st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")

# Four tabs: dataset overview, NLP preference search, fair-price model,
# and the amenity co-occurrence network.
tab_overview, tab_search, tab_fair, tab_network = st.tabs(
    ["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)
|
|
|
|
|
with tab_overview:
    # Headline metrics, a map of listing locations, and a sample of the data.
    st.subheader("Dataset Snapshot")
    metric_cols = st.columns(4)
    metric_cols[0].metric("Listings", len(df))
    metric_cols[1].metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
    metric_cols[2].metric("Median Size (m²)", int(df["size_m2"].median()))
    metric_cols[3].metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
    # st.map requires columns named latitude/longitude.
    coords = df.rename(columns={"lat": "latitude", "lon": "longitude"})
    st.map(coords[["latitude", "longitude"]].dropna())
    preview_cols = ["title", "address", "type", "rooms", "size_m2", "rent_dkk", "rent_per_m2"]
    st.dataframe(df[preview_cols].head(30))
|
|
|
|
|
with tab_search:
    # Rank the filtered listings by TF-IDF cosine similarity between the
    # user's free-text preferences and each listing's description.
    st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
    pref = st.text_area("Describe your ideal place (English or Danish):", height=100,
                        value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed")
    max_rent = st.slider("Max monthly rent (DKK)", 2000, int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
    min_size = st.slider("Minimum size (m²)", 10, int(df["size_m2"].quantile(0.99)), 50, step=5)
    rooms = st.slider("Min rooms", 1, int(np.nanmax(df["rooms"])) if "rooms" in df.columns else 5, 2, step=1)

    # Hard filters first; similarity only ranks what survives them.
    filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
    if "rooms" in filter_df.columns:
        filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]

    corpus = filter_df["description"].fillna("").tolist()
    # Guard: TfidfVectorizer raises "empty vocabulary" on an empty corpus or
    # all-blank descriptions, which previously crashed the whole app.
    if not any(doc.strip() for doc in corpus):
        st.info("No listings with descriptions match the current filters — relax them and try again.")
    else:
        # NOTE: stop_words="english" strips only English stop words; Danish
        # descriptions are matched on raw tokens.
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        X = vectorizer.fit_transform(corpus)
        q = vectorizer.transform([pref])
        sims = cosine_similarity(q, X).ravel()
        filter_df = filter_df.assign(similarity=sims).sort_values("similarity", ascending=False).head(50)
        st.write("Top matches:")
        st.dataframe(filter_df[["title","address","rent_dkk","size_m2","rooms","similarity"]])
        st.map(filter_df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
|
|
|
|
|
with tab_fair:
    st.subheader("Estimate a 'fair rent' and spot outliers")

    # Feature set for the linear "fair rent" model.
    # NOTE: rent_per_m2 is deliberately EXCLUDED — it equals
    # rent_dkk / size_m2, so using it as a feature leaks the target and
    # makes the model (and its under/over-priced deltas) trivial.
    features = ["size_m2","rooms","floor","furnished","pets_allowed","elevator"]
    cat = ["type","neighborhood"]
    use_cols = [c for c in features if c in df.columns] + [c for c in cat if c in df.columns] + ["rent_dkk"]
    data = df[use_cols].dropna()
    X = data.drop(columns=["rent_dkk"])
    y = data["rent_dkk"]

    # Scale numerics, one-hot the categoricals; unseen categories at predict
    # time are ignored rather than raising.
    num_cols = [c for c in X.columns if c not in cat]
    cat_present = [c for c in cat if c in X.columns]
    pre = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_present)
    ])
    model = Pipeline([("pre", pre), ("lin", LinearRegression())])
    if len(data) > 50:
        # Hold out 20% to report an honest validation error.
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(Xtr, ytr)
        preds = model.predict(Xte)
        mae = np.mean(np.abs(preds - yte))
        st.metric("Validation MAE (DKK)", int(mae))

        # Score every listing; drop rows with missing model inputs (numeric
        # AND categorical — OneHotEncoder cannot encode NaN categories).
        df_pred = df.dropna(subset=num_cols + cat_present).copy()
        df_pred["pred_rent"] = model.predict(df_pred[num_cols + cat_present])
        # delta < 0: listed below the model's fair price; delta > 0: above it.
        df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
        st.write("Potentially underpriced (negative delta):")
        st.dataframe(df_pred.sort_values("delta").head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
        st.write("Potentially overpriced (positive delta):")
        st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
    else:
        st.info("Not enough data to train a model.")
|
|
|
|
|
with tab_network:
    st.subheader("Amenity Co‑occurrence Network")

    # Collect boolean-ish amenity columns: the known flags plus any other
    # 0/1 column (excluding count-like columns such as rooms/floor).
    amenity_cols = []
    for c in df.columns:
        if c.lower() in ["furnished","shared_friendly","pets_allowed","elevator"]:
            amenity_cols.append(c)
        else:
            vals = df[c].dropna()
            # Require non-empty data: an all-NaN column would pass the
            # .isin([0,1]).all() check vacuously (all() of an empty series
            # is True) and be misdetected as an amenity.
            if len(vals) > 0 and vals.isin([0,1]).all() and df[c].nunique() <= 3 and c not in ["rooms","floor"]:
                amenity_cols.append(c)
    amenity_cols = list(dict.fromkeys(amenity_cols))  # dedupe, keep order
    st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")
    if amenity_cols:
        from collections import Counter
        from itertools import combinations

        # Count how often each pair of amenities is active in the same listing.
        MIN_EDGE_WEIGHT = 5  # hide rare co-occurrences to keep the graph readable
        edge_counter = Counter()
        for _, row in df[amenity_cols].fillna(0).iterrows():
            active = [amenity_cols[i] for i, v in enumerate(row.values) if v == 1 or v == True]
            edge_counter.update(combinations(active, 2))

        G = nx.Graph()
        for (u, v), w in edge_counter.items():
            if w >= MIN_EDGE_WEIGHT:
                G.add_edge(u, v, weight=w)
        if G.number_of_edges() == 0:
            st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
        else:
            # Fixed seed keeps the layout stable across Streamlit reruns.
            pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
            plt.figure(figsize=(7,5))
            weights = [G[u][v]['weight'] for u, v in G.edges()]
            nx.draw(G, pos, with_labels=True, node_size=1200, width=[w/200 for w in weights])
            st.pyplot(plt.gcf())
            st.write("Edges weighted by how often amenities appear together in listings.")
    else:
        st.info("No amenity-like columns detected.")
|
|
|
|
|
# Sidebar: short explanation of the POC and how to run it locally.
st.sidebar.header("About this POC")
about_md = """
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** +
**Network analysis (amenity co‑occurrence)** can help **renters** discover good deals
and **landlords** benchmark prices.

**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
st.sidebar.markdown(about_md)
|
|
|