File size: 8,190 Bytes
74bf8e2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt
# Must be the first Streamlit call in the script (Streamlit requirement).
st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")
@st.cache_data
def load_data(path):
    """Load and clean the rental-listings CSV at *path*.

    Returns a DataFrame guaranteed to contain numeric ``rent_dkk``,
    ``size_m2`` and ``rent_per_m2`` columns plus ``neighborhood`` and
    ``description`` columns, with non-positive / non-finite rows dropped.
    Cached by Streamlit so reruns do not re-read the file.

    Raises:
        ValueError: if the CSV lacks the rent or size columns needed downstream.
    """
    df = pd.read_csv(path)
    # Fall back to the raw scrape column when the cleaned rent is absent.
    if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
        df["rent_dkk"] = df["rent_raw"]
    # Coerce expected numeric columns; malformed values become NaN instead of raising.
    num_cols = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor",
                "deposit_dkk", "move_in_price", "lat", "lon"]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    # Normalise flag columns to 0/1 ints. to_numeric(errors="coerce") is safer
    # than astype("float"), which would raise on scraped strings like "yes".
    boolish = ["furnished", "shared_friendly", "pets_allowed", "elevator"]
    for b in boolish:
        if b in df.columns:
            df[b] = pd.to_numeric(df[b], errors="coerce").fillna(0.0).astype(int)
    # Every tab depends on these two columns; fail loudly with a clear message
    # instead of a KeyError deep inside a later expression.
    if "rent_dkk" not in df.columns or "size_m2" not in df.columns:
        raise ValueError("CSV must contain 'rent_dkk' (or 'rent_raw') and 'size_m2' columns")
    # Derive rent per m² where missing (division by a zero size yields inf,
    # which the finiteness filter below removes).
    derived = df["rent_dkk"] / df["size_m2"]
    if "rent_per_m2" in df.columns:
        df["rent_per_m2"] = df["rent_per_m2"].fillna(derived)
    else:
        df["rent_per_m2"] = derived
    # Drop impossible rows.
    df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]
    # Cheap neighborhood heuristic: last comma-separated token of the address.
    if "address" in df.columns:
        df["neighborhood"] = (
            df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
        )
    else:
        df["neighborhood"] = "Aalborg"
    # The NLP tab assumes a description column exists.
    if "description" not in df.columns:
        df["description"] = ""
    return df
# Load the dataset once (cached by load_data) and build the page scaffold.
df = load_data("apt.csv")
st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")
# One tab per analysis: summary stats, text matching, price model, amenity graph.
tab_overview, tab_search, tab_fair, tab_network = st.tabs(
    ["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)
with tab_overview:
    st.subheader("Dataset Snapshot")
    # Headline metrics: listing count plus median rent / size / price density.
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Listings", len(df))
    c2.metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
    c3.metric("Median Size (m²)", int(df["size_m2"].median()))
    c4.metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
    # st.map requires 'latitude'/'longitude' column names; drop rows without coords.
    st.map(df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
    st.dataframe(df[["title","address","type","rooms","size_m2","rent_dkk","rent_per_m2"]].head(30))
with tab_search:
    st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
    pref = st.text_area("Describe your ideal place (English or Danish):", height=100,
        value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed")
    max_rent = st.slider("Max monthly rent (DKK)", 2000, int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
    min_size = st.slider("Minimum size (m²)", 10, int(df["size_m2"].quantile(0.99)), 50, step=5)
    rooms = st.slider("Min rooms", 1, int(np.nanmax(df["rooms"])) if "rooms" in df.columns else 5, 2, step=1)
    # Hard filters first, text-similarity ranking second.
    filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
    if "rooms" in filter_df.columns:
        filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]
    corpus = filter_df["description"].fillna("").tolist()
    # TfidfVectorizer raises ValueError on an empty corpus or an all-blank /
    # all-stop-word vocabulary; guard so strict filters show a friendly
    # message instead of a traceback.
    if not corpus or not any(doc.strip() for doc in corpus):
        st.warning("No listings with descriptions match your filters — try relaxing them.")
    else:
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        try:
            X = vectorizer.fit_transform(corpus)
        except ValueError:
            # Remaining descriptions consisted solely of stop words.
            st.warning("Descriptions in the filtered listings contain no usable words.")
        else:
            q = vectorizer.transform([pref])
            sims = cosine_similarity(q, X).ravel()
            # Rank by similarity to the user's free-text preference.
            filter_df = filter_df.assign(similarity=sims).sort_values("similarity", ascending=False).head(50)
            st.write("Top matches:")
            st.dataframe(filter_df[["title","address","rent_dkk","size_m2","rooms","similarity"]])
            st.map(filter_df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
with tab_fair:
    st.subheader("Estimate a 'fair rent' and spot outliers")
    # Candidate predictors. NOTE: rent_per_m2 is deliberately EXCLUDED — it is
    # rent_dkk / size_m2, so using it as a feature leaks the target into the
    # model and makes every prediction (and the under/over-priced deltas
    # below) trivially near-perfect and meaningless.
    features = ["size_m2","rooms","floor","furnished","pets_allowed","elevator"]
    cat = ["type","neighborhood"]
    cat_cols = [c for c in cat if c in df.columns]
    use_cols = [c for c in features if c in df.columns] + cat_cols + ["rent_dkk"]
    data = df[use_cols].dropna()
    X = data.drop(columns=["rent_dkk"])
    y = data["rent_dkk"]
    # Scale numerics, one-hot the categoricals; categories unseen at fit time
    # are encoded as all-zeros instead of raising.
    num_cols = [c for c in X.columns if c not in cat]
    pre = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ])
    model = Pipeline([("pre", pre), ("lin", LinearRegression())])
    if len(data) > 50:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(Xtr, ytr)
        preds = model.predict(Xte)
        mae = np.mean(np.abs(preds - yte))
        st.metric("Validation MAE (DKK)", int(mae))
        # Score every listing that has all required feature values (including
        # categoricals and the asking rent, so deltas are never NaN), then rank
        # by the gap between asking rent and predicted "fair" rent.
        df_pred = df.dropna(subset=num_cols + cat_cols + ["rent_dkk"]).copy()
        df_pred["pred_rent"] = model.predict(df_pred[num_cols + cat_cols])
        df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
        show_cols = ["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]
        st.write("Potentially underpriced (negative delta):")
        st.dataframe(df_pred.sort_values("delta").head(20)[show_cols])
        st.write("Potentially overpriced (positive delta):")
        st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[show_cols])
    else:
        st.info("Not enough data to train a model.")
with tab_network:
    st.subheader("Amenity Co‑occurrence Network")
    from collections import Counter

    # Treat the known flag columns, plus any other binary-looking column, as
    # amenities.
    known_flags = {"furnished", "shared_friendly", "pets_allowed", "elevator"}
    amenity_cols = []
    for c in df.columns:
        if c.lower() in known_flags:
            amenity_cols.append(c)
        else:
            vals = df[c].dropna()
            # Require at least one non-null value: Series.all() is vacuously
            # True on an empty Series, which previously classified all-NaN
            # columns as amenities.
            if (not vals.empty and vals.isin([0, 1]).all()
                    and df[c].nunique() <= 3 and c not in ("rooms", "floor")):
                amenity_cols.append(c)
    amenity_cols = list(dict.fromkeys(amenity_cols))  # dedupe, preserve order
    st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")
    if amenity_cols:
        MIN_COOCCURRENCE = 5  # keep only edges observed at least this often
        # Count how often each pair of amenities is active in the same listing.
        edge_counter = Counter()
        for _, row in df[amenity_cols].fillna(0).iterrows():
            active = [col for col, v in row.items() if v == 1]
            for i in range(len(active)):
                for j in range(i + 1, len(active)):
                    edge_counter[(active[i], active[j])] += 1
        G = nx.Graph()
        for (u, v), w in edge_counter.items():
            if w >= MIN_COOCCURRENCE:
                G.add_edge(u, v, weight=w)
        if G.number_of_edges() == 0:
            st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
        else:
            pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
            plt.figure(figsize=(7, 5))
            weights = [G[u][v]["weight"] for u, v in G.edges()]
            # Edge width scales with co-occurrence count.
            nx.draw(G, pos, with_labels=True, node_size=1200, width=[w / 200 for w in weights])
            st.pyplot(plt.gcf())
            st.write("Edges weighted by how often amenities appear together in listings.")
    else:
        st.info("No amenity-like columns detected.")
# Sidebar: static project blurb and local run instructions (markdown string is
# user-facing content and is kept verbatim).
st.sidebar.header("About this POC")
st.sidebar.markdown(
"""
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** +
**Network analysis (amenity co‑occurrence)** can help **renters** discover good deals
and **landlords** benchmark prices.
**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)
|