# NLP_exam / app_apt.py
# Uploaded by soysouce (commit 74bf8e2, verified)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt
st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")
@st.cache_data
def load_data(path):
df = pd.read_csv(path)
# Basic cleaning
if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
df["rent_dkk"] = df["rent_raw"]
# Ensure numerics
num_cols = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor", "deposit_dkk", "move_in_price", "lat", "lon"]
for c in num_cols:
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
# Fill some booleans if present
boolish = ["furnished", "shared_friendly", "pets_allowed", "elevator"]
for b in boolish:
if b in df.columns:
df[b] = df[b].astype("float").fillna(0.0).astype(int)
# Derive rent_per_m2 if missing
if "rent_per_m2" in df.columns:
df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
else:
df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]
# Drop crazy values
df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]
# Neighborhood quick heuristic from address
if "address" in df.columns:
df["neighborhood"] = df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
else:
df["neighborhood"] = "Aalborg"
# Description text
if "description" not in df.columns:
df["description"] = ""
return df
df = load_data("apt.csv")
st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")
tab_overview, tab_search, tab_fair, tab_network = st.tabs(
["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)
with tab_overview:
st.subheader("Dataset Snapshot")
c1, c2, c3, c4 = st.columns(4)
c1.metric("Listings", len(df))
c2.metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
c3.metric("Median Size (m²)", int(df["size_m2"].median()))
c4.metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
st.map(df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
st.dataframe(df[["title","address","type","rooms","size_m2","rent_dkk","rent_per_m2"]].head(30))
with tab_search:
st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
pref = st.text_area("Describe your ideal place (English or Danish):", height=100,
value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed")
max_rent = st.slider("Max monthly rent (DKK)", 2000, int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
min_size = st.slider("Minimum size (m²)", 10, int(df["size_m2"].quantile(0.99)), 50, step=5)
rooms = st.slider("Min rooms", 1, int(np.nanmax(df["rooms"])) if "rooms" in df.columns else 5, 2, step=1)
filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
if "rooms" in filter_df.columns:
filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]
# TF-IDF over descriptions
corpus = filter_df["description"].fillna("").tolist()
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(corpus)
q = vectorizer.transform([pref])
sims = cosine_similarity(q, X).ravel()
filter_df = filter_df.assign(similarity=sims).sort_values("similarity", ascending=False).head(50)
st.write("Top matches:")
st.dataframe(filter_df[["title","address","rent_dkk","size_m2","rooms","similarity"]])
st.map(filter_df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
with tab_fair:
st.subheader("Estimate a 'fair rent' and spot outliers")
# Features
features = ["size_m2","rooms","floor","furnished","pets_allowed","elevator","rent_per_m2"]
cat = ["type","neighborhood"]
use_cols = [c for c in features if c in df.columns] + [c for c in cat if c in df.columns] + ["rent_dkk"]
data = df[use_cols].dropna()
X = data.drop(columns=["rent_dkk"])
y = data["rent_dkk"]
# Preprocess
num_cols = [c for c in X.columns if c not in cat]
pre = ColumnTransformer([
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), [c for c in cat if c in X.columns])
])
model = Pipeline([("pre", pre), ("lin", LinearRegression())])
if len(data) > 50:
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(Xtr, ytr)
preds = model.predict(Xte)
mae = np.mean(np.abs(preds - yte))
st.metric("Validation MAE (DKK)", int(mae))
# Score all listings
df_pred = df.copy()
df_pred = df_pred.dropna(subset=num_cols)
df_pred["pred_rent"] = model.predict(df_pred[num_cols + [c for c in cat if c in df_pred.columns]])
df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
st.write("Potentially underpriced (negative delta):")
st.dataframe(df_pred.sort_values("delta").head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
st.write("Potentially overpriced (positive delta):")
st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
else:
st.info("Not enough data to train a model.")
with tab_network:
st.subheader("Amenity Co‑occurrence Network")
# pick boolean/flag columns as amenities
amenity_cols = []
for c in df.columns:
if c.lower() in ["furnished","shared_friendly","pets_allowed","elevator"]:
amenity_cols.append(c)
elif df[c].dropna().isin([0,1]).all() and df[c].nunique()<=3 and c not in ["rooms","floor"]:
amenity_cols.append(c)
amenity_cols = list(dict.fromkeys(amenity_cols)) # unique preserve order
st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")
if amenity_cols:
# build co-occurrence matrix
A = []
for _, row in df[amenity_cols].fillna(0).iterrows():
active = [amenity_cols[i] for i, v in enumerate(row.values) if v==1 or v==True]
A.append(active)
# edges: pair counts
from collections import Counter
edge_counter = Counter()
for acts in A:
for i in range(len(acts)):
for j in range(i+1, len(acts)):
edge_counter[(acts[i], acts[j])] += 1
G = nx.Graph()
for (u,v), w in edge_counter.items():
if w >= 5: # only keep frequent co-occurrences
G.add_edge(u, v, weight=w)
if G.number_of_edges() == 0:
st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
else:
pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
plt.figure(figsize=(7,5))
weights = [G[u][v]['weight'] for u,v in G.edges()]
nx.draw(G, pos, with_labels=True, node_size=1200, width=[w/200 for w in weights])
st.pyplot(plt.gcf())
st.write("Edges weighted by how often amenities appear together in listings.")
else:
st.info("No amenity-like columns detected.")
st.sidebar.header("About this POC")
st.sidebar.markdown(
"""
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** +
**Network analysis (amenity co‑occurrence)** can help **renters** discover good deals
and **landlords** benchmark prices.
**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)