"""Aalborg Rentals — rapid POC.

Streamlit app combining three techniques on Boligportal listing data:

* **NLP** — TF-IDF + cosine similarity to match a free-text preference
  description against listing descriptions.
* **ML** — a linear model estimating a "fair rent" to surface listings that
  look under- or over-priced.
* **Network analysis** — a co-occurrence graph of amenity flags.

Run with ``streamlit run app.py``; expects ``apt.csv`` next to this file.
"""
from collections import Counter

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt

st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")


@st.cache_data
def load_data(path: str) -> pd.DataFrame:
    """Load the listings CSV and apply light cleaning.

    Steps:
      * fall back to ``rent_raw`` when ``rent_dkk`` is absent;
      * coerce the expected numeric columns (junk becomes NaN);
      * normalise boolean-ish flag columns to 0/1 ints (NaN -> 0);
      * derive ``rent_per_m2`` where missing;
      * drop rows with non-positive rent/size or non-finite rent_per_m2;
      * heuristically extract ``neighborhood`` from the last comma-separated
        part of the address, defaulting to "Aalborg";
      * guarantee a ``description`` column exists.
    """
    df = pd.read_csv(path)

    # Some scrapes only carry the raw rent column.
    if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
        df["rent_dkk"] = df["rent_raw"]

    numeric_candidates = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor",
                          "deposit_dkk", "move_in_price", "lat", "lon"]
    for c in numeric_candidates:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Flag columns: missing is treated as "no/unknown" -> 0.
    for b in ["furnished", "shared_friendly", "pets_allowed", "elevator"]:
        if b in df.columns:
            df[b] = df[b].astype("float").fillna(0.0).astype(int)

    # NOTE: rent_per_m2 is derived from the target rent — it must never be
    # fed back into the fair-price model (see tab_fair below).
    if "rent_per_m2" in df.columns:
        df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
    else:
        df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]

    # Drop rows that cannot be priced sensibly.
    df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]

    # Quick heuristic: neighborhood = text after the last comma of the address.
    if "address" in df.columns:
        df["neighborhood"] = (
            df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
        )
    else:
        df["neighborhood"] = "Aalborg"

    if "description" not in df.columns:
        df["description"] = ""
    return df


df = load_data("apt.csv")

st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")

tab_overview, tab_search, tab_fair, tab_network = st.tabs(
    ["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)

with tab_overview:
    st.subheader("Dataset Snapshot")
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Listings", len(df))
    c2.metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
    c3.metric("Median Size (m²)", int(df["size_m2"].median()))
    c4.metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
    # st.map needs 'latitude'/'longitude' columns; skip if coordinates absent.
    if {"lat", "lon"}.issubset(df.columns):
        st.map(df.rename(columns={"lat": "latitude", "lon": "longitude"})[
            ["latitude", "longitude"]].dropna())
    st.dataframe(df[["title", "address", "type", "rooms", "size_m2",
                     "rent_dkk", "rent_per_m2"]].head(30))

with tab_search:
    st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
    pref = st.text_area(
        "Describe your ideal place (English or Danish):", height=100,
        value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed",
    )
    max_rent = st.slider("Max monthly rent (DKK)", 2000,
                         int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
    min_size = st.slider("Minimum size (m²)", 10,
                         int(df["size_m2"].quantile(0.99)), 50, step=5)
    # FIX: int(np.nanmax(...)) raises when "rooms" is present but all-NaN;
    # only use the column max when at least one value exists.
    if "rooms" in df.columns and df["rooms"].notna().any():
        rooms_cap = int(np.nanmax(df["rooms"]))
    else:
        rooms_cap = 5
    rooms = st.slider("Min rooms", 1, rooms_cap, 2, step=1)

    filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
    if "rooms" in filter_df.columns:
        filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]

    corpus = filter_df["description"].fillna("").tolist()
    # FIX: TfidfVectorizer.fit_transform raises ValueError on an empty corpus
    # or an all-blank one ("empty vocabulary") — show a message instead of
    # crashing the app when the filters exclude everything.
    if not corpus or not any(doc.strip() for doc in corpus):
        st.warning("No listings with descriptions match the current filters.")
    else:
        vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
        X = vectorizer.fit_transform(corpus)
        q = vectorizer.transform([pref])
        sims = cosine_similarity(q, X).ravel()
        filter_df = (filter_df.assign(similarity=sims)
                     .sort_values("similarity", ascending=False).head(50))
        st.write("Top matches:")
        st.dataframe(filter_df[["title", "address", "rent_dkk", "size_m2",
                                "rooms", "similarity"]])
        if {"lat", "lon"}.issubset(filter_df.columns):
            st.map(filter_df.rename(columns={"lat": "latitude", "lon": "longitude"})[
                ["latitude", "longitude"]].dropna())

with tab_fair:
    st.subheader("Estimate a 'fair rent' and spot outliers")
    # FIX: the original feature list included rent_per_m2, which is derived
    # directly from the target (rent_dkk / size_m2 in load_data) — textbook
    # target leakage that lets the model trivially reproduce the actual rent
    # and makes the under/over-priced deltas meaningless.
    features = ["size_m2", "rooms", "floor", "furnished", "pets_allowed", "elevator"]
    cat = ["type", "neighborhood"]
    num_cols = [c for c in features if c in df.columns]
    cat_cols = [c for c in cat if c in df.columns]
    data = df[num_cols + cat_cols + ["rent_dkk"]].dropna()
    X = data.drop(columns=["rent_dkk"])
    y = data["rent_dkk"]

    # Scale numerics, one-hot the categoricals; unseen categories at predict
    # time are ignored rather than raising.
    pre = ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ])
    model = Pipeline([("pre", pre), ("lin", LinearRegression())])

    if len(data) > 50:
        Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
        model.fit(Xtr, ytr)
        preds = model.predict(Xte)
        mae = np.mean(np.abs(preds - yte))
        st.metric("Validation MAE (DKK)", int(mae))

        # FIX: drop NaN on *all* model inputs — the original only dropped the
        # numeric ones, so a NaN category could break the one-hot transform.
        df_pred = df.dropna(subset=num_cols + cat_cols).copy()
        df_pred["pred_rent"] = model.predict(df_pred[num_cols + cat_cols])
        df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
        show_cols = ["title", "address", "rent_dkk", "pred_rent", "delta",
                     "size_m2", "rooms"]
        st.write("Potentially underpriced (negative delta):")
        st.dataframe(df_pred.sort_values("delta").head(20)[show_cols])
        st.write("Potentially overpriced (positive delta):")
        st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[show_cols])
    else:
        st.info("Not enough data to train a model.")

with tab_network:
    st.subheader("Amenity Co‑occurrence Network")
    # Treat the known flag columns plus any other {0,1}-valued column
    # (excluding rooms/floor) as amenities.
    amenity_cols = []
    for c in df.columns:
        if c.lower() in ["furnished", "shared_friendly", "pets_allowed", "elevator"]:
            amenity_cols.append(c)
        else:
            vals = df[c].dropna()
            # FIX: .isin([0,1]).all() is vacuously True on an all-NaN column,
            # so the original misclassified empty columns as amenities.
            if (len(vals) > 0 and vals.isin([0, 1]).all()
                    and df[c].nunique() <= 3 and c not in ["rooms", "floor"]):
                amenity_cols.append(c)
    amenity_cols = list(dict.fromkeys(amenity_cols))  # unique, preserve order
    st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")

    if amenity_cols:
        # Count how often each (ordered) pair of amenities is active on the
        # same listing.
        edge_counter = Counter()
        for _, row in df[amenity_cols].fillna(0).iterrows():
            active = [amenity_cols[i] for i, v in enumerate(row.values) if v == 1]
            for i in range(len(active)):
                for j in range(i + 1, len(active)):
                    edge_counter[(active[i], active[j])] += 1

        G = nx.Graph()
        for (u, v), w in edge_counter.items():
            if w >= 5:  # only keep frequent co-occurrences
                G.add_edge(u, v, weight=w)

        if G.number_of_edges() == 0:
            st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
        else:
            pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
            fig = plt.figure(figsize=(7, 5))
            weights = [G[u][v]["weight"] for u, v in G.edges()]
            nx.draw(G, pos, with_labels=True, node_size=1200,
                    width=[w / 200 for w in weights])
            st.pyplot(fig)
            st.write("Edges weighted by how often amenities appear together in listings.")
    else:
        st.info("No amenity-like columns detected.")

st.sidebar.header("About this POC")
st.sidebar.markdown(
    """
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** + **Network analysis (amenity co‑occurrence)** can help **renters** discover good deals and **landlords** benchmark prices.

**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)