# NLP_exam / app_apt.py
# Uploaded by soysouce (commit 74bf8e2, verified)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import matplotlib.pyplot as plt
st.set_page_config(page_title="Aalborg Rentals — POC", layout="wide")
@st.cache_data
def load_data(path):
df = pd.read_csv(path)
# Basic cleaning
if "rent_dkk" not in df.columns and "rent_raw" in df.columns:
df["rent_dkk"] = df["rent_raw"]
# Ensure numerics
num_cols = ["rent_dkk", "rent_per_m2", "size_m2", "rooms", "floor", "deposit_dkk", "move_in_price", "lat", "lon"]
for c in num_cols:
if c in df.columns:
df[c] = pd.to_numeric(df[c], errors="coerce")
# Fill some booleans if present
boolish = ["furnished", "shared_friendly", "pets_allowed", "elevator"]
for b in boolish:
if b in df.columns:
df[b] = df[b].astype("float").fillna(0.0).astype(int)
# Derive rent_per_m2 if missing
if "rent_per_m2" in df.columns:
df["rent_per_m2"] = df["rent_per_m2"].fillna(df["rent_dkk"] / df["size_m2"])
else:
df["rent_per_m2"] = df["rent_dkk"] / df["size_m2"]
# Drop crazy values
df = df[(df["rent_dkk"] > 0) & (df["size_m2"] > 0) & np.isfinite(df["rent_per_m2"])]
# Neighborhood quick heuristic from address
if "address" in df.columns:
df["neighborhood"] = df["address"].fillna("").str.extract(r",\s*([^,]+)$")[0].fillna("Aalborg")
else:
df["neighborhood"] = "Aalborg"
# Description text
if "description" not in df.columns:
df["description"] = ""
return df
df = load_data("apt.csv")
st.title("🏠 Aalborg Rentals — Rapid POC")
st.caption("NLP + ML + Network Analysis on Boligportal data.")
tab_overview, tab_search, tab_fair, tab_network = st.tabs(
["📊 Overview", "🔎 Preference Match (NLP)", "💸 Fair Price (ML)", "🧭 Amenity Network"]
)
with tab_overview:
st.subheader("Dataset Snapshot")
c1, c2, c3, c4 = st.columns(4)
c1.metric("Listings", len(df))
c2.metric("Median Rent (DKK)", int(df["rent_dkk"].median()))
c3.metric("Median Size (m²)", int(df["size_m2"].median()))
c4.metric("Median Rent/m²", round(df["rent_per_m2"].median(), 1))
st.map(df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
st.dataframe(df[["title","address","type","rooms","size_m2","rent_dkk","rent_per_m2"]].head(30))
with tab_search:
st.subheader("Find listings that match your preferences (TF‑IDF cosine similarity)")
pref = st.text_area("Describe your ideal place (English or Danish):", height=100,
value="2 rooms, balcony, dishwasher, close to university, quiet area, pets allowed")
max_rent = st.slider("Max monthly rent (DKK)", 2000, int(df["rent_dkk"].quantile(0.99)), 8000, step=500)
min_size = st.slider("Minimum size (m²)", 10, int(df["size_m2"].quantile(0.99)), 50, step=5)
rooms = st.slider("Min rooms", 1, int(np.nanmax(df["rooms"])) if "rooms" in df.columns else 5, 2, step=1)
filter_df = df[(df["rent_dkk"] <= max_rent) & (df["size_m2"] >= min_size)]
if "rooms" in filter_df.columns:
filter_df = filter_df[filter_df["rooms"].fillna(0) >= rooms]
# TF-IDF over descriptions
corpus = filter_df["description"].fillna("").tolist()
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(corpus)
q = vectorizer.transform([pref])
sims = cosine_similarity(q, X).ravel()
filter_df = filter_df.assign(similarity=sims).sort_values("similarity", ascending=False).head(50)
st.write("Top matches:")
st.dataframe(filter_df[["title","address","rent_dkk","size_m2","rooms","similarity"]])
st.map(filter_df.rename(columns={"lat":"latitude","lon":"longitude"})[["latitude","longitude"]].dropna())
with tab_fair:
st.subheader("Estimate a 'fair rent' and spot outliers")
# Features
features = ["size_m2","rooms","floor","furnished","pets_allowed","elevator","rent_per_m2"]
cat = ["type","neighborhood"]
use_cols = [c for c in features if c in df.columns] + [c for c in cat if c in df.columns] + ["rent_dkk"]
data = df[use_cols].dropna()
X = data.drop(columns=["rent_dkk"])
y = data["rent_dkk"]
# Preprocess
num_cols = [c for c in X.columns if c not in cat]
pre = ColumnTransformer([
("num", StandardScaler(), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), [c for c in cat if c in X.columns])
])
model = Pipeline([("pre", pre), ("lin", LinearRegression())])
if len(data) > 50:
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(Xtr, ytr)
preds = model.predict(Xte)
mae = np.mean(np.abs(preds - yte))
st.metric("Validation MAE (DKK)", int(mae))
# Score all listings
df_pred = df.copy()
df_pred = df_pred.dropna(subset=num_cols)
df_pred["pred_rent"] = model.predict(df_pred[num_cols + [c for c in cat if c in df_pred.columns]])
df_pred["delta"] = df_pred["rent_dkk"] - df_pred["pred_rent"]
st.write("Potentially underpriced (negative delta):")
st.dataframe(df_pred.sort_values("delta").head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
st.write("Potentially overpriced (positive delta):")
st.dataframe(df_pred.sort_values("delta", ascending=False).head(20)[["title","address","rent_dkk","pred_rent","delta","size_m2","rooms"]])
else:
st.info("Not enough data to train a model.")
with tab_network:
st.subheader("Amenity Co‑occurrence Network")
# pick boolean/flag columns as amenities
amenity_cols = []
for c in df.columns:
if c.lower() in ["furnished","shared_friendly","pets_allowed","elevator"]:
amenity_cols.append(c)
elif df[c].dropna().isin([0,1]).all() and df[c].nunique()<=3 and c not in ["rooms","floor"]:
amenity_cols.append(c)
amenity_cols = list(dict.fromkeys(amenity_cols)) # unique preserve order
st.caption(f"Detected amenity columns: {', '.join(amenity_cols) if amenity_cols else 'None'}")
if amenity_cols:
# build co-occurrence matrix
A = []
for _, row in df[amenity_cols].fillna(0).iterrows():
active = [amenity_cols[i] for i, v in enumerate(row.values) if v==1 or v==True]
A.append(active)
# edges: pair counts
from collections import Counter
edge_counter = Counter()
for acts in A:
for i in range(len(acts)):
for j in range(i+1, len(acts)):
edge_counter[(acts[i], acts[j])] += 1
G = nx.Graph()
for (u,v), w in edge_counter.items():
if w >= 5: # only keep frequent co-occurrences
G.add_edge(u, v, weight=w)
if G.number_of_edges() == 0:
st.info("Not enough co-occurrences to visualize. Try lowering the threshold in code.")
else:
pos = nx.spring_layout(G, seed=42, k=2.0, iterations=500, weight=None)
plt.figure(figsize=(7,5))
weights = [G[u][v]['weight'] for u,v in G.edges()]
nx.draw(G, pos, with_labels=True, node_size=1200, width=[w/200 for w in weights])
st.pyplot(plt.gcf())
st.write("Edges weighted by how often amenities appear together in listings.")
else:
st.info("No amenity-like columns detected.")
st.sidebar.header("About this POC")
st.sidebar.markdown(
"""
**Goal:** Show how **NLP (TF‑IDF)** + **ML (linear model for 'fair rent')** +
**Network analysis (amenity co‑occurrence)** can help **renters** discover good deals
and **landlords** benchmark prices.
**How to run locally**:
```bash
pip install -r requirements.txt
streamlit run app.py
```
"""
)