# app.py
"""Gradio demo that estimates a listing price from a real-estate ads dataset.

Running this module:

1. downloads the ``divarofficial/real_estate_ads`` dataset and samples up to
   ``SAMPLE_SIZE`` rows from its ``train`` split,
2. keeps five columns, coerces the numeric ones, and drops incomplete rows,
3. label-encodes the two categorical columns,
4. fits a ``RandomForestRegressor`` on an 80/20 train/test split, and
5. launches a Gradio form backed by ``predict_price``.

NOTE(review): the UI strings below say "Bangladesh", "BDT" and "sqft", while
the data comes from the ``divarofficial/real_estate_ads`` dataset. Confirm the
country, currency and area unit against the dataset card before relying on
those labels; they are left unchanged here.
"""
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import gradio as gr

SAMPLE_SIZE = 50_000
RANDOM_STATE = 42
NUMERIC_COLUMNS = ['rooms_count', 'building_size', 'price_value']
FEATURE_COLUMNS = ['city_slug', 'rooms_count', 'building_size', 'property_type']
TARGET_COLUMN = 'price_value'

# ------------------------------
# 1. Load dataset (smaller subset)
# ------------------------------
dataset = load_dataset("divarofficial/real_estate_ads")
df = dataset["train"].to_pandas()
# Cap the sample size: DataFrame.sample raises ValueError when n > len(df).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

# ------------------------------
# 2. Keep only essential columns
# ------------------------------
columns_to_use = ['city_slug', 'rooms_count', 'building_size', 'property_type', 'price_value']
# Every column below is used unconditionally later on, so fail early with a
# clear message instead of silently filtering and hitting a KeyError further down.
missing_columns = [col for col in columns_to_use if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")
# Explicit copy so the column assignments below never write into a slice.
df = df[columns_to_use].copy()

# ------------------------------
# 3. Convert numeric columns safely
# ------------------------------
for col in NUMERIC_COLUMNS:
    # Unparseable values become NaN and are dropped together with other gaps.
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with a gap in ANY used column. Categorical gaps matter too:
# otherwise a missing city / property type would reach the label encoders
# and end up as a choice in the dropdowns.
df = df.dropna(subset=columns_to_use)
if df.empty:
    raise ValueError(
        "No usable rows left after cleaning; check that the numeric columns "
        f"{NUMERIC_COLUMNS} actually contain numeric values."
    )

# ------------------------------
# 4. Encode categorical features
# ------------------------------
le_city = LabelEncoder()
df['city_slug'] = le_city.fit_transform(df['city_slug'])

le_type = LabelEncoder()
df['property_type'] = le_type.fit_transform(df['property_type'])

# ------------------------------
# 5. Features and target
# ------------------------------
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# ------------------------------
# 6. Train model
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)
# Report the held-out score so the otherwise unused test split is put to work.
print(f"Held-out R^2: {model.score(X_test, y_test):.3f}")


# ------------------------------
# 7. Prediction function
# ------------------------------
def predict_price(city, rooms, area, prop_type):
    """Return a formatted price estimate for one listing.

    Args:
        city: City label; must be one of ``le_city.classes_``.
        rooms: Number of rooms (anything ``float()`` accepts).
        area: Building size (anything ``float()`` accepts).
        prop_type: Property type label; must be one of ``le_type.classes_``.

    Returns:
        ``"Estimated Price: <amount> BDT"`` on success, or a string starting
        with ``"Error: "`` describing why the inputs were rejected. The
        function reports input problems through its return value rather
        than by raising, so the UI always has text to show.
    """
    # Empty form fields arrive as None; report that directly instead of
    # surfacing a low-level conversion error.
    if city is None or rooms is None or area is None or prop_type is None:
        return "Error: please fill in all fields."

    try:
        rooms_value = float(rooms)
        area_value = float(area)
        if not (np.isfinite(rooms_value) and np.isfinite(area_value)):
            raise ValueError("rooms and area must be finite numbers")
        if rooms_value < 0 or area_value <= 0:
            raise ValueError("rooms must be >= 0 and area must be > 0")
        # LabelEncoder.transform raises ValueError for labels it never saw.
        city_enc = le_city.transform([city])[0]
        type_enc = le_type.transform([prop_type])[0]
    except (ValueError, TypeError) as exc:
        return f"Error: {exc}"

    # The model was fitted on a DataFrame; predict with the same column
    # names so scikit-learn's feature-name check is satisfied.
    X_new = pd.DataFrame(
        [[city_enc, rooms_value, area_value, type_enc]],
        columns=FEATURE_COLUMNS,
    )
    price = model.predict(X_new)[0]
    return f"Estimated Price: {price:,.0f} BDT"


# ------------------------------
# 8. Launch Gradio App
# ------------------------------
# .tolist() yields plain Python values for the dropdown choices.
locations = le_city.classes_.tolist()
types = le_type.classes_.tolist()

demo = gr.Interface(
    fn=predict_price,
    inputs=[
        gr.Dropdown(locations, label="City"),
        gr.Number(label="Rooms"),
        gr.Number(label="Area (sqft)"),
        gr.Dropdown(types, label="Property Type"),
    ],
    outputs="text",
    title="🏠 Simple Bangladesh House Price Predictor",
    description="Predict house prices based on city, rooms, area, and property type.",
)
demo.launch()