"""Train SVR and XGBoost regressors that predict property prices.

The script loads the raw real-estate CSV, cleans the price/area/bath/balcony
columns, label-encodes the location, trains both models on an 80/20 split,
prints their R2 scores on the held-out part, and saves the fitted models,
encoder and scalers with joblib.

All paths are relative to the current working directory; the script expects
to be launched from the ``src/`` folder.
"""
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

# -------------------------
# LOAD DATASET
# -------------------------

# Path from src/ folder to dataset
df = pd.read_csv("../dataset/Real_Estate_Data.csv")

# Normalise headers so columns can be addressed as e.g. "Total_Area".
df.columns = df.columns.str.strip().str.replace(" ", "_")

print("Columns:", df.columns)
print("Original dataset:", df.shape)

# -------------------------
# CLEAN PRICE FUNCTION
# -------------------------

# (unit marker, multiplier to rupees). Order matters: "Lac" has to be tried
# before the bare "L" marker, because every "Lac" string also contains "L".
_PRICE_UNITS = (
    ("Cr", 10_000_000),
    ("Lac", 100_000),
    ("L", 100_000),
)


def convert_price(price):
    """Convert a raw price cell into a plain number of rupees.

    The value is stringified, the rupee sign and thousands separators are
    removed, and a "Cr" / "Lac" / "L" unit marker (if present) is applied as
    a multiplier.

    Args:
        price: Raw cell value, e.g. "₹1.5 Cr", "75 Lac", "45 L" or "2500000".

    Returns:
        The price as a float, or None when the remaining text is not a valid
        number (such rows are dropped later by ``dropna``).
    """
    text = str(price).replace("₹", "").replace(",", "").strip()

    try:
        for marker, multiplier in _PRICE_UNITS:
            if marker in text:
                return float(text.replace(marker, "").strip()) * multiplier
        return float(text)
    except ValueError:
        # float() raises ValueError for unparseable text; anything else is a
        # genuine bug and should not be silenced.
        return None


df["Price"] = df["Price"].apply(convert_price)

# -------------------------
# CLEAN AREA
# -------------------------

# Strip the unit suffixes, then coerce whatever is left; non-numeric
# leftovers become NaN and are filled with the median below.
area_text = (
    df["Total_Area"]
    .astype(str)
    .str.replace("sqft", "", regex=False)
    .str.replace("sq.ft", "", regex=False)
)
df["Total_Area"] = pd.to_numeric(area_text, errors="coerce")

# -------------------------
# OTHER FEATURES
# -------------------------

df["Baths"] = pd.to_numeric(df["Baths"], errors="coerce")
df["Balcony"] = pd.to_numeric(df["Balcony"], errors="coerce")

# -------------------------
# HANDLE MISSING VALUES
# -------------------------

df["Total_Area"] = df["Total_Area"].fillna(df["Total_Area"].median())
df["Baths"] = df["Baths"].fillna(1)
df["Balcony"] = df["Balcony"].fillna(0)

# Rows without a target or a location cannot be used. The explicit copy keeps
# the column assignments below from operating on a derived frame.
df = df.dropna(subset=["Price", "Location"]).copy()

# -------------------------
# FEATURE ENGINEERING
# -------------------------

# BHK is a straight copy of the bath count.
# NOTE(review): presumably a stand-in because no bedroom column is used -
# confirm, since it adds no information beyond "Baths".
df["BHK"] = df["Baths"]

# Keep only the model inputs and the target; copy so that later column
# assignments do not trigger pandas' SettingWithCopyWarning.
df = df[["Total_Area", "BHK", "Baths", "Balcony", "Location", "Price"]].copy()

print("Dataset after cleaning:", df.shape)

# -------------------------
# ENCODE LOCATION
# -------------------------

# The encoder is fitted on every location in the cleaned dataset (before the
# train/test split) and is saved below alongside the models.
encoder = LabelEncoder()
df["Location"] = encoder.fit_transform(df["Location"])

# -------------------------
# SPLIT DATA
# -------------------------

X = df.drop("Price", axis=1)
y = df["Price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------------
# SCALE FEATURES FOR SVM
# -------------------------

# Both scalers are fitted on the training part only and then applied to the
# test part.
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# StandardScaler needs 2-D input, hence the reshape / ravel round trip.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

# -------------------------
# TRAIN SVM
# -------------------------

svm = SVR(kernel="rbf", C=100, epsilon=0.1, gamma="scale")
svm.fit(X_train_scaled, y_train_scaled)

# Predictions come out in scaled target units; map them back to rupees and
# flatten to 1-D so they have the same shape as y_test.
svm_pred_scaled = svm.predict(X_test_scaled)
svm_pred = scaler_y.inverse_transform(svm_pred_scaled.reshape(-1, 1)).ravel()

print("SVM R2:", r2_score(y_test, svm_pred))

# -------------------------
# TRAIN XGBOOST
# -------------------------

# XGBoost is trained on the raw (unscaled) features and target.
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
)
xgb.fit(X_train, y_train)

xgb_pred = xgb.predict(X_test)

print("XGBoost R2:", r2_score(y_test, xgb_pred))

# -------------------------
# SAVE MODELS
# -------------------------

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs("../model", exist_ok=True)

joblib.dump(svm, "../model/svm_model.pkl")
joblib.dump(xgb, "../model/xgb_model.pkl")
joblib.dump(encoder, "../model/location_encoder.pkl")
joblib.dump(scaler_X, "../model/scaler_X.pkl")
joblib.dump(scaler_y, "../model/scaler_y.pkl")

print("Models saved successfully")