mariajessington's picture
Upload 9 files
56e6417 verified
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
# -------------------------
# LOAD DATASET
# -------------------------
# The dataset path is relative to the src/ folder.
df = pd.read_csv("../dataset/Real_Estate_Data.csv")

# Normalise the headers: trim surrounding whitespace, then turn inner
# spaces into underscores so columns can be addressed as "Total_Area" etc.
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers.str.replace(" ", "_")

print("Columns:", df.columns)
print("Original dataset:", df.shape)
# -------------------------
# CLEAN PRICE FUNCTION
# -------------------------
def convert_price(price):
    """Convert a listing price such as "₹1.5 Cr" or "45 Lac" to rupees.

    The rupee sign and comma separators are removed, then an optional unit
    (crore = 10,000,000; lakh/lac = 100,000) is matched case-insensitively
    and applied as a multiplier. Text without a unit is parsed as a plain
    number.

    Args:
        price: Raw cell value. It is passed through ``str`` first, so any
            type is accepted.

    Returns:
        The price as a float, or ``None`` when the text left after removing
        the currency sign, separators and unit is not a valid number.
    """
    text = str(price).replace("₹", "").replace(",", "").strip().lower()

    # Longer spellings are listed first so that "crore" is not reduced to
    # "ore" by the "cr" rule, nor "lakh" to "akh" by the bare "l" rule.
    units = (
        ("crores", 10000000),
        ("crore", 10000000),
        ("cr", 10000000),
        ("lakhs", 100000),
        ("lakh", 100000),
        ("lacs", 100000),
        ("lac", 100000),
        ("l", 100000),
    )

    multiplier = 1
    for unit, factor in units:
        if unit in text:
            text = text.replace(unit, "")
            multiplier = factor
            break

    try:
        return float(text) * multiplier
    except ValueError:
        # float() on a str only fails with ValueError; unparseable prices are
        # reported as None so the caller can drop them.
        return None
df["Price"] = df["Price"].apply(convert_price)
# -------------------------
# CLEAN AREA
# -------------------------
# Remove comma separators (the price cleaning strips them too) and the
# square-foot unit in any letter case ("sqft", "sq.ft", "Sq. Ft", "SQ FT").
# Without this, a value such as "1,200 Sq.Ft" would be coerced to NaN below.
area_text = df["Total_Area"].astype(str).str.replace(",", "", regex=False)
area_text = area_text.str.replace(r"sq\.?\s*ft", "", case=False, regex=True)
# Whatever is still not a number becomes NaN.
df["Total_Area"] = pd.to_numeric(area_text.str.strip(), errors="coerce")
# -------------------------
# OTHER FEATURES
# -------------------------
# Non-numeric bath / balcony entries become NaN.
df["Baths"] = pd.to_numeric(df["Baths"], errors="coerce")
df["Balcony"] = pd.to_numeric(df["Balcony"], errors="coerce")
# -------------------------
# HANDLE MISSING VALUES
# -------------------------
# Missing areas take the column median; missing bath and balcony counts
# default to 1 and 0 respectively.
fill_values = {
    "Total_Area": df["Total_Area"].median(),
    "Baths": 1,
    "Balcony": 0,
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

# Rows without a price or a location are dropped.
df = df.dropna(subset=["Price", "Location"])
# -------------------------
# FEATURE ENGINEERING
# -------------------------
# Work on explicit copies: the frame produced by the earlier row filtering
# and by the column selection below may be flagged by pandas as derived from
# another frame, and writing columns into it raises SettingWithCopyWarning.
df = df.copy()

# NOTE(review): BHK is filled directly from Baths, so the two features are
# identical. Confirm whether the dataset has a real bedroom-count column.
df["BHK"] = df["Baths"]

# Keep only the model inputs plus the target, in a fixed column order.
model_columns = ["Total_Area", "BHK", "Baths", "Balcony", "Location", "Price"]
df = df[model_columns].copy()
print("Dataset after cleaning:", df.shape)
# -------------------------
# ENCODE LOCATION
# -------------------------
# Learn the set of distinct locations, then replace each location with its
# integer code. The fitted encoder is kept for later use.
encoder = LabelEncoder()
encoder.fit(df["Location"])
df["Location"] = encoder.transform(df["Location"])
# -------------------------
# SPLIT DATA
# -------------------------
# Price is the target; every remaining column is a feature.
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# -------------------------
# SCALE FEATURES FOR SVM
# -------------------------
# Both scalers are fitted on the training split only and then applied to
# the training and test splits.
scaler_X = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# The scaler expects 2-D input, so the target is reshaped to a single column
# and flattened back to 1-D afterwards.
y_train_column = y_train.values.reshape(-1, 1)
y_test_column = y_test.values.reshape(-1, 1)
scaler_y = StandardScaler().fit(y_train_column)
y_train_scaled = scaler_y.transform(y_train_column).ravel()
y_test_scaled = scaler_y.transform(y_test_column).ravel()
# -------------------------
# TRAIN SVM
# -------------------------
# The SVR is trained on the scaled features and the scaled target.
svm = SVR(kernel="rbf", C=100, epsilon=0.1, gamma='scale')
svm.fit(X_train_scaled, y_train_scaled)

# Predictions come out in scaled units; map them back to prices before
# scoring against the unscaled test target.
svm_pred_scaled = svm.predict(X_test_scaled)
svm_pred = scaler_y.inverse_transform(svm_pred_scaled.reshape(-1, 1))
svm_r2 = r2_score(y_test, svm_pred)
print("SVM R2:", svm_r2)
# -------------------------
# TRAIN XGBOOST
# -------------------------
# XGBoost is trained on the unscaled features and the raw prices.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "random_state": 42,
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

xgb_pred = xgb.predict(X_test)
xgb_r2 = r2_score(y_test, xgb_pred)
print("XGBoost R2:", xgb_r2)
# -------------------------
# SAVE MODELS
# -------------------------
# Create the output folder first so that a missing ../model directory does
# not make the script fail after training has already finished. The path is
# relative to the current working directory.
os.makedirs("../model", exist_ok=True)

# Persist both regressors together with the fitted encoder and scalers.
joblib.dump(svm, "../model/svm_model.pkl")
joblib.dump(xgb, "../model/xgb_model.pkl")
joblib.dump(encoder, "../model/location_encoder.pkl")
joblib.dump(scaler_X, "../model/scaler_X.pkl")
joblib.dump(scaler_y, "../model/scaler_y.pkl")
print("Models saved successfully")