# yashu / src /streamlit_app.py
# Yash007001's picture
# Update src/streamlit_app.py
# 7fd49e9 verified
# streamlit_app.py
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import traceback
import sys
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Page chrome: browser-tab title and wide layout first, then the on-page heading.
st.set_page_config(
    layout="wide",
    page_title="Synthetic Car Price Prediction (KNN)",
)
st.title("Synthetic Car Price Prediction — KNN (Streamlit)")
# ----------------------
# Synthetic data generator
# ----------------------
def generate_synthetic_cars(n_samples=2000, random_state=42):
    """Generate a reproducible table of synthetic car listings.

    Every row gets randomly drawn attributes (make, model, year, mileage,
    engine, trim options, ...) and a ``Price`` computed from a simple additive
    formula over those attributes plus Gaussian noise, floored at 1000.

    Args:
        n_samples: Number of rows to generate. Zero (or a negative value)
            yields an empty frame that still carries every column, so
            downstream code selecting ``'Price'`` does not hit a ``KeyError``.
        random_state: Seed handed to ``numpy.random.default_rng``. The same
            seed and ``n_samples`` always produce the same frame.

    Returns:
        A ``pandas.DataFrame`` with one row per car: the feature columns
        followed by the ``Price`` target column.
    """
    rng = np.random.default_rng(random_state)

    manufacturers = ['TOYOTA', 'HONDA', 'HYUNDAI', 'FORD', 'BMW', 'AUDI', 'KIA', 'HUNTER']  # HUNTER added to match user context
    models_by_make = {
        'TOYOTA': ['Corolla', 'Camry', 'Prius'],
        'HONDA': ['Civic', 'Accord', 'City'],
        'HYUNDAI': ['i20', 'Elantra', 'Creta'],
        'FORD': ['Figo', 'Focus', 'Mustang'],
        'BMW': ['3 Series', '5 Series'],
        'AUDI': ['A4', 'A6'],
        'KIA': ['Seltos', 'Sonet'],
        'HUNTER': ['Classic', 'Cruiser'],
    }
    categories = ['Hatchback', 'Sedan', 'SUV', 'Coupe']
    leather_opts = ['Yes', 'No']
    fuels = ['Petrol', 'Diesel', 'Hybrid', 'Electric']
    gearbox = ['Manual', 'Automatic']
    drive = ['Front', 'Rear', 'All']
    doors = ['02-Jan', '04-May', '05-Oct']  # kept as string-like options
    wheels = ['Left wheel', 'Right wheel']
    colors = ['White', 'Black', 'Silver', 'Red', 'Blue']

    # Pricing tables are loop-invariant, so they are built once up front.
    # Manufacturer multiplier on the 10000 base (TOYOTA/HONDA lower, BMW/AUDI higher).
    prestige = {'TOYOTA': 1.0, 'HONDA': 1.0, 'HYUNDAI': 0.9, 'FORD': 0.95,
                'BMW': 2.2, 'AUDI': 2.0, 'KIA': 0.9, 'HUNTER': 1.1}
    category_bump = {'SUV': 5000, 'Coupe': 3000}
    fuel_bump = {'Hybrid': 2500, 'Electric': 8000}

    # Explicit column list so that an empty result still has the full schema.
    columns = [
        'Levy', 'Manufacturer', 'Model', 'Prod. year', 'Category',
        'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
        'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
        'Color', 'Airbags', 'Price',
    ]

    data = []
    for _ in range(n_samples):
        # NOTE: the order of the rng draws fixes the output for a given seed;
        # reordering them changes every generated dataset.
        make = rng.choice(manufacturers)
        model = rng.choice(models_by_make[make])
        prod_year = int(rng.integers(2005, 2025))  # 2005..2024 (upper bound exclusive)
        mileage = int(abs(rng.normal(loc=50000, scale=30000)))  # some can be high
        engine_vol = float(np.round(rng.choice([1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0]), 1))
        cylinders = int(rng.choice([3, 4, 6, 8]))
        airbags = int(rng.choice([2, 4, 6, 8, 10]))
        levy = float(np.round(np.abs(rng.normal(loc=500.0, scale=1200.0)), 2))  # non-negative tax/levy-like amount
        category = rng.choice(categories)
        leather = rng.choice(leather_opts, p=[0.25, 0.75])
        fuel = rng.choice(fuels, p=[0.6, 0.25, 0.1, 0.05])
        gear = rng.choice(gearbox, p=[0.6, 0.4])
        dr = rng.choice(drive, p=[0.7, 0.15, 0.15])
        door = rng.choice(doors)
        wheel = rng.choice(wheels)
        color = rng.choice(colors)

        # Additive price model: prestige, age, engine size, body category,
        # airbags, leather, fuel type, mileage depreciation and levy.
        base_price = 0.0
        base_price += 10000 * prestige.get(make, 1.0)
        base_price += (prod_year - 2000) * 500  # newer cars cost more
        base_price += engine_vol * 1500
        base_price += category_bump.get(category, 0)
        base_price += airbags * 100
        if leather == 'Yes':
            base_price += 1500
        base_price += fuel_bump.get(fuel, 0)
        base_price -= (mileage / 1000) * 300  # 300 per 1000 km
        base_price += levy * 1.0

        noise = rng.normal(loc=0.0, scale=2000.0)
        # Floor at 1000 so noise/depreciation can never produce a negative price.
        price = max(1000.0, base_price + noise)

        data.append({
            'Levy': round(levy, 2),
            'Manufacturer': make,
            'Model': model,
            'Prod. year': prod_year,
            'Category': category,
            'Leather interior': leather,
            'Fuel type': fuel,
            'Engine volume': engine_vol,
            'Mileage': mileage,
            'Cylinders': cylinders,
            'Gear box type': gear,
            'Drive wheels': dr,
            'Doors': door,
            'Wheel': wheel,
            'Color': color,
            'Airbags': airbags,
            'Price': round(price, 2),
        })

    return pd.DataFrame(data, columns=columns)
# ----------------------
# Sidebar: options
# ----------------------
st.sidebar.header("Synthetic dataset & model controls")
n_samples = st.sidebar.slider(
    "Number of synthetic samples",
    min_value=200,
    max_value=20000,
    value=2000,
    step=100,
)
seed = st.sidebar.number_input("Random seed", value=42, step=1)
k_neighbors = st.sidebar.number_input(
    "K (n_neighbors for KNN)",
    min_value=1,
    max_value=100,
    value=5,
    step=1,
)
# The slider works in whole percent; the train/test split wants a fraction.
test_size = st.sidebar.slider(
    "Test size (%)",
    min_value=5,
    max_value=50,
    value=20,
    step=5,
) / 100.0
random_state = st.sidebar.number_input("Random state (train/test split)", value=42, step=1)
regen = st.sidebar.button("Regenerate dataset")
run_train = st.sidebar.button("Train Model")

# The dataset lives in session state: it is built on first load and rebuilt
# only when the "Regenerate dataset" button is pressed.
if regen or 'synthetic_df' not in st.session_state:
    st.session_state['synthetic_df'] = generate_synthetic_cars(
        n_samples=n_samples,
        random_state=seed,
    )
df = st.session_state['synthetic_df']

st.subheader("Synthetic dataset preview")
st.dataframe(df.head(10))
# ----------------------
# Prepare features + types
# ----------------------
# Columns the model knows how to use, grouped by how they are preprocessed.
expected_numerical = [
    'Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags',
]
expected_categorical = [
    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
    'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
]

# Keep only the expected columns that are actually present in the frame.
numerical_features = list(filter(lambda name: name in df.columns, expected_numerical))
categorical_features = list(filter(lambda name: name in df.columns, expected_categorical))
st.markdown(f"**Numerical features used:** {numerical_features}")
st.markdown(f"**Categorical features used:** {categorical_features}")

# Rows without a target are useless for training (none expected in synthetic data).
df = df.dropna(subset=['Price'])
y = df['Price']
X = df.drop(columns='Price')
# ----------------------
# Preprocessor & Pipeline (robust to sklearn version)
# ----------------------
# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Probe which keyword this scikit-learn accepts for dense one-hot output:
# `sparse_output` if the constructor takes it, otherwise the older `sparse`.
ohe_kwargs = {'handle_unknown': 'ignore'}
try:
    OneHotEncoder(sparse_output=False, **ohe_kwargs)
except TypeError:
    ohe_kwargs['sparse'] = False
else:
    ohe_kwargs['sparse_output'] = False

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(**ohe_kwargs)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each feature group to its transformer; any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=int(k_neighbors))),
])
# ----------------------
# Train / Evaluate
# ----------------------
# Runs only on the rerun triggered by the "Train Model" button: split the data,
# fit the KNN pipeline, report metrics and a scatter plot, then keep the fitted
# pipeline in session state for the prediction sections further down.
if run_train:
    st.subheader("Training the KNN model on synthetic data")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=int(random_state)
        )

        # KNN cannot use more neighbours than there are training rows.
        max_k = max(1, len(X_train))
        chosen_k = int(k_neighbors)
        if chosen_k > max_k:
            st.warning(f"Requested k={chosen_k} is larger than number of training samples ({max_k}). Using k={max_k}.")
            chosen_k = max_k
        knn_pipeline.set_params(regressor__n_neighbors=chosen_k)

        with st.spinner("Fitting model..."):
            knn_pipeline.fit(X_train, y_train)
        st.success("Training complete.")

        # Evaluate on the held-out split.
        y_pred = knn_pipeline.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        st.metric("RMSE", f"${rmse:,.2f}")
        st.metric("R² score", f"{r2:.4f}")

        st.write("Prediction vs Actual (first 20 rows):")
        compare_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred}).reset_index(drop=True).head(20)
        st.dataframe(compare_df)

        fig, ax = plt.subplots()
        try:
            ax.scatter(y_test, y_pred, alpha=0.5)
            # The y = x reference line is decoration only: if it cannot be
            # drawn, say so but still show the scatter and save the model.
            try:
                maxv = max(np.nanmax(y_test.values), np.nanmax(y_pred))
                minv = min(np.nanmin(y_test.values), np.nanmin(y_pred))
                ax.plot([minv, maxv], [minv, maxv], linestyle='--')
            except Exception as diag_err:
                st.warning(f"Could not draw the reference diagonal: {diag_err}")
            ax.set_xlabel("Actual Price")
            ax.set_ylabel("Predicted Price")
            ax.set_title("Actual vs Predicted")
            st.pyplot(fig)
        finally:
            # pyplot keeps every figure alive until it is closed; without this
            # each training run would leave another figure in memory.
            plt.close(fig)

        # Save to session for later prediction.
        st.session_state['model_pipeline'] = knn_pipeline
        st.session_state['numerical_features'] = numerical_features
        st.session_state['categorical_features'] = categorical_features
        st.success("Model saved in session state — use the form below for single predictions.")
    except Exception as e:
        # Top-level boundary for the training step: show the error in the UI
        # and write the full traceback to stderr.
        st.error(f"Training failed: {e}")
        print("Training exception:", file=sys.stderr)
        traceback.print_exc()
# ----------------------
# Single-sample prediction form
# ----------------------
# Free-text form for predicting the price of one car with the trained pipeline.
st.subheader("Single-sample prediction")
if 'model_pipeline' not in st.session_state:
    st.info("Train the model first (click 'Train Model' in the sidebar).")
else:
    model_pipeline = st.session_state['model_pipeline']
    num_feats = st.session_state.get('numerical_features', [])
    cat_feats = st.session_state.get('categorical_features', [])

    with st.form("single_predict_form"):
        st.write("Fill feature values for a single car (leave numeric blank to use median-imputed value).")
        form_vals = {}
        for f in num_feats:
            form_vals[f] = st.text_input(f"{f} (numeric)", value="")
        for f in cat_feats:
            form_vals[f] = st.text_input(f"{f} (categorical)", value="")
        submitted = st.form_submit_button("Predict")

    if submitted:
        sample = {}
        unparsed = []  # numeric fields whose text could not be read as a number
        for f in num_feats:
            v = form_vals[f].strip()
            if v == "":
                # Blank means "unknown": NaN is filled by the pipeline's median imputer.
                sample[f] = np.nan
                continue
            try:
                if f == 'Mileage':
                    sample[f] = int(float(v.replace(',', '')))
                elif f == 'Engine volume':
                    # No comma stripping here: '1,8' must not silently become 18.
                    sample[f] = float(v)
                else:
                    sample[f] = float(v.replace(',', ''))
            except (ValueError, OverflowError):
                # Last resort: let pandas try; anything it cannot parse becomes NaN.
                sample[f] = pd.to_numeric(v, errors='coerce')
                if pd.isna(sample[f]):
                    unparsed.append(f)
        if unparsed:
            # Tell the user instead of silently predicting from imputed values.
            st.warning(
                "Could not read a number for: " + ", ".join(unparsed)
                + ". The median-imputed value is used instead."
            )

        for f in cat_feats:
            v = form_vals[f].strip()
            sample[f] = v if v != "" else "missing"

        sample_df = pd.DataFrame([sample])
        try:
            pred = model_pipeline.predict(sample_df)[0]
            st.success(f"Predicted Price: ${pred:,.2f}")
        except Exception as e:
            # Boundary for the prediction step: show the error in the UI and
            # write the full traceback to stderr.
            st.error(f"Prediction failed: {e}")
            print("Prediction exception:", file=sys.stderr)
            traceback.print_exc()
# ----------------------
# Quick example prediction
# ----------------------
st.subheader("Quick example prediction (auto-filled)")
if st.button("Run toy example prediction"):
    # One hard-coded car, as single-element columns ready for a DataFrame.
    example = {
        'Levy': [1000.0],
        'Manufacturer': ['TOYOTA'],
        'Model': ['Prius'],
        'Prod. year': [2018],
        'Category': ['Hatchback'],
        'Leather interior': ['Yes'],
        'Fuel type': ['Hybrid'],
        'Engine volume': [1.8],
        'Mileage': [50000],
        'Cylinders': [4.0],
        'Gear box type': ['Automatic'],
        'Drive wheels': ['Front'],
        'Doors': ['04-May'],
        'Wheel': ['Left wheel'],
        'Color': ['White'],
        'Airbags': [10],
    }
    # Line the example up with the training columns; a column the example does
    # not provide gets NaN (numeric) or the 'missing' placeholder (categorical).
    example_df = pd.DataFrame({
        col: example[col] if col in example
        else ([np.nan] if col in numerical_features else ['missing'])
        for col in X.columns
    })

    if 'model_pipeline' not in st.session_state:
        st.info("Train the model first to run the example prediction.")
    else:
        try:
            pred = st.session_state['model_pipeline'].predict(example_df)[0]
            st.success(f"Example Predicted Price: ${pred:,.2f}")
        except Exception as e:
            st.error(f"Example prediction failed: {e}")
            print("Example prediction exception:", file=sys.stderr)
            traceback.print_exc()

# Page footer.
st.markdown("---")
st.caption("This app generates synthetic car data and trains a KNN regressor. For production use, replace the synthetic generator with your real dataset and consider tree-based models for better performance.")