Spaces:

Yash007001
/

yashu

Sleeping

App Files Files Community

yashu / src /streamlit_app.py

Yash007001

Update src/streamlit_app.py

7fd49e9 verified 4 months ago

raw

history blame contribute delete

13.2 kB

	# streamlit_app.py
	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import traceback
	import sys

	from sklearn.model_selection import train_test_split
	from sklearn.neighbors import KNeighborsRegressor
	from sklearn.metrics import mean_squared_error, r2_score
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.impute import SimpleImputer
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline

	# Page setup: set the browser-tab title and the wide layout, then render the heading.
	st.set_page_config(
	    page_title="Synthetic Car Price Prediction (KNN)",
	    layout="wide",
	)
	st.title("Synthetic Car Price Prediction — KNN (Streamlit)")

	# ----------------------
	# Synthetic data generator
	# ----------------------
	def generate_synthetic_cars(n_samples=2000, random_state=42):
	    """Build a DataFrame of randomly generated car listings with a derived price.

	    Every attribute is drawn from a NumPy ``Generator`` seeded with
	    ``random_state``, so identical arguments reproduce the same table.
	    ``Price`` is the sum of per-attribute contributions (manufacturer,
	    production year, engine volume, category, airbags, leather, fuel,
	    mileage, levy) plus Gaussian noise, floored at 1000.

	    Args:
	        n_samples: Number of rows to generate. Zero (or a negative value)
	            yields an empty frame that still carries every column.
	        random_state: Seed handed to ``np.random.default_rng``.

	    Returns:
	        pandas.DataFrame with 16 feature columns followed by ``Price``.
	    """
	    rng = np.random.default_rng(random_state)

	    # Fixed column order; also guarantees the columns exist when no rows are generated.
	    columns = [
	        'Levy', 'Manufacturer', 'Model', 'Prod. year', 'Category',
	        'Leather interior', 'Fuel type', 'Engine volume', 'Mileage',
	        'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
	        'Color', 'Airbags', 'Price',
	    ]

	    manufacturers = ['TOYOTA', 'HONDA', 'HYUNDAI', 'FORD', 'BMW', 'AUDI', 'KIA', 'HUNTER']
	    models_by_make = {
	        'TOYOTA': ['Corolla', 'Camry', 'Prius'],
	        'HONDA': ['Civic', 'Accord', 'City'],
	        'HYUNDAI': ['i20', 'Elantra', 'Creta'],
	        'FORD': ['Figo', 'Focus', 'Mustang'],
	        'BMW': ['3 Series', '5 Series'],
	        'AUDI': ['A4', 'A6'],
	        'KIA': ['Seltos', 'Sonet'],
	        'HUNTER': ['Classic', 'Cruiser'],
	    }
	    categories = ['Hatchback', 'Sedan', 'SUV', 'Coupe']
	    leather_opts = ['Yes', 'No']
	    fuels = ['Petrol', 'Diesel', 'Hybrid', 'Electric']
	    gearbox = ['Manual', 'Automatic']
	    drive = ['Front', 'Rear', 'All']
	    # Door labels are kept verbatim as opaque strings
	    # (they look like spreadsheet-mangled door ranges — TODO confirm).
	    doors = ['02-Jan', '04-May', '05-Oct']
	    wheels = ['Left wheel', 'Right wheel']
	    colors = ['White', 'Black', 'Silver', 'Red', 'Blue']

	    # Pricing tables, built once instead of on every row.
	    # Manufacturer multiplier applied to a 10000 base amount.
	    prestige = {
	        'TOYOTA': 1.0, 'HONDA': 1.0, 'HYUNDAI': 0.9, 'FORD': 0.95,
	        'BMW': 2.2, 'AUDI': 2.0, 'KIA': 0.9, 'HUNTER': 1.1,
	    }
	    category_bump = {'SUV': 5000, 'Coupe': 3000}
	    fuel_bump = {'Hybrid': 2500, 'Electric': 8000}

	    data = []
	    for _ in range(n_samples):
	        # NOTE: the order of the rng calls below fixes the output for a given
	        # seed; reordering them changes the generated dataset.
	        make = rng.choice(manufacturers)
	        model = rng.choice(models_by_make[make])
	        prod_year = int(rng.integers(2005, 2025))  # upper bound exclusive: 2005..2024
	        mileage = int(abs(rng.normal(loc=50000, scale=30000)))
	        engine_vol = float(np.round(rng.choice([1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0]), 1))
	        cylinders = int(rng.choice([3, 4, 6, 8]))
	        airbags = int(rng.choice([2, 4, 6, 8, 10]))
	        levy = float(np.round(np.abs(rng.normal(loc=500.0, scale=1200.0)), 2))
	        category = rng.choice(categories)
	        leather = rng.choice(leather_opts, p=[0.25, 0.75])
	        fuel = rng.choice(fuels, p=[0.6, 0.25, 0.1, 0.05])
	        gear = rng.choice(gearbox, p=[0.6, 0.4])
	        dr = rng.choice(drive, p=[0.7, 0.15, 0.15])
	        door = rng.choice(doors)
	        wheel = rng.choice(wheels)
	        color = rng.choice(colors)

	        # Accumulate the price contributions in a fixed order.
	        base_price = 0.0
	        base_price += 10000 * prestige.get(make, 1.0)
	        base_price += (prod_year - 2000) * 500      # newer cars cost more
	        base_price += engine_vol * 1500
	        base_price += category_bump.get(category, 0)
	        base_price += airbags * 100
	        if leather == 'Yes':
	            base_price += 1500
	        base_price += fuel_bump.get(fuel, 0)
	        base_price -= (mileage / 1000) * 300        # depreciation: 300 per 1000 km
	        base_price += levy * 1.0

	        # Add noise, then floor so the price never drops below 1000.
	        noise = rng.normal(loc=0.0, scale=2000.0)
	        price = max(1000.0, base_price + noise)

	        data.append({
	            'Levy': round(levy, 2),
	            'Manufacturer': make,
	            'Model': model,
	            'Prod. year': prod_year,
	            'Category': category,
	            'Leather interior': leather,
	            'Fuel type': fuel,
	            'Engine volume': engine_vol,
	            'Mileage': mileage,
	            'Cylinders': cylinders,
	            'Gear box type': gear,
	            'Drive wheels': dr,
	            'Doors': door,
	            'Wheel': wheel,
	            'Color': color,
	            'Airbags': airbags,
	            'Price': round(price, 2),
	        })

	    return pd.DataFrame(data, columns=columns)

	# ----------------------
	# Sidebar: options
	# ----------------------
	st.sidebar.header("Synthetic dataset & model controls")
	n_samples = st.sidebar.slider(
	    "Number of synthetic samples", min_value=200, max_value=20000, value=2000, step=100
	)
	# Seeds are floored at 0: np.random.default_rng rejects negative seeds, and the
	# dataset below is generated outside any try/except, so a negative value
	# would abort the whole script run.
	seed = st.sidebar.number_input("Random seed", min_value=0, value=42, step=1)
	k_neighbors = st.sidebar.number_input(
	    "K (n_neighbors for KNN)", min_value=1, max_value=100, value=5, step=1
	)
	# The slider works in whole percent; convert to the fraction train_test_split receives.
	test_size = st.sidebar.slider("Test size (%)", min_value=5, max_value=50, value=20, step=5) / 100.0
	random_state = st.sidebar.number_input(
	    "Random state (train/test split)", min_value=0, value=42, step=1
	)

	regen = st.sidebar.button("Regenerate dataset")
	run_train = st.sidebar.button("Train Model")

	# The dataset lives in session state so reruns reuse it; it is rebuilt only
	# on the first run or when "Regenerate dataset" is pressed.
	if 'synthetic_df' not in st.session_state or regen:
	    st.session_state['synthetic_df'] = generate_synthetic_cars(
	        n_samples=n_samples, random_state=int(seed)
	    )

	df = st.session_state['synthetic_df']

	st.subheader("Synthetic dataset preview")
	st.dataframe(df.head(10))

	# ----------------------
	# Prepare features + types
	# ----------------------
	# Columns the model is designed around, split by how they are preprocessed.
	expected_numerical = [
	    'Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags',
	]
	expected_categorical = [
	    'Manufacturer', 'Model', 'Category', 'Leather interior', 'Fuel type',
	    'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color',
	]

	# Keep only the expected columns that are actually present in the frame.
	available_columns = set(df.columns)
	numerical_features = [name for name in expected_numerical if name in available_columns]
	categorical_features = [name for name in expected_categorical if name in available_columns]

	st.markdown(f"Numerical features used: {numerical_features}")
	st.markdown(f"Categorical features used: {categorical_features}")

	# Rows without a target cannot be used for training or scoring.
	df = df.dropna(subset=['Price'])

	# Target is the 'Price' column; everything else is a candidate feature.
	y = df['Price']
	X = df.drop(columns='Price')

	# ----------------------
	# Preprocessor & Pipeline (robust to sklearn version)
	# ----------------------
	# Numeric columns: fill gaps with the column median, then standardise.
	numeric_steps = [
	    ('imputer', SimpleImputer(strategy='median')),
	    ('scaler', StandardScaler()),
	]
	numerical_transformer = Pipeline(steps=numeric_steps)

	# Probe which dense-output keyword this scikit-learn build accepts:
	# try `sparse_output` first and fall back to `sparse` if the constructor
	# rejects it with a TypeError.
	ohe_kwargs = {'handle_unknown': 'ignore'}
	try:
	    OneHotEncoder(sparse_output=False, **ohe_kwargs)
	except TypeError:
	    ohe_kwargs['sparse'] = False
	else:
	    ohe_kwargs['sparse_output'] = False

	# Categorical columns: fill gaps with the literal 'missing', then one-hot encode.
	categorical_steps = [
	    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
	    ('onehot', OneHotEncoder(**ohe_kwargs)),
	]
	categorical_transformer = Pipeline(steps=categorical_steps)

	# Route each feature group through its transformer; any other column is dropped.
	preprocessor = ColumnTransformer(
	    transformers=[
	        ('num', numerical_transformer, numerical_features),
	        ('cat', categorical_transformer, categorical_features),
	    ],
	    remainder='drop',
	)

	# Full model: preprocessing followed by a KNN regressor with the sidebar's K.
	knn_regressor = KNeighborsRegressor(n_neighbors=int(k_neighbors))
	knn_pipeline = Pipeline(steps=[
	    ('preprocessor', preprocessor),
	    ('regressor', knn_regressor),
	])

	# ----------------------
	# Train / Evaluate
	# ----------------------
	# Runs only on the script pass triggered by the "Train Model" button.
	if run_train:
	    st.subheader("Training the KNN model on synthetic data")
	    try:
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=test_size, random_state=int(random_state)
	        )

	        # Cap K at the number of training rows and tell the user when we do.
	        max_k = max(1, len(X_train))
	        chosen_k = int(k_neighbors)
	        if chosen_k > max_k:
	            st.warning(f"Requested k={chosen_k} is larger than number of training samples ({max_k}). Using k={max_k}.")
	            chosen_k = max_k
	        knn_pipeline.set_params(regressor__n_neighbors=chosen_k)

	        with st.spinner("Fitting model..."):
	            knn_pipeline.fit(X_train, y_train)
	        st.success("Training complete.")

	        # Evaluate on the held-out split.
	        y_pred = knn_pipeline.predict(X_test)
	        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
	        r2 = r2_score(y_test, y_pred)

	        st.metric("RMSE", f"${rmse:,.2f}")
	        st.metric("R² score", f"{r2:.4f}")

	        st.write("Prediction vs Actual (first 20 rows):")
	        compare_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred}).reset_index(drop=True).head(20)
	        st.dataframe(compare_df)

	        fig, ax = plt.subplots()
	        try:
	            ax.scatter(y_test, y_pred, alpha=0.5)
	            try:
	                # Dashed y = x reference line spanning the observed value range.
	                maxv = max(np.nanmax(y_test.values), np.nanmax(y_pred))
	                minv = min(np.nanmin(y_test.values), np.nanmin(y_pred))
	                ax.plot([minv, maxv], [minv, maxv], linestyle='--')
	            except Exception:
	                # The reference line is optional: keep the plot, but leave a
	                # trace in the server log instead of swallowing the error.
	                print("Could not draw the reference diagonal:", file=sys.stderr)
	                traceback.print_exc()
	            ax.set_xlabel("Actual Price")
	            ax.set_ylabel("Predicted Price")
	            ax.set_title("Actual vs Predicted")
	            st.pyplot(fig)
	        finally:
	            # Figures created through pyplot stay registered until closed;
	            # without this every training run would leak one figure.
	            plt.close(fig)

	        # Keep the fitted pipeline and its feature lists for the prediction sections below.
	        st.session_state['model_pipeline'] = knn_pipeline
	        st.session_state['numerical_features'] = numerical_features
	        st.session_state['categorical_features'] = categorical_features
	        st.success("Model saved in session state — use the form below for single predictions.")
	    except Exception as e:
	        # Top-level UI boundary: show the error in the app and log the traceback.
	        st.error(f"Training failed: {e}")
	        print("Training exception:", file=sys.stderr)
	        traceback.print_exc()

	# ----------------------
	# Single-sample prediction form
	# ----------------------
	def _parse_numeric_field(raw, strip_commas=True):
	    """Convert one free-text numeric entry into a float.

	    Args:
	        raw: Text typed by the user.
	        strip_commas: When true, commas are removed before parsing
	            (treated as thousands separators).

	    Returns:
	        ``(value, ok)``. A blank entry gives ``(nan, True)``; text that does
	        not parse to a finite number gives ``(nan, False)``.
	    """
	    text = raw.strip()
	    if text == "":
	        return np.nan, True
	    if strip_commas:
	        text = text.replace(',', '')
	    try:
	        value = float(text)
	    except ValueError:
	        return np.nan, False
	    if not np.isfinite(value):
	        return np.nan, False
	    return value, True


	st.subheader("Single-sample prediction")

	if 'model_pipeline' not in st.session_state:
	    st.info("Train the model first (click 'Train Model' in the sidebar).")
	else:
	    model_pipeline = st.session_state['model_pipeline']
	    num_feats = st.session_state.get('numerical_features', [])
	    cat_feats = st.session_state.get('categorical_features', [])

	    with st.form("single_predict_form"):
	        st.write("Fill feature values for a single car (leave numeric blank to use median-imputed value).")
	        form_vals = {}
	        for f in num_feats:
	            form_vals[f] = st.text_input(f"{f} (numeric)", value="")
	        for f in cat_feats:
	            form_vals[f] = st.text_input(f"{f} (categorical)", value="")
	        submitted = st.form_submit_button("Predict")

	    if submitted:
	        sample = {}
	        rejected = []
	        for f in num_feats:
	            # 'Engine volume' keeps its commas, so an entry such as "1,8" is
	            # rejected rather than silently read as 18.
	            value, ok = _parse_numeric_field(form_vals[f], strip_commas=(f != 'Engine volume'))
	            if not ok:
	                rejected.append(f)
	            elif f == 'Mileage' and not np.isnan(value):
	                # Mileage is stored as a whole number (fraction truncated).
	                value = int(value)
	            sample[f] = value
	        for f in cat_feats:
	            v = form_vals[f].strip()
	            # A blank category uses the same 'missing' placeholder as the pipeline's imputer.
	            sample[f] = v if v != "" else "missing"

	        if rejected:
	            # Tell the user which entries were ignored instead of imputing them silently.
	            st.warning(
	                f"Could not read a number from: {', '.join(rejected)}. "
	                "The median-imputed value is used for these fields."
	            )

	        sample_df = pd.DataFrame([sample])
	        try:
	            pred = model_pipeline.predict(sample_df)[0]
	            st.success(f"Predicted Price: ${pred:,.2f}")
	        except Exception as e:
	            # Top-level UI boundary: show the error in the app and log the traceback.
	            st.error(f"Prediction failed: {e}")
	            print("Prediction exception:", file=sys.stderr)
	            traceback.print_exc()

	# ----------------------
	# Quick example prediction
	# ----------------------
	st.subheader("Quick example prediction (auto-filled)")

	if st.button("Run toy example prediction"):
	    # One hand-written car, stored as single-element columns.
	    example = {
	        'Levy': [1000.0],
	        'Manufacturer': ['TOYOTA'],
	        'Model': ['Prius'],
	        'Prod. year': [2018],
	        'Category': ['Hatchback'],
	        'Leather interior': ['Yes'],
	        'Fuel type': ['Hybrid'],
	        'Engine volume': [1.8],
	        'Mileage': [50000],
	        'Cylinders': [4.0],
	        'Gear box type': ['Automatic'],
	        'Drive wheels': ['Front'],
	        'Doors': ['04-May'],
	        'Wheel': ['Left wheel'],
	        'Color': ['White'],
	        'Airbags': [10],
	    }

	    # Line the example up with the feature columns of X; a column the example
	    # lacks gets NaN (numeric) or 'missing' (anything else).
	    example_columns = {
	        col: example[col] if col in example
	        else ([np.nan] if col in numerical_features else ['missing'])
	        for col in X.columns
	    }
	    example_df = pd.DataFrame(example_columns)

	    if 'model_pipeline' not in st.session_state:
	        st.info("Train the model first to run the example prediction.")
	    else:
	        try:
	            pred = st.session_state['model_pipeline'].predict(example_df)[0]
	            st.success(f"Example Predicted Price: ${pred:,.2f}")
	        except Exception as e:
	            # Show the failure in the app and log the traceback to stderr.
	            st.error(f"Example prediction failed: {e}")
	            print("Example prediction exception:", file=sys.stderr)
	            traceback.print_exc()

	# Footer: a markdown rule followed by a short disclaimer caption.
	st.markdown("---")
	footer_note = (
	    "This app generates synthetic car data and trains a KNN regressor. "
	    "For production use, replace the synthetic generator with your real dataset "
	    "and consider tree-based models for better performance."
	)
	st.caption(footer_note)