# (Removed Hugging Face Spaces page-scrape residue: "Spaces / Sleeping / Sleeping")
| # streamlit_app.py | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import traceback | |
| import sys | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.neighbors import KNeighborsRegressor | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
# Page chrome: set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Synthetic Car Price Prediction (KNN)", layout="wide")
st.title("Synthetic Car Price Prediction — KNN (Streamlit)")
| # ---------------------- | |
| # Synthetic data generator | |
| # ---------------------- | |
def generate_synthetic_cars(n_samples=2000, random_state=42):
    """Generate a reproducible synthetic used-car dataset.

    The price target is a deterministic function of the sampled features
    (manufacturer prestige, year, engine volume, category, airbags, leather,
    fuel type, mileage, levy) plus Gaussian noise, floored at 1000.

    Parameters
    ----------
    n_samples : int, default 2000
        Number of rows to generate.
    random_state : int, default 42
        Seed for ``np.random.default_rng``; the same seed yields the same frame.

    Returns
    -------
    pandas.DataFrame
        One row per car with columns mirroring a typical listing dataset
        ('Levy', 'Manufacturer', 'Model', 'Prod. year', ..., 'Price'),
        where 'Price' is the regression target.
    """
    rng = np.random.default_rng(random_state)
    manufacturers = ['TOYOTA', 'HONDA', 'HYUNDAI', 'FORD', 'BMW', 'AUDI', 'KIA', 'HUNTER']  # added HUNTER to match user context
    models_by_make = {
        'TOYOTA': ['Corolla', 'Camry', 'Prius'],
        'HONDA': ['Civic', 'Accord', 'City'],
        'HYUNDAI': ['i20', 'Elantra', 'Creta'],
        'FORD': ['Figo', 'Focus', 'Mustang'],
        'BMW': ['3 Series', '5 Series'],
        'AUDI': ['A4', 'A6'],
        'KIA': ['Seltos', 'Sonet'],
        'HUNTER': ['Classic', 'Cruiser']
    }
    categories = ['Hatchback', 'Sedan', 'SUV', 'Coupe']
    leather_opts = ['Yes', 'No']
    fuels = ['Petrol', 'Diesel', 'Hybrid', 'Electric']
    gearbox = ['Manual', 'Automatic']
    drive = ['Front', 'Rear', 'All']
    doors = ['02-Jan', '04-May', '05-Oct']  # keep string-like options (Excel-style mangled door counts, intentional)
    wheels = ['Left wheel', 'Right wheel']
    colors = ['White', 'Black', 'Silver', 'Red', 'Blue']
    # Manufacturer price multiplier (toyota/honda baseline, BMW/AUDI premium).
    # Hoisted out of the loop: it is loop-invariant and was rebuilt every iteration.
    prestige = {'TOYOTA': 1.0, 'HONDA': 1.0, 'HYUNDAI': 0.9, 'FORD': 0.95,
                'BMW': 2.2, 'AUDI': 2.0, 'KIA': 0.9, 'HUNTER': 1.1}
    data = []
    for _ in range(n_samples):
        # Sample the raw features.  NOTE: the order of rng calls is part of the
        # reproducibility contract — do not reorder.
        make = rng.choice(manufacturers)
        model = rng.choice(models_by_make[make])
        prod_year = int(rng.integers(2005, 2025))  # years between 2005 and 2024
        mileage = int(abs(rng.normal(loc=50000, scale=30000)))  # folded normal; some can be high
        engine_vol = float(np.round(rng.choice([1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0]), 1))
        cylinders = int(rng.choice([3, 4, 6, 8]))
        airbags = int(rng.choice([2, 4, 6, 8, 10]))
        levy = float(np.round(np.abs(rng.normal(loc=500.0, scale=1200.0)), 2))  # tax/levy-like amount
        category = rng.choice(categories)
        leather = rng.choice(leather_opts, p=[0.25, 0.75])
        fuel = rng.choice(fuels, p=[0.6, 0.25, 0.1, 0.05])
        gear = rng.choice(gearbox, p=[0.6, 0.4])
        dr = rng.choice(drive, p=[0.7, 0.15, 0.15])
        door = rng.choice(doors)
        wheel = rng.choice(wheels)
        color = rng.choice(colors)
        # Base price influenced by: maker prestige, engine size, year, mileage,
        # fuel type, airbags, leather, category.
        base_price = 0.0
        base_price += 10000 * prestige.get(make, 1.0)
        # newer car adds price
        base_price += (prod_year - 2000) * 500
        # engine volume adds
        base_price += engine_vol * 1500
        # category price bump
        if category == 'SUV':
            base_price += 5000
        elif category == 'Coupe':
            base_price += 3000
        # airbags, leather
        base_price += airbags * 100
        if leather == 'Yes':
            base_price += 1500
        # fuel adjustments
        if fuel == 'Hybrid':
            base_price += 2500
        if fuel == 'Electric':
            base_price += 8000
        # mileage depreciation
        base_price -= (mileage / 1000) * 300  # 300 per 1000 km
        # levy (add if present)
        base_price += levy * 1.0
        # add noise, then floor so price is never negative / implausibly low
        noise = rng.normal(loc=0.0, scale=2000.0)
        price = max(1000.0, base_price + noise)
        data.append({
            'Levy': round(levy, 2),
            'Manufacturer': make,
            'Model': model,
            'Prod. year': prod_year,
            'Category': category,
            'Leather interior': leather,
            'Fuel type': fuel,
            'Engine volume': engine_vol,
            'Mileage': mileage,
            'Cylinders': cylinders,
            'Gear box type': gear,
            'Drive wheels': dr,
            'Doors': door,
            'Wheel': wheel,
            'Color': color,
            'Airbags': airbags,
            'Price': round(price, 2)
        })
    df = pd.DataFrame(data)
    return df
# ----------------------
# Sidebar: options
# ----------------------
st.sidebar.header("Synthetic dataset & model controls")
# Dataset / model hyperparameter widgets; values are re-read on every rerun.
n_samples = st.sidebar.slider("Number of synthetic samples", min_value=200, max_value=20000, value=2000, step=100)
seed = st.sidebar.number_input("Random seed", value=42, step=1)
k_neighbors = st.sidebar.number_input("K (n_neighbors for KNN)", min_value=1, max_value=100, value=5, step=1)
# Slider is in percent; convert to the fraction expected by train_test_split.
test_size = st.sidebar.slider("Test size (%)", min_value=5, max_value=50, value=20, step=5) / 100.0
random_state = st.sidebar.number_input("Random state (train/test split)", value=42, step=1)
regen = st.sidebar.button("Regenerate dataset")
run_train = st.sidebar.button("Train Model")
# Generate dataset (or regenerate).  The frame is cached in session_state so it
# survives reruns; changing n_samples/seed takes effect only after "Regenerate".
if 'synthetic_df' not in st.session_state or regen:
    st.session_state['synthetic_df'] = generate_synthetic_cars(n_samples=n_samples, random_state=seed)
df = st.session_state['synthetic_df']
st.subheader("Synthetic dataset preview")
st.dataframe(df.head(10))
# ----------------------
# Prepare features + types
# ----------------------
# Expected columns; intersecting with df.columns keeps this robust if the
# generator is later swapped for a real dataset with a different schema.
expected_numerical = ['Levy', 'Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']
expected_categorical = ['Manufacturer', 'Model', 'Category', 'Leather interior',
                        'Fuel type', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color']
numerical_features = [c for c in expected_numerical if c in df.columns]
categorical_features = [c for c in expected_categorical if c in df.columns]
st.markdown(f"**Numerical features used:** {numerical_features}")
st.markdown(f"**Categorical features used:** {categorical_features}")
# Drop rows with missing target (shouldn't be any in synthetic data).
df = df.dropna(subset=['Price'])
X = df.drop('Price', axis=1)
y = df['Price']
# ----------------------
# Preprocessor & Pipeline (robust to sklearn version)
# ----------------------
# Numeric columns: median-impute missing values, then standardize (KNN is
# distance-based, so feature scaling matters).
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# OneHotEncoder renamed `sparse` -> `sparse_output` in sklearn 1.2; probe by
# constructing a throwaway encoder and fall back on TypeError for old versions.
ohe_kwargs = dict(handle_unknown='ignore')
try:
    OneHotEncoder(sparse_output=False, **ohe_kwargs)
    ohe_kwargs['sparse_output'] = False
except TypeError:
    ohe_kwargs['sparse'] = False
# Categorical columns: fill missing with a sentinel, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(**ohe_kwargs))
])
# remainder='drop' discards any column not listed in either feature set.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor(n_neighbors=int(k_neighbors)))
])
# ----------------------
# Train / Evaluate
# ----------------------
if run_train:
    st.subheader("Training the KNN model on synthetic data")
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=int(random_state))
        # KNN requires n_neighbors <= number of training samples; clamp if needed.
        max_k = max(1, len(X_train))
        chosen_k = int(k_neighbors)
        if chosen_k > max_k:
            st.warning(f"Requested k={chosen_k} is larger than number of training samples ({max_k}). Using k={max_k}.")
            chosen_k = max_k
        knn_pipeline.set_params(regressor__n_neighbors=chosen_k)
        with st.spinner("Fitting model..."):
            knn_pipeline.fit(X_train, y_train)
        st.success("Training complete.")
        # Evaluate on the held-out split.  RMSE is taken as sqrt(MSE) so this
        # works on sklearn versions without the `squared=False` option.
        y_pred = knn_pipeline.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        st.metric("RMSE", f"${rmse:,.2f}")
        st.metric("R² score", f"{r2:.4f}")
        st.write("Prediction vs Actual (first 20 rows):")
        compare_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred}).reset_index(drop=True).head(20)
        st.dataframe(compare_df)
        # Scatter of actual vs predicted, with a y=x reference line when finite
        # bounds can be computed (guarded: NaNs/empty arrays would raise).
        fig, ax = plt.subplots()
        ax.scatter(y_test, y_pred, alpha=0.5)
        try:
            maxv = max(np.nanmax(y_test.values), np.nanmax(y_pred))
            minv = min(np.nanmin(y_test.values), np.nanmin(y_pred))
            ax.plot([minv, maxv], [minv, maxv], linestyle='--')
        except Exception:
            pass
        ax.set_xlabel("Actual Price")
        ax.set_ylabel("Predicted Price")
        ax.set_title("Actual vs Predicted")
        st.pyplot(fig)
        # Save to session for later prediction (fitted pipeline + feature lists).
        st.session_state['model_pipeline'] = knn_pipeline
        st.session_state['numerical_features'] = numerical_features
        st.session_state['categorical_features'] = categorical_features
        st.success("Model saved in session state — use the form below for single predictions.")
    except Exception as e:
        # Surface the error in the UI and dump the traceback to stderr for logs.
        st.error(f"Training failed: {e}")
        print("Training exception:", file=sys.stderr)
        traceback.print_exc()
# ----------------------
# Single-sample prediction form
# ----------------------
st.subheader("Single-sample prediction")
if 'model_pipeline' not in st.session_state:
    st.info("Train the model first (click 'Train Model' in the sidebar).")
else:
    model_pipeline = st.session_state['model_pipeline']
    num_feats = st.session_state.get('numerical_features', [])
    cat_feats = st.session_state.get('categorical_features', [])
    with st.form("single_predict_form"):
        st.write("Fill feature values for a single car (leave numeric blank to use median-imputed value).")
        # Free-text inputs for every feature; parsing/validation happens on submit.
        form_vals = {}
        for f in num_feats:
            form_vals[f] = st.text_input(f"{f} (numeric)", value="")
        for f in cat_feats:
            form_vals[f] = st.text_input(f"{f} (categorical)", value="")
        submitted = st.form_submit_button("Predict")
    if submitted:
        # Build a one-row frame matching the training schema.
        sample = {}
        for f in num_feats:
            v = form_vals[f].strip()
            if v == "":
                # Blank -> NaN so the pipeline's median imputer fills it.
                sample[f] = np.nan
            else:
                try:
                    if f == 'Mileage':
                        # Allow "50,000"-style input; cast to int like the training data.
                        sample[f] = int(float(str(v).replace(',', '')))
                    elif f == 'Engine volume':
                        sample[f] = float(v)
                    else:
                        sample[f] = float(str(v).replace(',', ''))
                except Exception:
                    # Unparseable input degrades to NaN (errors='coerce').
                    sample[f] = pd.to_numeric(v, errors='coerce')
        for f in cat_feats:
            v = form_vals[f].strip()
            # Blank categorical -> the same 'missing' sentinel the imputer uses.
            sample[f] = v if v != "" else "missing"
        sample_df = pd.DataFrame([sample])
        try:
            pred = model_pipeline.predict(sample_df)[0]
            st.success(f"Predicted Price: ${pred:,.2f}")
        except Exception as e:
            st.error(f"Prediction failed: {e}")
            print("Prediction exception:", file=sys.stderr)
            traceback.print_exc()
# ----------------------
# Quick example prediction
# ----------------------
st.subheader("Quick example prediction (auto-filled)")
if st.button("Run toy example prediction"):
    # Hard-coded plausible car; values are lists so the dict converts directly
    # into a one-row DataFrame.
    example = {
        'Levy': [1000.0],
        'Manufacturer': ['TOYOTA'],
        'Model': ['Prius'],
        'Prod. year': [2018],
        'Category': ['Hatchback'],
        'Leather interior': ['Yes'],
        'Fuel type': ['Hybrid'],
        'Engine volume': [1.8],
        'Mileage': [50000],
        'Cylinders': [4.0],
        'Gear box type': ['Automatic'],
        'Drive wheels': ['Front'],
        'Doors': ['04-May'],
        'Wheel': ['Left wheel'],
        'Color': ['White'],
        'Airbags': [10]
    }
    # Align to the training columns; anything missing from the example gets the
    # same fill values the pipeline imputers expect (NaN / 'missing').
    example_df = {}
    for col in X.columns:
        if col in example:
            example_df[col] = example[col]
        else:
            example_df[col] = [np.nan] if col in numerical_features else ['missing']
    example_df = pd.DataFrame(example_df)
    if 'model_pipeline' in st.session_state:
        try:
            pred = st.session_state['model_pipeline'].predict(example_df)[0]
            st.success(f"Example Predicted Price: ${pred:,.2f}")
        except Exception as e:
            st.error(f"Example prediction failed: {e}")
            print("Example prediction exception:", file=sys.stderr)
            traceback.print_exc()
    else:
        st.info("Train the model first to run the example prediction.")
st.markdown("---")
st.caption("This app generates synthetic car data and trains a KNN regressor. For production use, replace the synthetic generator with your real dataset and consider tree-based models for better performance.")