"""Streamlit demo: binary wine-quality classification with a random forest.

The script loads the ``codesignal/wine-quality`` dataset, turns the
``quality`` column into a good/bad label, trains a ``RandomForestClassifier``
on standardized features, reports test accuracy, and offers sliders for
predicting the label of a user-described wine.
"""

import pandas as pd
import streamlit as st
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# -------------------------- Title --------------------------
st.title("🍷 Wine Quality Prediction")
st.write("Using Random Forest on the famous Wine Quality dataset")


# -------------------------- Load Data --------------------------
@st.cache_data
def get_data():
    """Load the wine-quality dataset and return its first split as a DataFrame.

    The split name is not hard-coded: whichever split the dataset lists
    first is used, so the loader does not depend on a split called
    ``"train"`` or ``"test"`` being present.
    """
    ds = load_dataset("codesignal/wine-quality")
    first_split = next(iter(ds))
    return ds[first_split].to_pandas()


df = get_data()
st.write("Dataset loaded! Here's a preview:")
st.dataframe(df.head())

# -------------------------- Preprocessing --------------------------
# Only the target is dropped; the dataset has no "Id" column to remove.
X = df.drop("quality", axis=1)
y = df["quality"]

# Make it binary classification: good (≥6) vs bad (<6)
y = (y >= 6).astype(int)

# Train-test split (stratified so both sets keep the good/bad ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: fit on the training set only, then reuse for test/user input
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# -------------------------- Train Model --------------------------
@st.cache_resource
def train_model():
    """Fit and return the random forest on the scaled training data.

    Reads the module-level ``X_train_scaled`` and ``y_train``. The function
    takes no arguments, so the cached model is reused across script reruns.
    """
    model = RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_train_scaled, y_train)
    return model


model = train_model()

# Predictions & accuracy
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

st.success(f"Model Accuracy: *{accuracy:.4f}* ({accuracy*100:.2f}%)")

# -------------------------- Interactive Prediction --------------------------
st.header("Predict quality of a new wine")

cols = st.columns(3)
input_data = {}

features = X.columns.tolist()
for i, feature in enumerate(features):
    # Distribute the sliders round-robin over the three columns.
    with cols[i % 3]:
        # Slider range is the observed min/max; it starts at the mean.
        val = st.slider(
            feature,
            float(X[feature].min()),
            float(X[feature].max()),
            float(X[feature].mean()),
        )
        input_data[feature] = val

if st.button("Predict Quality"):
    # Pin the column order to the one the scaler was fitted on.
    input_df = pd.DataFrame([input_data], columns=features)
    input_scaled = scaler.transform(input_df)
    pred = model.predict(input_scaled)[0]
    prob = model.predict_proba(input_scaled)[0]

    if pred == 1:
        st.balloons()
        st.success(f"*Good wine!* 🍾 (confidence: {prob[1]:.2%})")
    else:
        st.error(f"*Not great wine* 😢 (confidence: {prob[0]:.2%})")