File size: 2,681 Bytes
08dae5c
6a2423a
0e4dfa8
 
 
 
531c80d
0e4dfa8
08dae5c
 
 
4fd4e56
08dae5c
 
 
 
4ad2739
 
 
 
 
4241a4b
08dae5c
0e4dfa8
08dae5c
 
 
 
2c170d3
0e4dfa8
 
2c170d3
08dae5c
 
2c170d3
08dae5c
 
 
 
0e4dfa8
08dae5c
0e4dfa8
08dae5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e4dfa8
08dae5c
 
 
0e4dfa8
08dae5c
 
0e4dfa8
08dae5c
 
 
 
 
 
 
 
 
0e4dfa8
08dae5c
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datasets import load_dataset

# -------------------------- Title --------------------------
# Page header: the app title followed by a one-line description of the model
# and dataset used below.
st.title("🍷 Wine Quality Prediction")
st.write("Using Random Forest on the famous Wine Quality dataset")

# -------------------------- Load Data --------------------------
@st.cache_data
def get_data():
    """Load the wine-quality dataset and return it as a pandas object.

    Fetches ``codesignal/wine-quality`` with ``load_dataset`` and converts the
    first split it lists via ``to_pandas()``, so the function does not depend
    on a particular split name. The function is wrapped in ``st.cache_data``.

    Returns:
        The first listed split converted with ``to_pandas()``.

    Raises:
        IndexError: if the loaded dataset exposes no splits.
    """
    ds = load_dataset("codesignal/wine-quality")
    # Pick whichever split comes first instead of hard-coding a split name.
    split_names = list(ds.keys())
    return ds[split_names[0]].to_pandas()

df = get_data()
st.write("Dataset loaded! Here's a preview:")
st.dataframe(df.head())

# -------------------------- Preprocessing --------------------------
# Features are every column except the target; per the original note, the
# dataset has no "Id" column that would also need dropping.
X = df.drop(columns="quality")

# Binary target: 1 = good (quality >= 6), 0 = bad (quality < 6).
y = df["quality"].ge(6).astype(int)

# Hold out 20% of the rows for testing, stratified on the binary target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------- Train Model --------------------------
@st.cache_resource
def train_model():
    """Fit a random-forest classifier on the scaled training data.

    Takes no arguments: it reads the module-level ``X_train_scaled`` and
    ``y_train``. The function is wrapped in ``st.cache_resource``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
    classifier = RandomForestClassifier(**forest_params)
    classifier.fit(X_train_scaled, y_train)
    return classifier

model = train_model()

# Score the trained model on the held-out test split.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

st.success(f"Model Accuracy: *{accuracy:.4f}* ({accuracy*100:.2f}%)")

# -------------------------- Interactive Prediction --------------------------
st.header("Predict quality of a new wine")
cols = st.columns(3)

features = X.columns.tolist()
input_data = {}

# One slider per feature, dealt round-robin across the three columns. Each
# slider spans the feature's observed min..max and starts at its mean.
for i, feature in enumerate(features):
    feature_values = X[feature]
    with cols[i % 3]:
        input_data[feature] = st.slider(
            feature,
            float(feature_values.min()),
            float(feature_values.max()),
            float(feature_values.mean()),
        )

if st.button("Predict Quality"):
    # Single-row frame with the same columns the scaler was fitted on.
    input_df = pd.DataFrame([input_data])
    input_scaled = scaler.transform(input_df)
    pred = model.predict(input_scaled)[0]
    prob = model.predict_proba(input_scaled)[0]

    if pred != 1:
        st.error(f"*Not great wine* 😢 (confidence: {prob[0]:.2%})")
    else:
        st.balloons()
        st.success(f"*Good wine!* 🍾 (confidence: {prob[1]:.2%})")