# wine-ml-app / app.py
# Author: henriquebap
# Commit c92b8d9 - fix(app): make predict robust to feature count mismatches, avoid ValueError
import os
import pandas as pd
import numpy as np
from huggingface_hub import hf_hub_download
from sklearn.ensemble import RandomForestRegressor
import gradio as gr
from pathlib import Path
import joblib
# --- Dataset location (replace with your own Hub user / dataset repo) ---
HF_DATASET_REPO = "henriquebap/wine-ml-dataset"
CSV_FILENAME = "WineQT.csv"

# Local path where the exported final model is looked up (and cached).
MODEL_PATH = Path("data") / "models" / "wine_quality_regressor.joblib"

# Default for the "saved model only" checkbox, read from STRICT_SAVED_ONLY.
STRICT_DEFAULT = os.environ.get("STRICT_SAVED_ONLY", "true").lower() in ("1", "true", "yes")

# --- Optional Hub fallback for the model, configured via environment ---
# Repo id, e.g. "henriquebap/wine-ml-model"; unset or blank disables it.
HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "").strip() or None
HF_MODEL_FILENAME = os.environ.get("HF_MODEL_FILENAME", "wine_quality_regressor.joblib")
# One of "model" | "dataset" | "space".
HF_MODEL_REPO_TYPE = os.environ.get("HF_MODEL_REPO_TYPE", "model")

# Module-wide state: the active estimator (None until loaded or trained) and
# the ordered feature names it is fed with.
model = None
feature_cols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
def load_data():
    """Fetch the wine CSV from the Hub dataset repo and return its clean rows.

    Only the columns named in ``feature_cols`` plus ``"quality"`` are kept
    (in the order they appear in the file); rows containing any missing value
    are dropped.
    """
    local_csv = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        filename=CSV_FILENAME,
    )
    raw = pd.read_csv(local_csv)
    # Keep the expected columns only; anything extra in the file is discarded.
    wanted = {*feature_cols, "quality"}
    kept = [name for name in raw.columns if name in wanted]
    return raw[kept].dropna()
def _unpack_model_bundle(bundle):
    """Split a joblib payload into ``(estimator, feature_names)``.

    The payload is either a dict shaped like
    ``{"model": estimator, "metadata": {"features": [...]}}`` or the estimator
    itself. ``estimator`` is ``None`` when nothing with a ``predict`` attribute
    is found; ``feature_names`` is ``None`` unless the metadata carries a
    non-empty list.
    """
    if isinstance(bundle, dict):
        estimator = bundle.get("model")
        # Tolerate an explicit ``"metadata": None`` entry.
        metadata = bundle.get("metadata") or {}
        names = metadata.get("features") if isinstance(metadata, dict) else None
    else:
        estimator, names = bundle, None
    if not hasattr(estimator, "predict"):
        estimator = None
    if not (isinstance(names, list) and names):
        names = None
    return estimator, names


def load_final_model():
    """Load the exported final model from disk, falling back to the Hub.

    Order of attempts:
      1. ``MODEL_PATH`` on local disk.
      2. ``HF_MODEL_REPO`` on the Hub (only when configured); a successful
         download is also cached at ``MODEL_PATH`` for the next start.

    On success the module-level ``model`` is replaced, and ``feature_cols`` is
    replaced too when the bundle's metadata lists its features. Both globals
    are left untouched when a payload holds no usable estimator.

    Returns:
        A status message on success, or ``None`` when no model could be
        loaded (the caller decides whether to train or report an error).
    """
    global model, feature_cols

    # 1) Try the local file first.
    if MODEL_PATH.exists():
        try:
            # NOTE(security): joblib.load unpickles arbitrary objects; only
            # point MODEL_PATH at files you trust.
            estimator, names = _unpack_model_bundle(joblib.load(MODEL_PATH))
        except Exception as e:
            print(f"[warn] Falha ao carregar modelo salvo localmente: {e}")
        else:
            if estimator is not None:
                if names is not None:
                    feature_cols = names
                model = estimator
                return "Modelo final carregado do disco."

    # 2) Fallback: download from the Hub, if a repo is configured.
    if HF_MODEL_REPO:
        try:
            remote_path = hf_hub_download(
                repo_id=HF_MODEL_REPO,
                filename=HF_MODEL_FILENAME,
                repo_type=HF_MODEL_REPO_TYPE,
                token=os.getenv("HF_TOKEN", None),
            )
            # NOTE(security): same pickle caveat as above; the configured
            # repo must be trusted.
            estimator, names = _unpack_model_bundle(joblib.load(remote_path))
        except Exception as e:
            # Not fatal here; load_or_train decides what to do based on strict.
            print(f"[warn] Falha ao baixar modelo do Hub: {e}")
        else:
            if estimator is not None:
                if names is not None:
                    feature_cols = names
                model = estimator
                # Cache locally for the next boot; failing to do so must not
                # block serving the model that was just loaded.
                try:
                    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
                    joblib.dump({"model": model, "metadata": {"features": feature_cols}}, MODEL_PATH)
                except Exception as e:
                    print(f"[warn] Falha ao salvar modelo localmente: {e}")
                return f"Modelo final baixado do Hub ({HF_MODEL_REPO_TYPE}:{HF_MODEL_REPO})."

    return None
def train():
    """Train the baseline regressor on the Hub dataset and install it.

    Fits a ``RandomForestRegressor`` (200 trees, fixed seed) on the
    ``feature_cols`` columns against ``"quality"``. The module-level ``model``
    is replaced only after ``fit`` succeeds, so a failed training run never
    leaves an unfitted estimator behind for later ``model is None`` checks.

    Returns:
        A status message with the number of rows used for training.
    """
    global model
    df = load_data()
    X = df[feature_cols]
    y = df["quality"]
    regressor = RandomForestRegressor(n_estimators=200, random_state=42)
    regressor.fit(X, y)
    # Publish only a fitted estimator.
    model = regressor
    return f"Modelo treinado com {len(df)} linhas."
def load_or_train(strict: bool = False):
    """Make a model available and return a status message describing how.

    The saved model is tried first. When none can be loaded:
      * ``strict=True``  - never train; return an error message instead.
      * ``strict=False`` - train the baseline model and return its message.
    """
    outcome = load_final_model()
    if not outcome:
        if strict:
            outcome = "Modelo salvo não encontrado. Exporte via notebook 07 e garanta o arquivo em data/models/wine_quality_regressor.joblib."
        else:
            outcome = train()
    return outcome
def predict(*values):
    """Make a single prediction from the UI inputs.

    The positional ``values`` are aligned to what the loaded model expects, so
    a persisted model trained on a different feature set than the UI defaults
    does not fail on a shape or column mismatch:

      * the expected width comes from ``n_features_in_`` (falling back to
        ``feature_names_in_``, then ``feature_cols``, then ``len(values)``);
      * surplus values are dropped, missing ones are filled with NaN;
      * a ``None`` value (an emptied number box) is treated as NaN;
      * column names are taken from the model's own ``feature_names_in_`` when
        present, because recent scikit-learn versions validate names at
        predict time when the estimator was fitted with them.

    Returns:
        The prediction formatted as ``"<value> (arredondado: <int>)"``.
    """
    if model is None:
        load_or_train()

    fitted_names = getattr(model, "feature_names_in_", None)
    expected_n = getattr(model, "n_features_in_", None)
    cols = list(feature_cols)
    if expected_n is None:
        if fitted_names is not None:
            expected_n = len(fitted_names)
        else:
            expected_n = len(cols) if cols else len(values)
    expected_n = int(expected_n)

    # Trim or pad to the expected width; None becomes NaN so the frame keeps
    # a numeric dtype.
    vals = [np.nan if v is None else v for v in values[:expected_n]]
    vals += [np.nan] * (expected_n - len(vals))

    if fitted_names is not None and len(fitted_names) == expected_n:
        # Names the estimator was fitted with; values stay positional.
        used_cols = list(fitted_names)
    elif len(cols) >= expected_n:
        used_cols = cols[:expected_n]
    else:
        # Not enough known names: fill the tail with placeholders.
        used_cols = cols + [f"f{i}" for i in range(len(cols), expected_n)]

    x = pd.DataFrame([vals], columns=used_cols)
    pred = float(model.predict(x)[0])
    return f"{pred:.2f} (arredondado: {int(round(pred))})"
def predict_batch(file: "gr.File | str | None"):
    """Score every row of an uploaded CSV.

    Args:
        file: The upload as delivered by the ``gr.File`` input: either an
            object exposing the temp-file path as ``.name`` or a plain path
            string, or ``None`` when nothing was uploaded.

    Returns:
        ``(message, frame)``. On success ``frame`` is a copy of the input with
        an added ``pred_quality`` column; on any failure it is ``None`` and
        ``message`` explains why.
    """
    # Validate the upload before doing any model work.
    if file is None:
        return "Arquivo CSV não fornecido.", None

    # Accept both a temp-file wrapper (.name) and a bare path string.
    csv_path = getattr(file, "name", file)
    try:
        df_in = pd.read_csv(csv_path)
    except Exception as e:
        return f"Falha ao ler CSV: {e}", None

    # Load lazily, and before the column check: loading may replace
    # feature_cols with the names stored alongside the model.
    if model is None:
        load_or_train()

    missing = [c for c in feature_cols if c not in df_in.columns]
    if missing:
        return f"CSV faltando colunas: {missing}", None

    try:
        preds = model.predict(df_in[feature_cols])
    except Exception as e:
        # e.g. non-numeric cells; report in the UI instead of crashing.
        return f"Falha ao gerar predições: {e}", None

    out = df_in.copy()
    out["pred_quality"] = preds
    return f"OK - {len(out)} linhas processadas.", out
def csv_template_file():
    """Write a header-only CSV listing the feature columns and return its path.

    The file is meant as an upload template: it has one column per entry in
    ``feature_cols`` and no data rows.
    """
    target = Path("wine_features_template.csv")
    header_only = pd.DataFrame(columns=feature_cols)
    header_only.to_csv(target, index=False)
    return str(target)
def sample_csv_file(n: int = 10):
    """Write a sample CSV with ``n`` dataset rows (without the quality column).

    Args:
        n: Requested number of rows; clamped to ``[1, len(dataset)]``. A falsy
            value falls back to 10.

    Returns:
        ``(path, frame)``: the path of the written CSV and the sampled rows.

    Raises:
        gr.Error: When the dataset cannot be loaded, is empty, or lacks some
            of the feature columns. The first return slot feeds a file output,
            so an error message must not be returned there as if it were a
            path.
    """
    try:
        df = load_data()
    except Exception as e:
        raise gr.Error(f"Falha ao carregar dataset para amostra: {e}") from e

    if df.empty:
        raise gr.Error("Falha ao carregar dataset para amostra: dataset vazio.")
    missing = [c for c in feature_cols if c not in df.columns]
    if missing:
        raise gr.Error(f"Falha ao carregar dataset para amostra: colunas ausentes {missing}")

    n = int(max(1, min(int(n or 10), len(df))))
    sample = df.sample(n, random_state=42)[feature_cols]
    path = Path("wine_sample.csv")
    sample.to_csv(path, index=False)
    return str(path), sample
def model_info():
    """Describe the active model for the JSON panel.

    Returns:
        A dict with ``"features"`` (the current ``feature_cols``) and, when
        the model exposes ``feature_importances_``, a
        ``"feature_importance_top"`` mapping of the ten highest-ranked
        features to their importance.
    """
    if model is None:
        load_or_train()

    info = {"features": list(feature_cols)}
    try:
        importances = getattr(model, "feature_importances_", None)
        if importances is not None:
            scores = [float(v) for v in importances]
            # Prefer the names the model was fitted with; they can differ in
            # number from feature_cols when the persisted model used another
            # feature set.
            names = getattr(model, "feature_names_in_", None)
            names = list(feature_cols) if names is None else [str(c) for c in names]
            # Align labels to the number of importances (pad, then trim).
            names += [f"f{i}" for i in range(len(names), len(scores))]
            names = names[: len(scores)]
            ranked = pd.Series(scores, index=names).sort_values(ascending=False)
            info["feature_importance_top"] = ranked.head(10).to_dict()
    except Exception as e:
        # Importances are optional extras; report the reason and still return
        # the basic info.
        print(f"[warn] Falha ao calcular importância das features: {e}")
    return info
# Gradio UI: builds the page and wires each button to the functions above.
# NOTE(review): the nesting of the `with` blocks below was reconstructed from
# an unindented listing - confirm which components belong inside each Row.
with gr.Blocks(title="Wine Quality - MVP") as demo:
    gr.Markdown("## 🍷 Wine Quality - MVP (Modelo Final + Fallback de Treino)")
    # Read-only box that shows the message returned by load_or_train.
    status = gr.Textbox(label="Status", interactive=False)
    with gr.Row():
        # Checkbox value is passed to load_or_train as its `strict` argument.
        strict_only = gr.Checkbox(value=STRICT_DEFAULT, label="Usar somente modelo salvo (sem treinar fallback)")
        btn_load = gr.Button("Carregar modelo final / Treinar")
        btn_info = gr.Button("Info do modelo")
    btn_load.click(fn=load_or_train, inputs=[strict_only], outputs=status)
    # --- Single prediction: one number input per default feature ---
    gr.Markdown("### Fazer predição")
    with gr.Row():
        fixed_acidity = gr.Number(value=7.5, label="fixed acidity")
        volatile_acidity = gr.Number(value=0.5, label="volatile acidity")
        citric_acid = gr.Number(value=0.25, label="citric acid")
        residual_sugar = gr.Number(value=2.0, label="residual sugar")
        chlorides = gr.Number(value=0.08, label="chlorides")
        free_sd = gr.Number(value=15.0, label="free sulfur dioxide")
        total_sd = gr.Number(value=50.0, label="total sulfur dioxide")
        density = gr.Number(value=0.996, label="density")
        pH = gr.Number(value=3.3, label="pH")
        sulphates = gr.Number(value=0.6, label="sulphates")
        alcohol = gr.Number(value=10.0, label="alcohol")
    out = gr.Textbox(label="Qualidade prevista")
    btn_pred = gr.Button("Prever")
    # The inputs are listed in the same order as the default feature_cols;
    # predict receives them positionally.
    btn_pred.click(
        predict,
        inputs=[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides,
                free_sd, total_sd, density, pH, sulphates, alcohol],
        outputs=out
    )
    # --- Batch prediction from an uploaded CSV ---
    gr.Markdown("### Predição em lote (CSV)")
    csv_in = gr.File(label="CSV com colunas de features", file_types=[".csv"])
    msg, df_out = gr.Textbox(label="Mensagem"), gr.Dataframe(label="Resultado")
    btn_batch = gr.Button("Processar CSV")
    btn_batch.click(predict_batch, inputs=csv_in, outputs=[msg, df_out])
    # --- Helper files: header-only template and a dataset sample ---
    gr.Markdown("### Arquivos auxiliares")
    with gr.Row():
        n_rows = gr.Number(value=10, label="N amostras", precision=0)
        btn_tpl = gr.Button("Baixar template CSV (cabeçalhos)")
        btn_smpl = gr.Button("Gerar amostra CSV")
    file_out = gr.File(label="Arquivo gerado")
    df_preview = gr.Dataframe(label="Prévia da amostra")
    btn_tpl.click(csv_template_file, outputs=file_out)
    btn_smpl.click(sample_csv_file, inputs=n_rows, outputs=[file_out, df_preview])
    # JSON panel filled by the "Info do modelo" button created in the first row.
    info_out = gr.JSON(label="Detalhes do modelo")
    btn_info.click(model_info, outputs=info_out)
# Load on startup (controlled by STRICT_DEFAULT): runs once when the module is
# executed and stores the returned message on the Status textbox.
status.value = load_or_train(strict=STRICT_DEFAULT)
# Start the Gradio server only when executed as a script.
if __name__ == "__main__":
    demo.launch()