Spaces:

henriquebap
/

wine-ml-app

Sleeping

App Files Files Community

wine-ml-app / app.py

henriquebap

fix(app): make predict robust to feature count mismatches, avoid ValueError

c92b8d9 7 months ago

raw

history blame contribute delete

10.4 kB

	import os
	import pandas as pd
	import numpy as np
	from huggingface_hub import hf_hub_download
	from sklearn.ensemble import RandomForestRegressor
	import gradio as gr
	from pathlib import Path
	import joblib

	# Settings - replace with your own Hub user and dataset repo.
	HF_DATASET_REPO = "henriquebap/wine-ml-dataset"
	CSV_FILENAME = "WineQT.csv"
	MODEL_PATH = Path("data/models/wine_quality_regressor.joblib")
	# Strict mode is the default; STRICT_SAVED_ONLY must be one of 1/true/yes
	# (case-insensitive) to keep it on.
	STRICT_DEFAULT = os.environ.get("STRICT_SAVED_ONLY", "true").lower() in ("1", "true", "yes")

	# Fallback for loading the model from the Hub (configurable via environment variables).
	# An unset or blank HF_MODEL_REPO becomes None; example value: "henriquebap/wine-ml-model".
	HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "").strip() or None
	HF_MODEL_FILENAME = os.environ.get("HF_MODEL_FILENAME", "wine_quality_regressor.joblib")
	# One of "model" | "dataset" | "space".
	HF_MODEL_REPO_TYPE = os.environ.get("HF_MODEL_REPO_TYPE", "model")

	# Module state: the active estimator and the feature columns it is fed with.
	model = None
	feature_cols = [
	    "fixed acidity",
	    "volatile acidity",
	    "citric acid",
	    "residual sugar",
	    "chlorides",
	    "free sulfur dioxide",
	    "total sulfur dioxide",
	    "density",
	    "pH",
	    "sulphates",
	    "alcohol",
	]


	def load_data():
	    """Download the wine CSV from the Hub dataset repo and return its usable rows.

	    Only columns that are either a current feature or the ``quality`` target
	    are kept, in the order they appear in the file; rows containing any
	    missing value are dropped.
	    """
	    local_csv = hf_hub_download(
	        repo_id=HF_DATASET_REPO,
	        repo_type="dataset",
	        filename=CSV_FILENAME,
	    )
	    frame = pd.read_csv(local_csv)
	    # Keep only the expected columns; anything extra in the file is discarded.
	    wanted = {*feature_cols, "quality"}
	    kept = [name for name in frame.columns if name in wanted]
	    return frame[kept].dropna()


	def _split_bundle(bundle):
	    """Return ``(estimator, features)`` extracted from a loaded joblib payload.

	    The payload is either a dict with ``"model"`` and optional
	    ``"metadata" -> "features"`` entries, or the estimator itself. Either
	    element of the result is ``None`` when it is absent or unusable (an
	    estimator without ``predict``; a feature list that is empty or not a list).
	    """
	    if isinstance(bundle, dict):
	        estimator = bundle.get("model")
	        meta = bundle.get("metadata") or {}
	        feats = meta.get("features") if isinstance(meta, dict) else None
	    else:
	        estimator, feats = bundle, None
	    if not hasattr(estimator, "predict"):
	        estimator = None
	    if not (isinstance(feats, list) and feats):
	        feats = None
	    return estimator, feats


	def _adopt_bundle(bundle):
	    """Install the bundle's estimator (and feature list, if any) as module state.

	    Returns True when an estimator was adopted. Nothing is modified otherwise,
	    so a bundle without a usable model cannot leave ``feature_cols`` pointing
	    at columns that a later fallback does not know about.
	    """
	    global model, feature_cols
	    estimator, feats = _split_bundle(bundle)
	    if estimator is None:
	        return False
	    if feats is not None:
	        feature_cols = feats
	    model = estimator
	    return True


	def load_final_model():
	    """Try to load the persisted model, first from disk and then from the Hub.

	    On success the module-level ``model`` (and ``feature_cols`` when the
	    bundle carries a feature list) are updated and a status message is
	    returned. Returns ``None`` when no model could be loaded; failures are
	    printed as warnings rather than raised so the caller can decide what to
	    do (see ``load_or_train``).
	    """
	    # NOTE(security): joblib.load unpickles arbitrary objects. Only point
	    # MODEL_PATH / HF_MODEL_REPO at artifacts you trust.

	    # 1) Try the local file first.
	    if MODEL_PATH.exists():
	        try:
	            if _adopt_bundle(joblib.load(MODEL_PATH)):
	                return "Modelo final carregado do disco."
	        except Exception as e:
	            print(f"[warn] Falha ao carregar modelo salvo localmente: {e}")

	    # 2) Fallback: download from the Hub, when a repo is configured.
	    if HF_MODEL_REPO:
	        try:
	            remote_path = hf_hub_download(
	                repo_id=HF_MODEL_REPO,
	                filename=HF_MODEL_FILENAME,
	                repo_type=HF_MODEL_REPO_TYPE,
	                token=os.getenv("HF_TOKEN", None),
	            )
	            adopted = _adopt_bundle(joblib.load(remote_path))
	        except Exception as e:
	            # Do not surface an error here; load_or_train decides based on strict.
	            print(f"[warn] Falha ao baixar modelo do Hub: {e}")
	        else:
	            if adopted:
	                # Cache locally for subsequent boots; a failed save must not
	                # block execution, but it should not go unnoticed either.
	                try:
	                    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
	                    joblib.dump({"model": model, "metadata": {"features": feature_cols}}, MODEL_PATH)
	                except Exception as e:
	                    print(f"[warn] Falha ao salvar modelo localmente: {e}")
	                return f"Modelo final baixado do Hub ({HF_MODEL_REPO_TYPE}:{HF_MODEL_REPO})."

	    return None

	def train():
	    """Fit the fallback RandomForest baseline and install it as the module model.

	    The data comes from ``load_data()``; the columns in ``feature_cols`` are
	    the inputs and ``quality`` is the target. Returns a status message with
	    the number of training rows. Any exception from loading or fitting
	    propagates to the caller.
	    """
	    global model
	    data = load_data()
	    regressor = RandomForestRegressor(n_estimators=200, random_state=42)
	    regressor.fit(data[feature_cols], data["quality"])
	    # Publish only after a successful fit: binding the global first would
	    # leave an unfitted estimator behind on failure, and the
	    # ``model is None`` lazy-load checks elsewhere would then be skipped.
	    model = regressor
	    return f"Modelo treinado com {len(data)} linhas."


	def load_or_train(strict: bool = False):
	    """Load the saved model; if none is available and strict is False, train a baseline.

	    With ``strict=True`` nothing is ever trained: when no saved model can be
	    loaded, an explanatory message is returned instead. In every case the
	    return value is a status string.
	    """
	    loaded_msg = load_final_model()
	    if loaded_msg:
	        return loaded_msg
	    if not strict:
	        return train()
	    return "Modelo salvo não encontrado. Exporte via notebook 07 e garanta o arquivo em data/models/wine_quality_regressor.joblib."

	def predict(*values):
	    """Make a single prediction from UI inputs, robust to feature mismatches.

	    ``values`` are the UI numbers, paired positionally with ``feature_cols``.
	    When the model recorded the names it was fitted with
	    (``feature_names_in_``) and all of them can be found among the provided
	    inputs, the row is assembled by name, so a model trained on a reordered
	    or reduced feature set still gets the right values. Otherwise the values
	    are trimmed or NaN-padded positionally to the model's expected feature
	    count.

	    Returns the formatted prediction string shown in the UI.
	    """
	    if model is None:
	        load_or_train()

	    # An emptied numeric field can arrive as None; treat it as missing.
	    provided = [np.nan if v is None else v for v in values]
	    cols = list(feature_cols)

	    trained = getattr(model, "feature_names_in_", None)
	    if trained is not None:
	        trained = [str(name) for name in trained]

	    # Determine how many features the model expects.
	    expected_n = getattr(model, "n_features_in_", None)
	    if expected_n is None:
	        expected_n = len(cols) or len(provided)

	    names_usable = trained is not None and len(trained) == expected_n
	    by_name = dict(zip(cols, provided))

	    if names_usable and all(name in by_name for name in trained):
	        # Every fitted feature is available: align by name, not position.
	        used_cols = trained
	        vals = [by_name[name] for name in trained]
	    else:
	        # Trim or pad the provided values to match the expected feature count.
	        vals = provided[:expected_n]
	        vals += [np.nan] * (expected_n - len(vals))
	        if names_usable:
	            # scikit-learn checks DataFrame column names against the names
	            # seen during fit, so placeholder names would be rejected; label
	            # the positional values with the fitted names instead.
	            used_cols = trained
	        elif len(cols) >= expected_n:
	            used_cols = cols[:expected_n]
	        else:
	            used_cols = cols + [f"f{i}" for i in range(len(cols), expected_n)]

	    x = pd.DataFrame([vals], columns=used_cols)
	    pred = float(model.predict(x)[0])
	    return f"{pred:.2f} (arredondado: {int(round(pred))})"


	def predict_batch(file: gr.File | None):
	    """Predict quality for every row of an uploaded CSV.

	    Returns ``(message, dataframe)``. On success the dataframe is the input
	    plus a ``pred_quality`` column; on any failure (no file, unreadable CSV,
	    missing feature columns, prediction error) it is ``None`` and the message
	    explains why.
	    """
	    if model is None:
	        load_or_train()
	    if file is None:
	        return "Arquivo CSV não fornecido.", None

	    # Depending on the Gradio version / File ``type`` setting, the handler
	    # receives either an object exposing ``.name`` or the path string itself.
	    csv_path = getattr(file, "name", file)
	    try:
	        df_in = pd.read_csv(csv_path)
	    except Exception as e:
	        return f"Falha ao ler CSV: {e}", None

	    missing = [c for c in feature_cols if c not in df_in.columns]
	    if missing:
	        return f"CSV faltando colunas: {missing}", None

	    # Non-numeric cells or a header-only file make the estimator raise;
	    # report that through the message box like the other failure modes.
	    try:
	        preds = model.predict(df_in[feature_cols])
	    except Exception as e:
	        return f"Falha ao prever: {e}", None

	    out = df_in.copy()
	    out["pred_quality"] = preds
	    return f"OK - {len(out)} linhas processadas.", out


	def csv_template_file():
	    """Write a header-only CSV listing the feature columns, to ease uploads.

	    The file is written to ``wine_features_template.csv`` in the working
	    directory and its path is returned as a string.
	    """
	    template_path = Path("wine_features_template.csv")
	    header_only = pd.DataFrame(columns=feature_cols)
	    header_only.to_csv(template_path, index=False)
	    return str(template_path)


	def sample_csv_file(n: int = 10):
	    """Write a sample CSV with n dataset rows (without the quality column).

	    ``n`` is clamped to ``[1, len(dataset)]``; a falsy ``n`` means 10.
	    Returns ``(path, sample_dataframe)`` where the path points at
	    ``wine_sample.csv`` in the working directory.

	    Raises:
	        gr.Error: when the dataset cannot be loaded or is empty. The first
	            return slot feeds a file output, so an error message must not be
	            returned there as if it were a path.
	    """
	    try:
	        df = load_data()
	    except Exception as e:
	        raise gr.Error(f"Falha ao carregar dataset para amostra: {e}") from e
	    if df.empty:
	        # df.sample(1) on an empty frame would raise an opaque ValueError.
	        raise gr.Error("Dataset vazio; não é possível gerar amostra.")

	    n = int(max(1, min(int(n or 10), len(df))))
	    sample = df.sample(n, random_state=42)[feature_cols]
	    path = Path("wine_sample.csv")
	    sample.to_csv(path, index=False)
	    return str(path), sample


	def model_info():
	    """Return a dict describing the active model.

	    Always contains ``"features"`` (the current feature list). When the
	    model exposes ``feature_importances_``, ``"feature_importance_top"``
	    maps up to ten feature names to their importance, highest first.
	    Computing the importances is best-effort: a failure is printed as a
	    warning and the key is simply omitted.
	    """
	    if model is None:
	        load_or_train()
	    info = {
	        "features": feature_cols,
	    }
	    try:
	        importances = getattr(model, "feature_importances_", None)
	        if importances is not None:
	            ranked = pd.Series(importances, index=feature_cols).sort_values(ascending=False)
	            info["feature_importance_top"] = ranked.head(10).to_dict()
	    except Exception as e:
	        # E.g. the importance vector and feature list differ in length.
	        print(f"[warn] Falha ao calcular importâncias das features: {e}")
	    return info

	# Gradio UI: builds the Blocks app `demo` with four areas - model loading/info,
	# single prediction, batch CSV prediction, and helper CSV downloads.
	# NOTE(review): the nesting of the `with` blocks below is not recoverable from
	# this copy of the file; confirm which components belong to each gr.Row.
	with gr.Blocks(title="Wine Quality - MVP") as demo:
	gr.Markdown("## 🍷 Wine Quality - MVP (Modelo Final + Fallback de Treino)")
	status = gr.Textbox(label="Status", interactive=False)
	with gr.Row():
	strict_only = gr.Checkbox(value=STRICT_DEFAULT, label="Usar somente modelo salvo (sem treinar fallback)")
	btn_load = gr.Button("Carregar modelo final / Treinar")
	btn_info = gr.Button("Info do modelo")
	# The checkbox value is passed as load_or_train's `strict` argument.
	btn_load.click(fn=load_or_train, inputs=[strict_only], outputs=status)

	gr.Markdown("### Fazer predição")
	with gr.Row():
	# One numeric input per feature, created in the same order as the default feature_cols list.
	fixed_acidity = gr.Number(value=7.5, label="fixed acidity")
	volatile_acidity = gr.Number(value=0.5, label="volatile acidity")
	citric_acid = gr.Number(value=0.25, label="citric acid")
	residual_sugar = gr.Number(value=2.0, label="residual sugar")
	chlorides = gr.Number(value=0.08, label="chlorides")
	free_sd = gr.Number(value=15.0, label="free sulfur dioxide")
	total_sd = gr.Number(value=50.0, label="total sulfur dioxide")
	density = gr.Number(value=0.996, label="density")
	pH = gr.Number(value=3.3, label="pH")
	sulphates = gr.Number(value=0.6, label="sulphates")
	alcohol = gr.Number(value=10.0, label="alcohol")
	out = gr.Textbox(label="Qualidade prevista")

	btn_pred = gr.Button("Prever")
	# The eleven inputs reach predict(*values) positionally, in the order listed here.
	btn_pred.click(
	predict,
	inputs=[fixed_acidity, volatile_acidity, citric_acid, residual_sugar, chlorides,
	free_sd, total_sd, density, pH, sulphates, alcohol],
	outputs=out
	)

	gr.Markdown("### Predição em lote (CSV)")
	csv_in = gr.File(label="CSV com colunas de features", file_types=[".csv"])
	msg, df_out = gr.Textbox(label="Mensagem"), gr.Dataframe(label="Resultado")
	btn_batch = gr.Button("Processar CSV")
	# predict_batch returns (message, dataframe), matching the two outputs.
	btn_batch.click(predict_batch, inputs=csv_in, outputs=[msg, df_out])

	gr.Markdown("### Arquivos auxiliares")
	with gr.Row():
	n_rows = gr.Number(value=10, label="N amostras", precision=0)
	btn_tpl = gr.Button("Baixar template CSV (cabeçalhos)")
	btn_smpl = gr.Button("Gerar amostra CSV")
	file_out = gr.File(label="Arquivo gerado")
	df_preview = gr.Dataframe(label="Prévia da amostra")
	# Both helper buttons write their generated file into the same file_out component.
	btn_tpl.click(csv_template_file, outputs=file_out)
	btn_smpl.click(sample_csv_file, inputs=n_rows, outputs=[file_out, df_preview])

	info_out = gr.JSON(label="Detalhes do modelo")
	btn_info.click(model_info, outputs=info_out)

	# Load at startup (controlled by STRICT_DEFAULT): load_or_train runs once when
	# this module is executed and its status message becomes the status box value.
	status.value = load_or_train(strict=STRICT_DEFAULT)

	# Start the Gradio server only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()