Spaces:

servisgas
/

pricing

Sleeping

pricing / src /evaluate.py

GitHub Actions

Deploy selected files

ffdb9be 5 months ago

4.89 kB

	import os, sys
	import json
	import mlflow
	import pandas as pd
	import requests
	from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
	from dotenv import load_dotenv

	# Load variables from a local .env file (if present) into the process
	# environment; main() later reads GOOGLE_APPLICATION_CREDENTIALS via os.getenv.
	load_dotenv()
	# Add project root to sys.path for imports.
	# The parent of this file's directory is appended so that the absolute
	# `src.utils` import below resolves when the script is run directly.
	# This must happen BEFORE the import that follows.
	sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

	# Timer is used in main() as a context manager labelled with a step name.
	from src.utils import Timer

	# ------------------------------------------------------------------
	# Constants
	# ------------------------------------------------------------------
	# All paths are relative, so they are resolved against the current working
	# directory of the process that runs this script.
	# Parquet file with the test set; main() uses its "price" column as the target
	# and every other column as model input.
	TEST_PATH = "data/processed/test.parquet"
	# JSON file describing the last run; main() reads the keys "run_id",
	# "pipeline_model_uri" and "mlflow_ui_link" from it.
	RUN_INFO_PATH = "reports/last_run_info.json"
	# Output JSON file that receives the computed evaluation metrics.
	METRICS_PATH = "reports/eval_metrics.json"

	# Tracking server info (should match training script)
	# Used to build the http://HOST:PORT URL for both the MLflow tracking URI
	# and the registry URI.
	TRACKING_SERVER_HOST = "127.0.0.1"
	TRACKING_SERVER_PORT = 5000


	def main():
	    """Evaluate the most recently trained MLflow model on the test set.

	    Steps:
	      1. Read the run ID and model URI from ``RUN_INFO_PATH``.
	      2. Check that ``GOOGLE_APPLICATION_CREDENTIALS`` points to an existing file.
	      3. Probe the MLflow tracking server, then point the tracking and
	         registry URIs at it.
	      4. Load the model, predict on ``TEST_PATH`` and compute R2, MAE and
	         MAPE against the ``price`` column.
	      5. Log the metrics to the same MLflow run and write them to
	         ``METRICS_PATH``.

	    Raises:
	        FileNotFoundError: ``RUN_INFO_PATH`` does not exist, or the Google
	            credentials variable is unset or does not name a file.
	        KeyError: the run-info JSON lacks ``run_id`` or ``pipeline_model_uri``.
	        ConnectionError: the tracking server does not answer with HTTP 200
	            (3 second timeout); the underlying ``requests`` error is chained.
	    """
	    # ---------------------------------------------------------------
	    # 1. Load metadata
	    # ---------------------------------------------------------------
	    if not os.path.exists(RUN_INFO_PATH):
	        raise FileNotFoundError(f"❌ {RUN_INFO_PATH} not found. Run training first.")

	    print("📄 Loading last MLflow run info...")
	    with open(RUN_INFO_PATH, encoding="utf-8") as f:
	        run_info = json.load(f)

	    run_id = run_info["run_id"]
	    model_uri = run_info["pipeline_model_uri"]

	    print(f"🔍 Run ID: {run_id}")
	    print(f"🔄 Model URI: {model_uri}")
	    print()

	    # ---------------------------------------------------------------
	    # Ensure Google Cloud credentials are set
	    # ---------------------------------------------------------------
	    google_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
	    if not google_creds or not os.path.isfile(google_creds):
	        raise FileNotFoundError(
	            f"❌ GOOGLE_APPLICATION_CREDENTIALS not set or invalid: {google_creds}"
	        )
	    print(f"🔐 Using Google credentials: {google_creds}")

	    # ---------------------------------------------------------------
	    # 2. Connect to MLflow tracking server
	    # ---------------------------------------------------------------
	    tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
	    try:
	        response = requests.get(tracking_uri, timeout=3)
	        if response.status_code != 200:
	            # Funnel a bad status into the same handler as network errors.
	            raise requests.exceptions.RequestException(
	                f"unexpected HTTP status {response.status_code}"
	            )
	    except requests.exceptions.RequestException as err:
	        # Chain the original error so the root cause stays in the traceback.
	        raise ConnectionError(
	            f"❌ MLflow tracking server not reachable at "
	            f"{tracking_uri}. "
	            f"Start the server before evaluation."
	        ) from err

	    mlflow.set_tracking_uri(tracking_uri)
	    mlflow.set_registry_uri(tracking_uri)

	    print(f"🔗 Connected to MLflow: {mlflow.get_tracking_uri()}")
	    print(f" Using run ID: {run_id}")
	    print()

	    # ---------------------------------------------------------------
	    # 3. Load model from GCS
	    # ---------------------------------------------------------------
	    with Timer("Load MLflow model"):
	        model = mlflow.pyfunc.load_model(model_uri)

	    # ---------------------------------------------------------------
	    # 4. Load test data
	    # ---------------------------------------------------------------
	    print("📦 Loading test data...")
	    df_test = pd.read_parquet(TEST_PATH)
	    X_test = df_test.drop(columns=["price"])
	    y_test = df_test["price"]

	    # ---------------------------------------------------------------
	    # 5. Run inference
	    # ---------------------------------------------------------------
	    print("⚙️ Running inference on test set...")
	    with Timer("Model inference"):
	        preds = model.predict(X_test)

	    # ---------------------------------------------------------------
	    # 6. Compute metrics
	    # ---------------------------------------------------------------
	    print("📊 Computing metrics...")
	    # Cast to plain floats so logging and JSON serialization do not depend
	    # on the exact numeric type the metric functions return.
	    metrics = {
	        "r2": round(float(r2_score(y_test, preds)), 4),
	        "mae": round(float(mean_absolute_error(y_test, preds)), 2),
	        "mape": round(float(mean_absolute_percentage_error(y_test, preds)), 4),
	    }

	    # ---------------------------------------------------------------
	    # 7. Log metrics to MLflow (same run)
	    # ---------------------------------------------------------------
	    print("📝 Logging metrics to MLflow...")
	    # The context manager ends the run even if logging raises, so a failed
	    # call cannot leave the run active.
	    with mlflow.start_run(run_id=run_id):
	        mlflow.log_metrics(metrics)

	    # ---------------------------------------------------------------
	    # 8. Save metrics locally for DVC
	    # ---------------------------------------------------------------
	    metrics_dir = os.path.dirname(METRICS_PATH)
	    if metrics_dir:  # os.makedirs("") raises, so skip it for a bare filename
	        os.makedirs(metrics_dir, exist_ok=True)
	    with open(METRICS_PATH, "w", encoding="utf-8") as f:
	        json.dump(metrics, f, indent=2)

	    print("✅ Evaluation complete!")
	    print(json.dumps(metrics, indent=2))
	    # The UI link is informational only: fall back to the tracking URI rather
	    # than failing a finished evaluation when the key is absent.
	    print(f"🔗 MLflow UI: {run_info.get('mlflow_ui_link', tracking_uri)}")


	if __name__ == "__main__":
	    main()