|
|
import os |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from joblib import dump |
|
|
import warnings |
|
|
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, HuberRegressor |
|
|
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor |
|
|
from sklearn.utils.estimator_checks import check_estimator |
|
|
from sklearn.utils.metaestimators import available_if |
|
|
from sklearn.exceptions import NotFittedError |
|
|
from sklearn.neighbors import KNeighborsRegressor |
|
|
from sklearn.svm import SVR, LinearSVR |
|
|
from sklearn.tree import DecisionTreeRegressor |
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split, GridSearchCV |
|
|
from sklearn.metrics import mean_absolute_error, r2_score |
|
|
import plotly.graph_objects as go |
|
|
from huggingface_hub import Repository, HfApi, DatasetCardData |
|
|
from skops.card import Card |
|
|
import pickle |
|
|
from pathlib import Path |
|
|
from tempfile import mkdtemp |
|
|
from skops import hub_utils |
|
|
from pathlib import Path |
|
|
from tempfile import mkdtemp |
|
|
from joblib import dump |
|
|
import pickle |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub coordinates used when pushing the trained model and card.
User = "PranavSharma"

repo_name = "dynamic-pricing-model"

# Derived URL of the target model repository (informational; not fetched here).
repo_url = f"https://huggingface.co/{User}/{repo_name}"
|
|
|
|
|
from skops.card import Card |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Silence library warnings (sklearn convergence, pandas dtype notices, ...)
# so the Gradio UI and console output stay readable.
warnings.filterwarnings("ignore")

# Path to the ride-pricing dataset CSV, relative to the working directory.
DATA_PATH = os.path.join("data", "dynamic_pricing.csv")
|
|
|
|
|
|
|
|
def check_file_exists(file_path):
    """Validate that *file_path* points to an existing file.

    Parameters
    ----------
    file_path : str
        Path to the file.

    Raises
    ------
    FileNotFoundError
        If nothing exists at the given path.
    """
    if os.path.exists(file_path):
        return
    raise FileNotFoundError(f"File not found: {file_path}")
|
|
|
|
|
|
|
|
def load_data():
    """Load the pricing dataset and one-hot encode its categorical columns.

    Returns
    -------
    tuple
        ``(data, bool_columns)`` where ``data`` is the shuffled, encoded
        DataFrame and ``bool_columns`` lists the columns whose observed
        values are entirely 0/1 (i.e. the one-hot indicator columns).
    """
    check_file_exists(DATA_PATH)
    frame = pd.read_csv(DATA_PATH)
    # Deterministic shuffle so downstream train/test splits are reproducible.
    frame = frame.sample(frac=1, random_state=42)
    object_cols = frame.select_dtypes(include=["object"]).columns
    frame = pd.get_dummies(frame, columns=object_cols, drop_first=True)
    bool_columns = []
    for col in frame.columns:
        # Treat a column as boolean when every non-null value is 0 or 1.
        if frame[col].dropna().value_counts().index.isin([0, 1]).all():
            bool_columns.append(col)
    return frame, bool_columns
|
|
|
|
|
|
|
|
def compute_defaults_and_types(X, bool_columns):
    """Derive a default value and a display type for every feature column.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    bool_columns : collection
        Names of the 0/1 indicator columns.

    Returns
    -------
    tuple
        ``(defaults, types)`` dicts keyed by column name: indicator columns
        default to 0, numeric columns default to their mean.
    """
    indicator_set = set(bool_columns)
    defaults = {}
    types = {}
    for name in X.columns:
        if name in indicator_set:
            defaults[name] = 0
            types[name] = "Categorical (One-hot)"
        else:
            defaults[name] = X[name].mean()
            types[name] = "Numerical"
    return defaults, types
|
|
|
|
|
|
|
|
def duration_vs_cost_plot(data):
    """Scatter plot of expected ride duration against historical ride cost."""
    points = go.Scatter(
        x=data["Expected_Ride_Duration"],
        y=data["Historical_Cost_of_Ride"],
        mode="markers",
        marker=dict(size=8, color="rgba(99, 110, 250, 0.7)", line=dict(width=1, color="rgba(99, 110, 250, 1)")),
        name="Data Points",
    )
    fig = go.Figure(data=[points])
    fig.update_layout(
        title=dict(text="Expected Ride Duration vs Historical Ride Cost", font=dict(size=18)),
        xaxis=dict(title="Expected Ride Duration (minutes)", gridcolor="lightgray"),
        yaxis=dict(title="Historical Ride Cost ($)", gridcolor="lightgray"),
        template="plotly_white",
    )
    return fig
|
|
|
|
|
|
|
|
def performance_plots_with_gridsearch(results):
    """Plot how test-set MAE and R² evolve as the training set grows.

    For each of 10 training-set sizes a Lasso model is re-tuned with
    GridSearchCV (5-fold CV over ``alpha``) and evaluated on the held-out
    test set.

    Parameters
    ----------
    results : dict
        Output of ``train_model``; must contain ``X_train``, ``y_train``,
        ``X_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(mae_fig, r2_fig)`` Plotly figures.
    """
    X_train = results["X_train"]
    y_train = results["y_train"]
    X_test = results["X_test"]
    y_test = results["y_test"]
    # 10 training sizes, from 50 samples up to the full training set.
    train_sizes = np.linspace(50, len(X_train), 10, dtype=int)

    mae_scores = []
    r2_scores = []
    param_grid = {"alpha": np.logspace(-4, 0, 10)}

    for train_size in train_sizes:
        X_train_sub = X_train.iloc[:train_size]
        y_train_sub = y_train.iloc[:train_size]

        grid_search = GridSearchCV(
            Lasso(fit_intercept=False),
            param_grid,
            scoring="neg_mean_absolute_error",
            cv=5
        )
        grid_search.fit(X_train_sub, y_train_sub)
        best_model = grid_search.best_estimator_

        y_pred = best_model.predict(X_test)
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    mae_fig = go.Figure()
    mae_fig.add_trace(go.Scatter(
        x=train_sizes,
        y=mae_scores,
        mode="lines+markers",
        marker=dict(size=6, color="blue"),
        line=dict(width=2, color="blue"),
        name="MAE"
    ))
    mae_fig.update_layout(
        title="Effect of Training Size on MAE (with GridSearchCV)",
        xaxis_title="Training Size",
        yaxis_title="Mean Absolute Error (MAE)",
        template="plotly_white"
    )

    r2_fig = go.Figure()
    r2_fig.add_trace(go.Scatter(
        x=train_sizes,
        y=r2_scores,
        mode="lines+markers",
        marker=dict(size=6, color="green"),
        line=dict(width=2, color="green"),
        name="R²"  # was mojibake ("Rยฒ") from a bad encoding round-trip
    ))
    r2_fig.update_layout(
        title="Effect of Training Size on R² (with GridSearchCV)",
        xaxis_title="Training Size",
        yaxis_title="R² Score",
        template="plotly_white"
    )

    return mae_fig, r2_fig
|
|
|
|
|
|
|
|
|
|
|
def coefficients_progression_plot_with_tracking(results):
    """Track how the tuned Lasso coefficients change with training-set size.

    For each of 10 training sizes, re-tunes a Lasso via GridSearchCV and
    records its coefficient vector, then plots one line per feature.
    """
    X_train = results["X_train"]
    y_train = results["y_train"]
    feature_names = results["feature_names"]
    train_sizes = np.linspace(50, len(X_train), 10, dtype=int)
    alpha_grid = {"alpha": np.logspace(-4, 0, 10)}

    def _fit_best_lasso(n_rows):
        # Re-tune alpha on the first n_rows training samples (5-fold CV, MAE).
        search = GridSearchCV(
            Lasso(fit_intercept=False),
            alpha_grid,
            scoring="neg_mean_absolute_error",
            cv=5
        )
        search.fit(X_train.iloc[:n_rows], y_train.iloc[:n_rows])
        return search.best_estimator_

    coefficients_array = np.array(
        [_fit_best_lasso(size).coef_ for size in train_sizes]
    )

    fig = go.Figure()
    for idx, feature in enumerate(feature_names):
        fig.add_trace(go.Scatter(
            x=train_sizes,
            y=coefficients_array[:, idx],
            mode="lines+markers",
            name=feature,
            line=dict(width=2),
            marker=dict(size=6, opacity=0.8)
        ))
    fig.update_layout(
        title="Coefficient Progression with Training Size (Tracking)",
        xaxis_title="Training Size",
        yaxis_title="Coefficient Value",
        template="plotly_white",
        height=700,
        legend=dict(orientation="h", y=-0.3, x=0.5, xanchor="center")
    )
    return fig
|
|
|
|
|
|
|
|
|
|
|
def train_linear_models_with_gridsearch(X_train, y_train, X_test, y_test):
    """
    Train and evaluate multiple linear models using GridSearchCV and compare their performance.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training feature set.
    y_train : pd.Series
        Training target variable.
    X_test : pd.DataFrame
        Testing feature set.
    y_test : pd.Series
        Testing target variable.

    Returns
    -------
    dict
        A dictionary containing the best model, its parameters, and performance metrics.
        Keys: "results" (per-model records), "best_model_name",
        "best_model_metrics", "best_model" (all None/empty if every fit failed).
    """
    # Candidate estimators with their hyperparameter grids. Note: despite the
    # function name, tree/ensemble/kernel models are included alongside the
    # linear ones.
    models = {
        "Lasso": {
            "model": Lasso(fit_intercept=False),
            "param_grid": {"alpha": [0.001, 0.01, 0.1, 1]},
        },
        "Ridge": {
            "model": Ridge(fit_intercept=False),
            "param_grid": {"alpha": [0.001, 0.01, 0.1, 1]},
        },
        "ElasticNet": {
            "model": ElasticNet(fit_intercept=False),
            "param_grid": {
                "alpha": [0.001, 0.01, 0.1, 1],
                "l1_ratio": [0.2, 0.5, 0.8],
            },
        },
        "LinearRegression": {
            "model": LinearRegression(fit_intercept=False),
            "param_grid": {},
        },
        "HuberRegressor": {
            "model": HuberRegressor(fit_intercept=False),
            "param_grid": {"epsilon": [1.2, 1.5], "alpha": [0.001, 0.01]},
        },
        "KNeighborsRegressor": {
            "model": KNeighborsRegressor(),
            "param_grid": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]},
        },
        "DecisionTreeRegressor": {
            "model": DecisionTreeRegressor(),
            "param_grid": {
                "max_depth": [None, 10, 20],
                "min_samples_split": [2, 5],
                "min_samples_leaf": [1, 2],
            },
        },
        "RandomForestRegressor": {
            "model": RandomForestRegressor(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100],
                "max_depth": [10, 20, None],
                "min_samples_split": [2, 5],
            },
        },
        "GradientBoostingRegressor": {
            "model": GradientBoostingRegressor(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100],
                "learning_rate": [0.05, 0.1],
                "max_depth": [3, 5],
            },
        },
        "AdaBoostRegressor": {
            "model": AdaBoostRegressor(random_state=42),
            "param_grid": {
                "n_estimators": [50, 100],
                "learning_rate": [0.05, 0.1],
            },
        },
        "SVR": {
            "model": SVR(),
            "param_grid": {
                "C": [0.1, 1],
                "epsilon": [0.01, 0.1],
                "kernel": ["linear", "rbf"],
            },
        },
        "LinearSVR": {
            "model": LinearSVR(random_state=42),
            "param_grid": {"C": [0.1, 1]},
        },
    }

    results = []
    best_model = None
    best_result = None
    for name, config in models.items():
        try:
            # 5-fold CV, scored by negated MAE (sklearn maximizes the score).
            grid_search = GridSearchCV(
                config["model"],
                config["param_grid"],
                scoring="neg_mean_absolute_error",
                cv=5
            )
            grid_search.fit(X_train, y_train)

            # Evaluate the tuned estimator on the held-out test set.
            y_pred = grid_search.best_estimator_.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            results.append({
                "model": name,
                "best_params": grid_search.best_params_,
                "mae": mae,
                "r2": r2,
                "best_estimator": grid_search.best_estimator_,
            })

        # Best-effort: a failing estimator is reported and skipped, not fatal.
        except Exception as e:
            print(f"Error training model {name}: {e}")

    # Winner = lowest test-set MAE among the models that trained successfully.
    if results:
        best_result = min(results, key=lambda x: x["mae"])
        best_model = best_result["best_estimator"]

    return {
        "results": results,
        "best_model_name": best_result["model"] if best_result else None,
        "best_model_metrics": best_result if best_result else None,
        "best_model": best_model,
    }
|
|
|
|
|
def train_model():
    """Train and compare regression models on the dynamic-pricing dataset.

    Loads the data, runs a grid-search model comparison, selects the best
    model by test-set MAE, and assembles every artefact the UI needs:
    plots, HTML tables, the regression equation, and per-feature defaults.

    Returns
    -------
    dict
        Trained objects, metrics and pre-rendered display fragments.
    """
    original_data = pd.read_csv(DATA_PATH)
    data, bool_columns = load_data()
    X = data.drop("Historical_Cost_of_Ride", axis=1)
    y = data["Historical_Cost_of_Ride"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    linear_model_results = train_linear_models_with_gridsearch(X_train, y_train, X_test, y_test)
    best_model_name = linear_model_results["best_model_name"]
    best_model_metrics = linear_model_results["best_model_metrics"]
    top_models = linear_model_results["results"]
    best_model = linear_model_results["best_model"]
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    feature_names = X_train.columns
    # The grid-search winner is not always linear: tree/kernel/KNN models have
    # no coef_ attribute and would crash here. Fall back to
    # feature_importances_ (ensembles/trees) or zeros so the UI still renders.
    if hasattr(best_model, "coef_"):
        coefficients = best_model.coef_
    elif hasattr(best_model, "feature_importances_"):
        coefficients = best_model.feature_importances_
    else:
        coefficients = np.zeros(len(feature_names))

    useful_features = [(feature, coef) for feature, coef in zip(feature_names, coefficients) if coef != 0]
    not_useful_features = [feature for feature, coef in zip(feature_names, coefficients) if coef == 0]

    # "×" was previously mojibake ("ร") from a bad encoding round-trip.
    equation_terms = [f"*{coef:.4f}* × *{feature}*" for feature, coef in useful_features]
    regression_equation = "Cost of Ride = " + " + ".join(equation_terms)

    actual_vs_pred_plot = actual_vs_predicted_plot(y_test, y_pred)
    useful_features_formatted = "\n".join(
        f"- {feature}: {coef:.4f}" for feature, coef in useful_features
    )
    not_useful_features_formatted = "\n".join(
        f"- {feature}" for feature in not_useful_features
    )

    default_values, types = compute_defaults_and_types(X_train, bool_columns)

    scatter_plot = duration_vs_cost_plot(original_data)

    # Leaderboard of the 10 best models, lowest MAE first.
    top_models_sorted = sorted(top_models, key=lambda x: x['mae'])[:10]
    top_models_df = pd.DataFrame.from_records(
        [
            {
                "Rank": idx + 1,
                "Model": result["model"],
                "MAE": f"{result['mae']:.4f}",
                "R²": f"{result['r2']:.4f}",  # header was mojibake ("Rยฒ")
                "Best Params": result["best_params"],
            }
            for idx, result in enumerate(top_models_sorted)
        ]
    )
    top_models_html = top_models_df.to_html(index=False, border=0, classes="table table-striped")

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_test": X_test,
        "y_test": y_test,
        "y_pred": y_pred,
        "feature_names": feature_names,
        "coefficients": coefficients,
        "mae": mae,
        "r2": r2,
        "best_model_name": best_model_name,
        "best_model_metrics": best_model_metrics,
        "best_model": best_model,
        "regression_equation": regression_equation,
        "scatter_plot": scatter_plot,
        "useful_features": useful_features_formatted,
        "not_useful_features": not_useful_features_formatted,
        "top_models_html": top_models_html,
        "default_values": default_values,
        "feature_types": types,
        "original_data_html": original_data.head(3).to_html(classes="table table-striped"),
        "original_data": original_data,
        "actual_vs_predicted_plot": actual_vs_pred_plot
    }
|
|
|
|
|
def process_features_with_values(feature_string):
    """Re-format a "- name: value" bullet string into "name: value\\n" chunks.

    Splits on ``-`` (so a negative value becomes its own fragment, which is
    then glued back onto the preceding feature entry), collapses internal
    whitespace, and returns one newline-terminated string per feature.
    """
    if not feature_string:
        return []
    formatted = []
    for fragment in feature_string.strip().split("-"):
        text = fragment.strip()
        if not text:
            continue
        if text.replace(".", "", 1).isdigit():
            # Bare numeric fragment: re-attach it to the previous entry.
            if formatted:
                formatted[-1] = formatted[-1].strip() + ": " + text + "\n"
        else:
            formatted.append(" ".join(fragment.split()) + "\n")
    return formatted
|
|
|
|
|
def process_features_without_values(feature_string):
    """Extract only the feature names from a "- name: value" bullet string."""
    if not feature_string:
        return []
    names = []
    for fragment in feature_string.strip().split("-"):
        if not fragment.strip():
            continue
        # Keep everything before the ":" separator as the feature name.
        names.append(fragment.split(":")[0].strip() + "\n")
    return names
|
|
|
|
|
def actual_vs_predicted_plot(y_actual, y_pred):
    """
    Create a scatter plot for Actual vs Predicted values.

    Parameters
    ----------
    y_actual : array-like
        Actual target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    go.Figure
        A Plotly scatter plot with a dashed y=x reference line.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=y_actual,
        y=y_pred,
        mode="markers",
        marker=dict(size=8, color="rgba(99, 110, 250, 0.7)", line=dict(width=1)),
        name="Actual vs Predicted"
    ))

    # Dashed y=x guide line spanning the combined range of both series.
    lo = min(min(y_actual), min(y_pred))
    hi = max(max(y_actual), max(y_pred))
    fig.add_trace(go.Scatter(
        x=[lo, hi],
        y=[lo, hi],
        mode="lines",
        line=dict(dash="dash", color="gray"),
        name="Ideal Line"
    ))

    fig.update_layout(
        title="Actual vs Predicted Values",
        xaxis_title="Actual Values",
        yaxis_title="Predicted Values",
        template="plotly_white"
    )
    fig.add_annotation(
        x=hi,
        y=hi,
        text="Ideal Line (y=x)",
        showarrow=True,
        arrowhead=2
    )
    return fig
|
|
|
|
|
|
|
|
def train_model_button():
    """
    Train the model and return all relevant outputs for display.
    Save a model card documenting the results using skops 0.10.0.
    Push the model and card to Hugging Face Hub.
    """
    # Train and cache on the interface function so predictions can reuse it.
    comprehensive_interface.trained_model = train_model()
    results = comprehensive_interface.trained_model

    mae = results["mae"]
    r2 = results["r2"]
    scatter_plot = results["scatter_plot"]
    regression_equation = results["regression_equation"]
    coefficients = results["coefficients"]
    feature_names = results["feature_names"]
    coefficients_plot = coefficients_progression_plot_with_tracking(results)
    mae_plot, r2_plot = performance_plots_with_gridsearch(results)
    original_data_html = results["original_data_html"]
    original_data = results["original_data"]
    actual_vs_pred_plot = results["actual_vs_predicted_plot"]

    # Build the feature-importance summary. The stored strings are
    # "- name: value" bullet lines, so parse out the bare name and number
    # instead of re-prefixing whole lines (which rendered as "- - name: ...").
    useful_lines = []
    for line in results["useful_features"].splitlines():
        name_part, _, value_part = line.partition(":")
        name = name_part.lstrip("- ").strip()
        coef = float(value_part)
        useful_lines.append(
            f"- {name}: {coef:.4f} "
            f"(e.g., a unit increase in {name} affects the cost by ${coef:.2f})\n"
        )
    not_useful_lines = [
        f"- {line.lstrip('- ').strip()}\n"
        for line in results["not_useful_features"].splitlines()
    ]
    feature_importance_text = (
        "### Useful Features:\n"
        + "".join(useful_lines)
        + "\n\n### Non-Useful Features:\n"
        + "".join(not_useful_lines)
    )

    # Persist the winning estimator locally (joblib for the card's snippet,
    # pickle for the skops repo layout).
    model_path = "best_model.joblib"
    dump(results["best_model"], model_path)

    local_repo = mkdtemp(prefix="skops-")

    pkl_name = "best_model.pkl"
    with open(pkl_name, mode="wb") as f:
        pickle.dump(results["best_model"], f)

    hub_utils.init(
        model=pkl_name,
        requirements=["scikit-learn"],
        dst=local_repo,
        task="tabular-regression",
        data=original_data,
    )

    # Markdown table of every model coefficient for the card.
    coefficients_text = "| Feature | Coefficient |\n|---------|-------------|\n"
    coefficients_text += "\n".join(
        f"| {feature} | {value:.4f} |" for feature, value in zip(feature_names, coefficients)
    )

    # Export the actual-vs-predicted plot so the card can embed it.
    actual_vs_pred_plot_path = Path(local_repo) / "actual_vs_predicted.png"
    actual_vs_pred_plot.write_image(str(actual_vs_pred_plot_path), format="png", scale=2)

    actual_vs_pred_plot_md = (
        "The following plot shows the relationship between the actual and predicted values. "
        "The closer the points are to the diagonal line, the better the predictions. "
        "The dashed line represents the ideal case where predictions perfectly match the actual values.\n\n"
        # The PNG was exported above but never referenced in the card; embed it.
        "![Actual vs Predicted](actual_vs_predicted.png)"
    )

    metadata = DatasetCardData(
        language=["en"],
        license="apache-2.0",
        annotations_creators=["machine-generated"],
        language_creators=["found"],
        multilinguality="monolingual",
        size_categories="10K<n<100K",
        source_datasets=["original"],
        task_categories=["regression"],
        task_ids=["dynamic-pricing"],
        pretty_name="Dynamic Pricing Model",
    )
    card = Card(model=pkl_name, metadata=metadata)
    model_description = (
        "This is a regression model trained on the Dynamic Pricing Dataset. "
        "It was optimized using grid search with multiple hyperparameters."
    )
    card.add(
        **{
            "Model description": model_description,
            "Model description/Intended uses & limitations": (
                "This regression model is designed to predict the cost of rides based on various features such as expected ride duration, "
                "number of drivers, and time of booking.\n\n"
                "**Intended Uses**:\n"
                "- **Dynamic Pricing Analysis**: Helps optimize pricing strategies for ride-hailing platforms.\n"
                "- **Demand Forecasting**: Supports business decisions by estimating cost trends based on ride-specific parameters.\n\n"
                "**Limitations**:\n"
                "- **Feature Dependence**: The model's accuracy is highly dependent on the input features provided.\n"
                "- **Dataset Specificity**: Performance may degrade if applied to datasets with significantly different distributions.\n"
                "- **Outlier Sensitivity**: Predictions can be affected by extreme values in the dataset."
            ),
            "Model description/Training Procedure": "The model was trained using grid search to optimize hyperparameters. Cross-validation (5-fold) was performed to ensure robust evaluation. The best model was selected based on the lowest Mean Absolute Error (MAE) on the validation set.",
            "Model description/Evaluation Results/Model Coefficients": coefficients_text,
            "Model description/Evaluation Results/Regression Equation": regression_equation,
            "Model description/Evaluation Results/Actual vs Predicted": (
                actual_vs_pred_plot_md + "\n\n"
                "The scatter plot above shows the predicted values against the actual values. The dashed line represents the ideal predictions "
                "where the predicted values are equal to the actual values."
            ),
            "Model description/Evaluation Results": (
                "The model achieved the following results on the test set:\n"
                f"- **Mean Absolute Error (MAE)**: {mae}\n"
                f"- **R² Score**: {r2}\n\n"
                "### Key Insights:\n"
                "- Longer ride durations increase costs significantly, which may justify adding a surcharge for long-distance rides.\n"
                "- Evening bookings reduce costs, potentially indicating lower demand during these hours.\n"
                "- The model's accuracy is dependent on high-quality feature data.\n"
                "\nRefer to the plots and tables for detailed performance insights."
            ),
            "How to Get Started with the Model": (
                "To use this model:\n"
                "1. **Install Dependencies**: Ensure `scikit-learn` and `pandas` are installed in your environment.\n"
                "2. **Load the Model**: Download the saved model file and load it using `joblib`:\n"
                "   ```python\n"
                "   from joblib import load\n"
                "   model = load('best_model.joblib')\n"
                "   ```\n"
                "3. **Prepare Input Features**: Create a DataFrame with the required input features in the same format as the training dataset.\n"
                "4. **Make Predictions**: Use the `predict` method to generate predictions:\n"
                "   ```python\n"
                "   predictions = model.predict(input_features)\n"
                "   ```"
            ),
            "Model Card Authors": "This model card was written by **Pranav Sharma**.",
            "Model Card Contact": "For inquiries or feedback, you can contact the author via **[GitHub](https://github.com/PranavSharma)**.",
            "Citation": (
                "If you use this model, please cite it as follows:\n"
                "```\n"
                "@model{pranav_sharma_dynamic_pricing_model_2025,\n"
                "  author = {Pranav Sharma},\n"
                "  title = {Dynamic Pricing Model},\n"
                "  year = {2025},\n"
                "  version = {1.0.0},\n"
                "  url = {https://huggingface.co/PranavSharma/dynamic-pricing-model}\n"
                "}\n"
                "```"
            ),
        }
    )

    card_path = Path(local_repo) / "README.md"
    card.save(card_path)
    print("Model card saved as README.md")

    # Best-effort push: a Hub failure must not break the UI response.
    try:
        hub_utils.push(
            repo_id=f"{User}/{repo_name}",
            source=local_repo,
            commit_message="Pushing model and README files to the repo!",
            create_remote=True,
        )
        print("Model and card pushed to Hugging Face Hub.")
    except Exception as e:
        print(f"Failed to push to Hugging Face Hub: {e}")

    return (
        "Model trained successfully and pushed to Hugging Face Hub!",
        scatter_plot,
        regression_equation,
        mae_plot,
        r2_plot,
        coefficients_plot,
        actual_vs_pred_plot,
        results["top_models_html"],
        original_data_html,
        feature_importance_text,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def use_trained_model_button(*inputs):
    """
    Use the existing trained model for predictions and return relevant outputs.
    """
    # The trained model is cached as an attribute on comprehensive_interface.
    if "trained_model" not in comprehensive_interface.__dict__:
        return "No trained model found. Please train the model first.", None, None, None, None, None, None, None, None

    results = comprehensive_interface.trained_model

    if any(inputs):
        try:
            prediction = results["best_model"].predict([list(inputs)])[0]
            prediction_result = f"Custom Prediction: {prediction:.2f}"
        except NotFittedError:
            prediction_result = "Trained model is not properly fitted. Please train the model again."
    else:
        prediction_result = "No custom input provided."

    coefficients_plot = coefficients_progression_plot_with_tracking(results)
    mae_plot, r2_plot = performance_plots_with_gridsearch(results)
    feature_importance = (
        f"### Useful Features:\n {results['useful_features']}\n\n"
        f"### Non-Useful Features:\n {results['not_useful_features']}"
    )

    return (
        prediction_result,
        results["scatter_plot"],
        results["regression_equation"],
        mae_plot,
        r2_plot,
        coefficients_plot,
        f"<h3>Top 10 Models</h3>{results['top_models_html']}",
        f"<h3>Original Dataset</h3>{results['original_data_html']}",
        feature_importance,
    )
|
|
|
|
|
|
|
|
def comprehensive_interface(*inputs):
    """Train on first use (cached on the function object), then return UI outputs."""
    if "trained_model" not in comprehensive_interface.__dict__:
        comprehensive_interface.trained_model = train_model()

    results = comprehensive_interface.trained_model
    scatter_plot = results["scatter_plot"]
    regression_equation = results["regression_equation"]
    coefficients_plot = coefficients_progression_plot_with_tracking(results)
    mae_plot, r2_plot = performance_plots_with_gridsearch(results)
    original_data_html = results["original_data_html"]
    top_models_html = results["top_models_html"]

    # Re-format the stored "- name: value" bullet strings for display.
    useful = process_features_with_values("".join(results.get("useful_features", "")))
    useless = process_features_without_values("".join(results.get("not_useful_features", "")))
    feature_importance = (
        "### Useful Features:\n " + "".join(useful) + "\n\n"
        "### Non-Useful Features:\n " + "".join(useless)
    )

    if any(inputs):
        custom_prediction = results["best_model"].predict([list(inputs)])[0]
        prediction_result = f"Custom Prediction: {custom_prediction:.2f}"
    else:
        prediction_result = "No custom input provided."

    return (
        prediction_result,
        scatter_plot,
        regression_equation,
        mae_plot,
        r2_plot,
        coefficients_plot,
        f"<h3>Top 10 Models</h3>{top_models_html}",
        f"<h3>Original Dataset</h3>{original_data_html}",
        feature_importance,
    )
|
|
|
|
|
|
|
|
def generate_gradio_inputs():
    """Create one gr.Number input per model feature, pre-filled with its default."""
    results = train_model()
    defaults = results["default_values"]
    kinds = results["feature_types"]
    return [
        gr.Number(label=f"{feature} ({kinds[feature]}, e.g., {default})", value=default)
        for feature, default in defaults.items()
    ]
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: layout, input widgets, and button wiring.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Dynamic Pricing Model - Comprehensive Analysis")
    gr.Markdown(
        "Train a range of regression models, view metrics, selection of best models, coefficients, and make custom predictions."
    )

    with gr.Row():
        with gr.Column():
            scatter_plot_output = gr.Plot(label="Scatter Plot")
            original_data_output = gr.HTML(label="Original Dataset")
            top_models_output = gr.HTML(label="Top 10 Models")
        with gr.Column():
            actual_vs_predicted_output = gr.Plot(label="Actual vs Predicted Plot")
            mae_plot_output = gr.Plot(label="MAE Plot")
            # Label was mojibake ("Rยฒ Plot") from a bad encoding round-trip.
            r2_plot_output = gr.Plot(label="R² Plot")

        with gr.Column():
            coeff_plot_output = gr.Plot(label="Coefficient Progression")
            regression_eq_output = gr.Textbox(label="Regression Equation")
            output_feat_importance = gr.Textbox(label="Feature Importance (Useful vs Non-Useful)")

    gr.Markdown("### Input Features")
    # NOTE: this trains the models once at startup to derive per-feature defaults.
    inputs = generate_gradio_inputs()
    with gr.Row():
        input_fields = list(inputs)  # avoid shadowing the builtin `input`
    with gr.Row():
        train_button = gr.Button("Train Model")
        predict_button = gr.Button("Use Trained Model for Prediction")

    with gr.Row():
        prediction_output = gr.Textbox(label="Result")

    # Train: no inputs; refreshes every output panel.
    train_button.click(
        fn=train_model_button,
        inputs=[],
        outputs=[
            prediction_output,
            scatter_plot_output,
            regression_eq_output,
            mae_plot_output,
            r2_plot_output,
            coeff_plot_output,
            actual_vs_predicted_output,
            top_models_output,
            original_data_output,
            output_feat_importance,
        ],
    )

    # Predict: feeds the feature inputs to the cached model.
    predict_button.click(
        fn=use_trained_model_button,
        inputs=input_fields,
        outputs=[
            prediction_output,
            scatter_plot_output,
            regression_eq_output,
            mae_plot_output,
            r2_plot_output,
            coeff_plot_output,
            top_models_output,
            original_data_output,
            output_feat_importance,
        ],
    )

demo.launch()
|
|
|