# File size: 3,486 Bytes
# d31755b
import pandas as pd
import joblib
from huggingface_hub import hf_hub_download
from sklearn.impute import SimpleImputer
import numpy as np
# Hugging Face Hub repository and artifact filenames for the pretrained
# MLP regressor and the SimpleImputer fitted on the same training data.
REPO_ID = "DP1110/mlp-accessibility-model"
MODEL_FILENAME = 'mlp_regressor_model.joblib'
IMPUTER_FILENAME = 'simple_imputer.joblib'
# Define the feature columns, matching the training data order.
# NOTE(review): the list contains near-duplicate names differing only by a
# trailing space ('% ASF from Bus Stops ' vs '% ASF from Bus Stops', and
# '% ASF (Network)' vs '% ASF (Network) '). Presumably this mirrors messy
# headers in the original training spreadsheet — confirm against the fitted
# imputer's feature_names_in_ before "cleaning" these names up.
FEATURE_COLUMNS = ['% ASF (Euclidean)', '% Built-Up Area', '% ASF (Network)', '% ASF from Bus Stops ', '% ASF from Bus Stops', '% ASF (Network) ']
# Download the model and imputer from Hugging Face Hub.
# NOTE(review): this performs network I/O at import time; any download
# failure is downgraded to a printed message and None paths so the module
# still imports (predict_accessibility_score then raises RuntimeError).
try:
    model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILENAME)
    imputer_path = hf_hub_download(repo_id=REPO_ID, filename=IMPUTER_FILENAME)
except Exception as e:
    print('Error downloading files from Hugging Face Hub:', e)
    model_path = None
    imputer_path = None
# Deserialize the downloaded artifacts; either may stay None on failure.
loaded_mlp_model = None
loaded_imputer = None
if model_path:
    loaded_mlp_model = joblib.load(model_path)
    print('MLP model loaded from', model_path)
if imputer_path:
    loaded_imputer = joblib.load(imputer_path)
    print('Imputer loaded from', imputer_path)
def predict_accessibility_score(new_data_df: pd.DataFrame) -> pd.Series:
    """
    Predict the overall accessibility score for new, raw input data.

    Args:
        new_data_df (pd.DataFrame): A DataFrame containing new data with the
            same feature columns as the training data, before imputation.
            The input is NOT modified; any missing feature columns are added
            as NaN to an internal copy and handled by the imputer.

    Returns:
        pd.Series: Predicted overall accessibility scores, named
            'Predicted_Overall_Accessibility_Score'.

    Raises:
        RuntimeError: If the model or imputer failed to load at import time.
    """
    if loaded_mlp_model is None or loaded_imputer is None:
        raise RuntimeError('Model or imputer not loaded. Cannot make predictions.')
    # Work on a copy so the caller's DataFrame is never mutated.
    # (Bug fix: the original assigned NaN columns directly onto the caller's
    # object as a silent side effect.)
    features = new_data_df.copy()
    # Add any feature columns the caller omitted; the imputer fills them.
    for col in set(FEATURE_COLUMNS) - set(features.columns):
        features[col] = np.nan
    # Reorder columns to match the training feature order exactly.
    features = features[FEATURE_COLUMNS]
    # Apply the fitted imputer to handle missing values in the new data.
    imputed = loaded_imputer.transform(features)
    imputed_df = pd.DataFrame(imputed, columns=FEATURE_COLUMNS)
    # Make predictions using the loaded MLP model.
    predictions = loaded_mlp_model.predict(imputed_df)
    return pd.Series(predictions, name='Predicted_Overall_Accessibility_Score')
if __name__ == '__main__':
    print("\n--- Demonstrating prediction with sample data ---")
    # Build a one-row sample frame whose columns mirror the training
    # features; values are arbitrary and vary slightly per column so the
    # demonstration output is not constant.
    sample_values = {
        name: [0.5 + (idx * 0.005) % 0.1]
        for idx, name in enumerate(FEATURE_COLUMNS)
    }
    new_sample_data = pd.DataFrame(sample_values)
    try:
        # Run the full impute-and-predict pipeline on the sample row.
        scores = predict_accessibility_score(new_sample_data)
        print("\n--- New Sample Data for Prediction ---")
        print(new_sample_data)
        print("\n--- Predicted Overall Accessibility Score ---")
        print(scores)
    except Exception as e:
        print('Error during prediction:', e)