Spaces:
Runtime error
Runtime error
Upload model and application files to Hugging Face Space
Browse files- app.py +33 -5
- deploy_tourism_model_diagnostic.py +131 -41
- input_data.csv +4 -4
- models/best_rf_model.joblib +3 -0
- src/data_prep.py +20 -4
- src/evaluate.py +63 -19
- src/train.py +52 -17
- src/train_colab_model.py +33 -12
- tests/test_model.py +2 -1
app.py
CHANGED
|
@@ -10,16 +10,26 @@ logger = logging.getLogger(__name__)
|
|
| 10 |
|
| 11 |
app = Flask(__name__)
|
| 12 |
|
|
|
|
| 13 |
base_dir = '/app' if os.path.exists('/app') else os.getcwd()
|
| 14 |
logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
|
| 15 |
|
| 16 |
try:
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
except Exception as e:
|
| 21 |
logger.error(f"Failed to load model or columns: {e}")
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
@app.route('/', methods=['GET'])
|
| 25 |
def index():
|
|
@@ -29,20 +39,38 @@ def index():
|
|
| 29 |
@app.route('/health', methods=['GET'])
|
| 30 |
def health():
|
| 31 |
logger.info("Health check endpoint called")
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
@app.route('/predict', methods=['POST'])
|
| 35 |
def predict():
|
| 36 |
try:
|
| 37 |
data = request.get_json(force=True)
|
| 38 |
logger.info(f"Predict endpoint called with data: {data}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
input_df = pd.DataFrame(data)
|
|
|
|
|
|
|
|
|
|
| 40 |
categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
|
|
|
|
| 41 |
input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
|
|
|
|
|
|
|
| 42 |
input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
|
|
|
|
| 43 |
prediction = model.predict(input_encoded)
|
| 44 |
logger.info(f"Prediction made: {prediction.tolist()}")
|
| 45 |
return jsonify({'prediction': prediction.tolist()})
|
| 46 |
except Exception as e:
|
| 47 |
logger.error(f"Prediction failed: {e}")
|
| 48 |
return jsonify({'error': str(e)}), 400
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
app = Flask(__name__)
|
| 12 |
|
| 13 |
+
# Determine the base directory based on the environment
|
| 14 |
base_dir = '/app' if os.path.exists('/app') else os.getcwd()
|
| 15 |
logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
|
| 16 |
|
| 17 |
try:
|
| 18 |
+
model_path = os.path.join(base_dir, "model.joblib")
|
| 19 |
+
columns_path = os.path.join(base_dir, "columns.joblib")
|
| 20 |
+
|
| 21 |
+
if not os.path.exists(model_path):
|
| 22 |
+
raise FileNotFoundError(f"model.joblib not found at {model_path}")
|
| 23 |
+
if not os.path.exists(columns_path):
|
| 24 |
+
raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
|
| 25 |
+
|
| 26 |
+
model = joblib.load(model_path)
|
| 27 |
+
columns = joblib.load(columns_path)
|
| 28 |
+
logger.info(f"Model and columns loaded successfully from {base_dir}")
|
| 29 |
except Exception as e:
|
| 30 |
logger.error(f"Failed to load model or columns: {e}")
|
| 31 |
+
# In a real application, you might return an error response or have a fallback
|
| 32 |
+
raise # Re-raise the exception to indicate a critical startup failure
|
| 33 |
|
| 34 |
@app.route('/', methods=['GET'])
|
| 35 |
def index():
|
|
|
|
| 39 |
@app.route('/health', methods=['GET'])
|
| 40 |
def health():
|
| 41 |
logger.info("Health check endpoint called")
|
| 42 |
+
# Check if the model and columns are loaded
|
| 43 |
+
if 'model' in globals() and 'columns' in globals():
|
| 44 |
+
return jsonify({'status': 'healthy', 'model_loaded': True})
|
| 45 |
+
else:
|
| 46 |
+
return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
|
| 47 |
+
|
| 48 |
|
| 49 |
@app.route('/predict', methods=['POST'])
|
| 50 |
def predict():
|
| 51 |
try:
|
| 52 |
data = request.get_json(force=True)
|
| 53 |
logger.info(f"Predict endpoint called with data: {data}")
|
| 54 |
+
|
| 55 |
+
if not isinstance(data, dict) or not data:
|
| 56 |
+
return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
|
| 57 |
+
|
| 58 |
input_df = pd.DataFrame(data)
|
| 59 |
+
|
| 60 |
+
# Ensure all expected columns are present and in the correct order
|
| 61 |
+
# This requires knowledge of the columns used during training preprocessing
|
| 62 |
categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
|
| 63 |
+
# Apply the same one-hot encoding and reindexing as in training
|
| 64 |
input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
|
| 65 |
+
|
| 66 |
+
# Reindex to match the training columns, filling missing with 0
|
| 67 |
input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
|
| 68 |
+
|
| 69 |
prediction = model.predict(input_encoded)
|
| 70 |
logger.info(f"Prediction made: {prediction.tolist()}")
|
| 71 |
return jsonify({'prediction': prediction.tolist()})
|
| 72 |
except Exception as e:
|
| 73 |
logger.error(f"Prediction failed: {e}")
|
| 74 |
return jsonify({'error': str(e)}), 400
|
| 75 |
+
|
| 76 |
+
# Note: waitress runs this app in Docker; don't call app.run()
|
deploy_tourism_model_diagnostic.py
CHANGED
|
@@ -6,6 +6,7 @@ from huggingface_hub import HfApi, login, upload_folder
|
|
| 6 |
import subprocess
|
| 7 |
import shutil
|
| 8 |
import logging
|
|
|
|
| 9 |
|
| 10 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 11 |
|
|
@@ -19,7 +20,7 @@ def manage_dependencies():
|
|
| 19 |
logging.warning("Installing required libraries...")
|
| 20 |
subprocess.check_call(["pip", "install", "--no-cache-dir",
|
| 21 |
"numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
|
| 22 |
-
"joblib==1.4.2", "dill==0.3.8", "
|
| 23 |
"flask==3.0.3", "waitress==3.0.0"])
|
| 24 |
logging.info("Libraries installed successfully.")
|
| 25 |
return True
|
|
@@ -75,39 +76,75 @@ def load_and_save_model(model_path):
|
|
| 75 |
colab_model_path = "/content/models/best_rf_model.joblib"
|
| 76 |
colab_columns_path = "/content/models/columns.joblib"
|
| 77 |
model_path = model_path or default_model_path
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
| 85 |
else:
|
| 86 |
-
logging.error(f"Model not found at {colab_model_path} or {
|
| 87 |
return False
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return False
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
| 101 |
return False
|
| 102 |
return True
|
| 103 |
|
| 104 |
def prepare_sample_data():
|
| 105 |
from datasets import load_dataset
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
def create_hosting_script():
|
| 113 |
hosting_script_content = """
|
|
@@ -122,16 +159,26 @@ logger = logging.getLogger(__name__)
|
|
| 122 |
|
| 123 |
app = Flask(__name__)
|
| 124 |
|
|
|
|
| 125 |
base_dir = '/app' if os.path.exists('/app') else os.getcwd()
|
| 126 |
logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
|
| 127 |
|
| 128 |
try:
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
except Exception as e:
|
| 133 |
logger.error(f"Failed to load model or columns: {e}")
|
| 134 |
-
|
|
|
|
| 135 |
|
| 136 |
@app.route('/', methods=['GET'])
|
| 137 |
def index():
|
|
@@ -141,23 +188,41 @@ def index():
|
|
| 141 |
@app.route('/health', methods=['GET'])
|
| 142 |
def health():
|
| 143 |
logger.info("Health check endpoint called")
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
@app.route('/predict', methods=['POST'])
|
| 147 |
def predict():
|
| 148 |
try:
|
| 149 |
data = request.get_json(force=True)
|
| 150 |
logger.info(f"Predict endpoint called with data: {data}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
input_df = pd.DataFrame(data)
|
|
|
|
|
|
|
|
|
|
| 152 |
categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
|
|
|
|
| 153 |
input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
|
|
|
|
|
|
|
| 154 |
input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
|
|
|
|
| 155 |
prediction = model.predict(input_encoded)
|
| 156 |
logger.info(f"Prediction made: {prediction.tolist()}")
|
| 157 |
return jsonify({'prediction': prediction.tolist()})
|
| 158 |
except Exception as e:
|
| 159 |
logger.error(f"Prediction failed: {e}")
|
| 160 |
return jsonify({'error': str(e)}), 400
|
|
|
|
|
|
|
| 161 |
"""
|
| 162 |
with open("app.py", "w") as f:
|
| 163 |
f.write(hosting_script_content)
|
|
@@ -165,17 +230,28 @@ def predict():
|
|
| 165 |
|
| 166 |
def upload_to_huggingface(space_name):
|
| 167 |
try:
|
| 168 |
-
api = HfApi()
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
logging.info(f"Created or verified Space: {space_name}")
|
| 171 |
-
|
|
|
|
| 172 |
required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
|
|
|
|
| 173 |
for file in required_files:
|
| 174 |
if not os.path.exists(file):
|
| 175 |
-
logging.error(f"Required file {file} not found")
|
| 176 |
return False
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
| 179 |
upload_folder(
|
| 180 |
folder_path=".",
|
| 181 |
repo_id=space_name,
|
|
@@ -190,18 +266,32 @@ def upload_to_huggingface(space_name):
|
|
| 190 |
return False
|
| 191 |
|
| 192 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
if manage_dependencies():
|
| 194 |
-
hf_token = os.getenv("HF_TOKEN")
|
| 195 |
authenticated = authenticate(hf_token) if hf_token else False
|
| 196 |
create_dockerfile()
|
| 197 |
create_requirements()
|
| 198 |
-
|
|
|
|
| 199 |
prepare_sample_data()
|
| 200 |
create_hosting_script()
|
| 201 |
if authenticated:
|
| 202 |
space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
|
| 203 |
upload_to_huggingface(space_name)
|
| 204 |
else:
|
| 205 |
-
logging.warning("Skipping upload due to authentication failure")
|
| 206 |
else:
|
| 207 |
-
logging.warning("Skipping data preparation and upload due to model loading failure")
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import subprocess
|
| 7 |
import shutil
|
| 8 |
import logging
|
| 9 |
+
from google.colab import userdata # Import userdata for local testing if needed
|
| 10 |
|
| 11 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 12 |
|
|
|
|
| 20 |
logging.warning("Installing required libraries...")
|
| 21 |
subprocess.check_call(["pip", "install", "--no-cache-dir",
|
| 22 |
"numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
|
| 23 |
+
"joblib==1.4.2", "dill==0.3.8", "huggingface-hub==0.23.0",
|
| 24 |
"flask==3.0.3", "waitress==3.0.0"])
|
| 25 |
logging.info("Libraries installed successfully.")
|
| 26 |
return True
|
|
|
|
| 76 |
colab_model_path = "/content/models/best_rf_model.joblib"
|
| 77 |
colab_columns_path = "/content/models/columns.joblib"
|
| 78 |
model_path = model_path or default_model_path
|
| 79 |
+
|
| 80 |
+
# Determine the source paths based on where the script is run
|
| 81 |
+
if os.path.exists(colab_model_path) and os.path.exists(colab_columns_path):
|
| 82 |
+
src_model = colab_model_path
|
| 83 |
+
src_columns = colab_columns_path
|
| 84 |
+
logging.info("Using model and columns from Colab specific path.")
|
| 85 |
+
elif os.path.exists(default_model_path) and os.path.exists(default_columns_path):
|
| 86 |
+
src_model = default_model_path
|
| 87 |
+
src_columns = default_columns_path
|
| 88 |
+
logging.info("Using model and columns from default models directory.")
|
| 89 |
else:
|
| 90 |
+
logging.error(f"Model or columns files not found at {colab_model_path}, {colab_columns_path}, {default_model_path}, or {default_columns_path}")
|
| 91 |
return False
|
| 92 |
+
|
| 93 |
+
# Define the destination paths in the current directory
|
| 94 |
+
dest_model = "model.joblib"
|
| 95 |
+
dest_columns = "columns.joblib"
|
| 96 |
+
|
| 97 |
+
try:
|
| 98 |
+
# Copy model file, avoiding SameFileError
|
| 99 |
+
if not os.path.exists(dest_model) or not os.path.samefile(src_model, dest_model):
|
| 100 |
+
shutil.copy(src_model, dest_model)
|
| 101 |
+
logging.info(f"Model copied from {src_model} to {dest_model}")
|
| 102 |
+
else:
|
| 103 |
+
logging.info(f"Model source and destination are the same ({src_model}), skipping copy.")
|
| 104 |
+
|
| 105 |
+
# Copy columns file, avoiding SameFileError
|
| 106 |
+
if not os.path.exists(dest_columns) or not os.path.samefile(src_columns, dest_columns):
|
| 107 |
+
shutil.copy(src_columns, dest_columns)
|
| 108 |
+
logging.info(f"Columns copied from {src_columns} to {dest_columns}")
|
| 109 |
+
else:
|
| 110 |
+
logging.info(f"Columns source and destination are the same ({src_columns}), skipping copy.")
|
| 111 |
+
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logging.error(f"Error during file copy: {e}")
|
| 114 |
return False
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Verify both files exist in the deployment directory
|
| 118 |
+
if not os.path.exists(dest_model) or not os.path.exists(dest_columns):
|
| 119 |
+
logging.error("Model or columns files not found in deployment directory after copy attempt.")
|
| 120 |
return False
|
| 121 |
return True
|
| 122 |
|
| 123 |
def prepare_sample_data():
|
| 124 |
from datasets import load_dataset
|
| 125 |
+
try:
|
| 126 |
+
dataset = load_dataset("Shramik121/tourism-split-dataset")
|
| 127 |
+
sample_df = pd.DataFrame(dataset['test']).sample(min(3, len(dataset['test'])))
|
| 128 |
+
sample_df.drop(columns=['ProdTaken', 'Unnamed: 0', '__index_level_0__'], inplace=True, errors='ignore') # Drop unnecessary columns
|
| 129 |
+
sample_df.to_csv("input_data.csv", index=False)
|
| 130 |
+
logging.info("Input data saved to input_data.csv")
|
| 131 |
+
except Exception as e:
|
| 132 |
+
logging.error(f"Failed to prepare sample data: {e}")
|
| 133 |
+
# Create a dummy sample data if loading fails
|
| 134 |
+
sample_inputs = {
|
| 135 |
+
'Age': [41.0], 'TypeofContact': ['Self Enquiry'], 'CityTier': [3],
|
| 136 |
+
'DurationOfPitch': [6.0], 'Occupation': ['Salaried'], 'Gender': ['Female'],
|
| 137 |
+
'NumberOfPersonVisiting': [3], 'NumberOfFollowups': [3.0],
|
| 138 |
+
'ProductPitched': ['Deluxe'], 'PreferredPropertyStar': [3.0],
|
| 139 |
+
'MaritalStatus': ['Single'], 'NumberOfTrips': [1.0], 'Passport': [1],
|
| 140 |
+
'PitchSatisfactionScore': [2], 'OwnCar': [1],
|
| 141 |
+
'NumberOfChildrenVisiting': [0.0], 'Designation': ['Manager'],
|
| 142 |
+
'MonthlyIncome': [20993.0]
|
| 143 |
+
}
|
| 144 |
+
input_df = pd.DataFrame(sample_inputs)
|
| 145 |
+
input_df.to_csv("input_data.csv", index=False)
|
| 146 |
+
logging.warning("Using dummy sample data due to loading failure.")
|
| 147 |
+
|
| 148 |
|
| 149 |
def create_hosting_script():
|
| 150 |
hosting_script_content = """
|
|
|
|
| 159 |
|
| 160 |
app = Flask(__name__)
|
| 161 |
|
| 162 |
+
# Determine the base directory based on the environment
|
| 163 |
base_dir = '/app' if os.path.exists('/app') else os.getcwd()
|
| 164 |
logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
|
| 165 |
|
| 166 |
try:
|
| 167 |
+
model_path = os.path.join(base_dir, "model.joblib")
|
| 168 |
+
columns_path = os.path.join(base_dir, "columns.joblib")
|
| 169 |
+
|
| 170 |
+
if not os.path.exists(model_path):
|
| 171 |
+
raise FileNotFoundError(f"model.joblib not found at {model_path}")
|
| 172 |
+
if not os.path.exists(columns_path):
|
| 173 |
+
raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
|
| 174 |
+
|
| 175 |
+
model = joblib.load(model_path)
|
| 176 |
+
columns = joblib.load(columns_path)
|
| 177 |
+
logger.info(f"Model and columns loaded successfully from {base_dir}")
|
| 178 |
except Exception as e:
|
| 179 |
logger.error(f"Failed to load model or columns: {e}")
|
| 180 |
+
# In a real application, you might return an error response or have a fallback
|
| 181 |
+
raise # Re-raise the exception to indicate a critical startup failure
|
| 182 |
|
| 183 |
@app.route('/', methods=['GET'])
|
| 184 |
def index():
|
|
|
|
| 188 |
@app.route('/health', methods=['GET'])
|
| 189 |
def health():
|
| 190 |
logger.info("Health check endpoint called")
|
| 191 |
+
# Check if the model and columns are loaded
|
| 192 |
+
if 'model' in globals() and 'columns' in globals():
|
| 193 |
+
return jsonify({'status': 'healthy', 'model_loaded': True})
|
| 194 |
+
else:
|
| 195 |
+
return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
|
| 196 |
+
|
| 197 |
|
| 198 |
@app.route('/predict', methods=['POST'])
|
| 199 |
def predict():
|
| 200 |
try:
|
| 201 |
data = request.get_json(force=True)
|
| 202 |
logger.info(f"Predict endpoint called with data: {data}")
|
| 203 |
+
|
| 204 |
+
if not isinstance(data, dict) or not data:
|
| 205 |
+
return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
|
| 206 |
+
|
| 207 |
input_df = pd.DataFrame(data)
|
| 208 |
+
|
| 209 |
+
# Ensure all expected columns are present and in the correct order
|
| 210 |
+
# This requires knowledge of the columns used during training preprocessing
|
| 211 |
categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
|
| 212 |
+
# Apply the same one-hot encoding and reindexing as in training
|
| 213 |
input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
|
| 214 |
+
|
| 215 |
+
# Reindex to match the training columns, filling missing with 0
|
| 216 |
input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
|
| 217 |
+
|
| 218 |
prediction = model.predict(input_encoded)
|
| 219 |
logger.info(f"Prediction made: {prediction.tolist()}")
|
| 220 |
return jsonify({'prediction': prediction.tolist()})
|
| 221 |
except Exception as e:
|
| 222 |
logger.error(f"Prediction failed: {e}")
|
| 223 |
return jsonify({'error': str(e)}), 400
|
| 224 |
+
|
| 225 |
+
# Note: waitress runs this app in Docker; don't call app.run()
|
| 226 |
"""
|
| 227 |
with open("app.py", "w") as f:
|
| 228 |
f.write(hosting_script_content)
|
|
|
|
| 230 |
|
| 231 |
def upload_to_huggingface(space_name):
|
| 232 |
try:
|
| 233 |
+
api = HfApi(token=os.getenv("HF_TOKEN"))
|
| 234 |
+
# Create the Space if it doesn't exist
|
| 235 |
+
api.create_repo(
|
| 236 |
+
repo_id=space_name,
|
| 237 |
+
repo_type="space",
|
| 238 |
+
space_sdk="docker",
|
| 239 |
+
private=False,
|
| 240 |
+
exist_ok=True
|
| 241 |
+
)
|
| 242 |
logging.info(f"Created or verified Space: {space_name}")
|
| 243 |
+
|
| 244 |
+
# Verify files to be uploaded
|
| 245 |
required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
|
| 246 |
+
logging.info("Checking required files for upload: %s", required_files)
|
| 247 |
for file in required_files:
|
| 248 |
if not os.path.exists(file):
|
| 249 |
+
logging.error(f"Required file {file} not found in deployment directory")
|
| 250 |
return False
|
| 251 |
+
else:
|
| 252 |
+
logging.info(f"File {file} exists in deployment directory")
|
| 253 |
+
|
| 254 |
+
# Upload files to the Space
|
| 255 |
upload_folder(
|
| 256 |
folder_path=".",
|
| 257 |
repo_id=space_name,
|
|
|
|
| 266 |
return False
|
| 267 |
|
| 268 |
if __name__ == "__main__":
|
| 269 |
+
# Get HF_TOKEN from environment or Colab secrets for local testing
|
| 270 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 271 |
+
if not hf_token:
|
| 272 |
+
try:
|
| 273 |
+
hf_token = userdata.get('HF_TOKEN')
|
| 274 |
+
logging.info("Retrieved HF_TOKEN from Colab secrets.")
|
| 275 |
+
except:
|
| 276 |
+
logging.warning("HF_TOKEN not found in environment variables or Colab secrets.")
|
| 277 |
+
hf_token = None
|
| 278 |
+
|
| 279 |
+
|
| 280 |
if manage_dependencies():
|
|
|
|
| 281 |
authenticated = authenticate(hf_token) if hf_token else False
|
| 282 |
create_dockerfile()
|
| 283 |
create_requirements()
|
| 284 |
+
# No need to pass model_path from env here, load_and_save_model handles finding it
|
| 285 |
+
if load_and_save_model(None): # Pass None to let the function find the model
|
| 286 |
prepare_sample_data()
|
| 287 |
create_hosting_script()
|
| 288 |
if authenticated:
|
| 289 |
space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
|
| 290 |
upload_to_huggingface(space_name)
|
| 291 |
else:
|
| 292 |
+
logging.warning("Skipping upload to Hugging Face due to authentication failure.")
|
| 293 |
else:
|
| 294 |
+
logging.warning("Skipping data preparation, hosting script, and upload due to model loading failure.")
|
| 295 |
+
else:
|
| 296 |
+
logging.warning("Skipping execution due to dependency issues.")
|
| 297 |
+
|
input_data.csv
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
| 1 |
+
Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
|
| 2 |
+
37.0,Company Invited,2,9.0,Salaried,Male,4,5.0,Deluxe,3.0,Unmarried,3.0,0,3,1,1.0,Manager,22564.0
|
| 3 |
+
34.0,Self Enquiry,1,21.0,Small Business,Female,4,4.0,Basic,4.0,Divorced,3.0,0,5,0,1.0,Executive,21434.0
|
| 4 |
+
36.0,Self Enquiry,1,8.0,Salaried,Female,3,3.0,Basic,3.0,Married,5.0,0,5,1,0.0,Executive,17543.0
|
models/best_rf_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ac2e0071a5fe49d998eef5a5d0bdf4a41859d405c9c7f6646385d56ecf9c72a
|
| 3 |
+
size 8651122
|
src/data_prep.py
CHANGED
|
@@ -8,15 +8,31 @@ def prepare_data():
|
|
| 8 |
data = pd.DataFrame(dataset['train'])
|
| 9 |
if 'Unnamed: 0' in data.columns:
|
| 10 |
data = data.drop('Unnamed: 0', axis=1)
|
|
|
|
| 11 |
data = data.dropna()
|
| 12 |
if 'CustomerID' in data:
|
| 13 |
data = data.drop('CustomerID', axis=1)
|
| 14 |
if 'Gender' in data:
|
| 15 |
data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
if __name__ == "__main__":
|
| 22 |
prepare_data()
|
|
|
|
| 8 |
data = pd.DataFrame(dataset['train'])
|
| 9 |
if 'Unnamed: 0' in data.columns:
|
| 10 |
data = data.drop('Unnamed: 0', axis=1)
|
| 11 |
+
# Drop rows with missing values for simplicity as done in EDA
|
| 12 |
data = data.dropna()
|
| 13 |
if 'CustomerID' in data:
|
| 14 |
data = data.drop('CustomerID', axis=1)
|
| 15 |
if 'Gender' in data:
|
| 16 |
data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
|
| 17 |
+
|
| 18 |
+
# Save processed data to a specific location within the repo
|
| 19 |
+
output_dir = os.getenv('OUTPUT_DIR', 'data')
|
| 20 |
+
os.makedirs(output_dir, exist_ok=True)
|
| 21 |
+
data.to_csv(os.path.join(output_dir, 'processed.csv'), index=False)
|
| 22 |
+
|
| 23 |
+
# Save test data to a specific location within the repo
|
| 24 |
+
test_data = pd.DataFrame(dataset['test'])
|
| 25 |
+
if 'Unnamed: 0' in test_data.columns:
|
| 26 |
+
test_data = test_data.drop('Unnamed: 0', axis=1)
|
| 27 |
+
test_data = test_data.dropna() # Also drop missing for consistency with train
|
| 28 |
+
if 'CustomerID' in test_data:
|
| 29 |
+
test_data = test_data.drop('CustomerID', axis=1)
|
| 30 |
+
if 'Gender' in test_data:
|
| 31 |
+
test_data['Gender'] = test_data['Gender'].replace('Fe Male', 'Female')
|
| 32 |
+
|
| 33 |
+
test_data.to_csv(os.path.join(output_dir, 'test.csv'), index=False)
|
| 34 |
+
|
| 35 |
+
print("Data prepared and saved to", output_dir)
|
| 36 |
|
| 37 |
if __name__ == "__main__":
|
| 38 |
prepare_data()
|
src/evaluate.py
CHANGED
|
@@ -3,33 +3,77 @@ import os
|
|
| 3 |
import pandas as pd
|
| 4 |
import joblib
|
| 5 |
import json
|
| 6 |
-
from sklearn.metrics import accuracy_score
|
| 7 |
from datasets import load_dataset
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def evaluate_model():
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
'NumberOfChildrenVisiting', 'MonthlyIncome']
|
| 18 |
-
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
|
| 19 |
'MaritalStatus', 'Designation', 'CityTier']
|
| 20 |
-
|
|
|
|
| 21 |
data[num_cols] = data[num_cols].fillna(data[num_cols].median())
|
| 22 |
data[cat_cols] = data[cat_cols].fillna('Unknown')
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
json.dump(results, f)
|
| 32 |
-
|
|
|
|
|
|
|
| 33 |
|
| 34 |
if __name__ == "__main__":
|
| 35 |
evaluate_model()
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import joblib
|
| 5 |
import json
|
| 6 |
+
from sklearn.metrics import accuracy_score, f1_score
|
| 7 |
from datasets import load_dataset
|
| 8 |
+
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
| 9 |
+
from sklearn.compose import ColumnTransformer
|
| 10 |
+
from sklearn.pipeline import Pipeline # Import Pipeline
|
| 11 |
|
| 12 |
def evaluate_model():
|
| 13 |
+
model_path = os.getenv('MODEL_PATH', 'models/model.joblib')
|
| 14 |
+
test_data_path = os.getenv('TEST_DATA', 'data/test.csv')
|
| 15 |
+
evaluation_output_path = 'evaluation_results.json' # Define output path
|
| 16 |
+
|
| 17 |
+
if not os.path.exists(model_path):
|
| 18 |
+
print(f"Error: Model file not found at {model_path}")
|
| 19 |
+
results = {'error': f'Model file not found at {model_path}'}
|
| 20 |
+
with open(evaluation_output_path, 'w') as f:
|
| 21 |
+
json.dump(results, f)
|
| 22 |
+
return
|
| 23 |
+
|
| 24 |
+
if not os.path.exists(test_data_path):
|
| 25 |
+
print(f"Error: Test data file not found at {test_data_path}")
|
| 26 |
+
results = {'error': f'Test data file not found at {test_data_path}'}
|
| 27 |
+
with open(evaluation_output_path, 'w') as f:
|
| 28 |
+
json.dump(results, f)
|
| 29 |
+
return
|
| 30 |
+
|
| 31 |
+
model = joblib.load(model_path)
|
| 32 |
+
data = pd.read_csv(test_data_path)
|
| 33 |
+
|
| 34 |
+
# Apply the same preprocessing steps as in train.py to the test data
|
| 35 |
+
num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
|
| 36 |
+
'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
|
| 37 |
'NumberOfChildrenVisiting', 'MonthlyIncome']
|
| 38 |
+
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
|
| 39 |
'MaritalStatus', 'Designation', 'CityTier']
|
| 40 |
+
|
| 41 |
+
# Handle missing values (consistent with training)
|
| 42 |
data[num_cols] = data[num_cols].fillna(data[num_cols].median())
|
| 43 |
data[cat_cols] = data[cat_cols].fillna('Unknown')
|
| 44 |
+
|
| 45 |
+
X_test = data.drop(columns=['ProdTaken'])
|
| 46 |
+
y_test = data['ProdTaken']
|
| 47 |
+
|
| 48 |
+
# Ensure the loaded model is a pipeline and can process the raw X_test
|
| 49 |
+
if isinstance(model, Pipeline):
|
| 50 |
+
predictions = model.predict(X_test)
|
| 51 |
+
else:
|
| 52 |
+
# If the loaded model is just the classifier, apply preprocessing manually
|
| 53 |
+
print("Warning: Loaded model is not a pipeline. Applying preprocessing manually.")
|
| 54 |
+
preprocessor = ColumnTransformer(
|
| 55 |
+
transformers=[
|
| 56 |
+
('num', StandardScaler(), num_cols),
|
| 57 |
+
('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
|
| 58 |
+
],
|
| 59 |
+
remainder='passthrough'
|
| 60 |
+
)
|
| 61 |
+
X_test_processed = preprocessor.fit_transform(X_test) # Use fit_transform for consistency
|
| 62 |
+
predictions = model.predict(X_test_processed)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
accuracy = accuracy_score(y_test, predictions)
|
| 66 |
+
f1 = f1_score(y_test, predictions)
|
| 67 |
+
|
| 68 |
+
results = {
|
| 69 |
+
'accuracy': accuracy,
|
| 70 |
+
'f1_score': f1
|
| 71 |
+
}
|
| 72 |
+
with open(evaluation_output_path, 'w') as f:
|
| 73 |
json.dump(results, f)
|
| 74 |
+
|
| 75 |
+
print(f"Model Accuracy: {accuracy}")
|
| 76 |
+
print(f"Model F1 Score: {f1}")
|
| 77 |
|
| 78 |
if __name__ == "__main__":
|
| 79 |
evaluate_model()
|
src/train.py
CHANGED
|
@@ -9,36 +9,71 @@ import os
|
|
| 9 |
from datasets import load_dataset
|
| 10 |
|
| 11 |
def train_model():
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
if
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
'NumberOfChildrenVisiting', 'MonthlyIncome']
|
| 20 |
-
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
|
| 21 |
'MaritalStatus', 'Designation', 'CityTier']
|
| 22 |
-
|
|
|
|
| 23 |
data[num_cols] = data[num_cols].fillna(data[num_cols].median())
|
| 24 |
data[cat_cols] = data[cat_cols].fillna('Unknown')
|
| 25 |
-
|
| 26 |
X = data.drop(columns=['ProdTaken'])
|
| 27 |
y = data['ProdTaken']
|
| 28 |
-
|
| 29 |
preprocessor = ColumnTransformer(
|
| 30 |
transformers=[
|
| 31 |
('num', StandardScaler(), num_cols),
|
| 32 |
('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
|
| 33 |
-
]
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
| 36 |
('classifier', RandomForestClassifier(random_state=42))])
|
| 37 |
-
|
| 38 |
pipeline.fit(X, y)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
os.makedirs('models', exist_ok=True)
|
| 43 |
joblib.dump(columns, 'models/columns.joblib')
|
| 44 |
joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
|
|
|
|
| 9 |
from datasets import load_dataset
|
| 10 |
|
| 11 |
def train_model():
|
| 12 |
+
# Load processed data
|
| 13 |
+
data_path = os.getenv('DATA_PATH', 'data/processed.csv')
|
| 14 |
+
if not os.path.exists(data_path):
|
| 15 |
+
print(f"Error: Data file not found at {data_path}")
|
| 16 |
+
# Fallback to loading from dataset if file not found (e.g., in initial run)
|
| 17 |
+
try:
|
| 18 |
+
dataset = load_dataset("Shramik121/tourism-split-dataset")
|
| 19 |
+
data = pd.DataFrame(dataset['train'])
|
| 20 |
+
if 'Unnamed: 0' in data.columns:
|
| 21 |
+
data = data.drop('Unnamed: 0', axis=1)
|
| 22 |
+
data = data.dropna()
|
| 23 |
+
if 'CustomerID' in data:
|
| 24 |
+
data = data.drop('CustomerID', axis=1)
|
| 25 |
+
if 'Gender' in data:
|
| 26 |
+
data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
|
| 27 |
+
print("Loaded data from Hugging Face dataset.")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"Failed to load data from file or Hugging Face: {e}")
|
| 30 |
+
return # Exit if data cannot be loaded
|
| 31 |
+
else:
|
| 32 |
+
data = pd.read_csv(data_path)
|
| 33 |
+
print(f"Loaded data from {data_path}")
|
| 34 |
+
|
| 35 |
+
num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
|
| 36 |
+
'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
|
| 37 |
'NumberOfChildrenVisiting', 'MonthlyIncome']
|
| 38 |
+
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
|
| 39 |
'MaritalStatus', 'Designation', 'CityTier']
|
| 40 |
+
|
| 41 |
+
# Handle missing values (should be minimal after data_prep, but for robustness)
|
| 42 |
data[num_cols] = data[num_cols].fillna(data[num_cols].median())
|
| 43 |
data[cat_cols] = data[cat_cols].fillna('Unknown')
|
| 44 |
+
|
| 45 |
X = data.drop(columns=['ProdTaken'])
|
| 46 |
y = data['ProdTaken']
|
| 47 |
+
|
| 48 |
preprocessor = ColumnTransformer(
|
| 49 |
transformers=[
|
| 50 |
('num', StandardScaler(), num_cols),
|
| 51 |
('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
|
| 52 |
+
],
|
| 53 |
+
remainder='passthrough' # Keep other columns (like Passport, OwnCar)
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
| 57 |
('classifier', RandomForestClassifier(random_state=42))])
|
| 58 |
+
|
| 59 |
pipeline.fit(X, y)
|
| 60 |
+
|
| 61 |
+
# Extract and save the list of columns *after* preprocessing
|
| 62 |
+
# This is crucial for the prediction script
|
| 63 |
+
# We can create a dummy dataframe processed by the preprocessor to get column names
|
| 64 |
+
dummy_df = pd.DataFrame(columns=X.columns)
|
| 65 |
+
dummy_processed = preprocessor.transform(dummy_df)
|
| 66 |
+
|
| 67 |
+
# Get feature names from preprocessor
|
| 68 |
+
feature_names = []
|
| 69 |
+
for name, transformer, cols in preprocessor.transformers_:
|
| 70 |
+
if hasattr(transformer, 'get_feature_names_out'):
|
| 71 |
+
feature_names.extend(transformer.get_feature_names_out(cols))
|
| 72 |
+
else:
|
| 73 |
+
feature_names.extend(cols) # For transformers without get_feature_names_out
|
| 74 |
+
|
| 75 |
+
columns = feature_names
|
| 76 |
+
|
| 77 |
os.makedirs('models', exist_ok=True)
|
| 78 |
joblib.dump(columns, 'models/columns.joblib')
|
| 79 |
joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
|
src/train_colab_model.py
CHANGED
|
@@ -8,36 +8,57 @@ from sklearn.pipeline import Pipeline
|
|
| 8 |
from datasets import load_dataset
|
| 9 |
import os
|
| 10 |
|
|
|
|
| 11 |
dataset = load_dataset("Shramik121/tourism-split-dataset")
|
| 12 |
data = pd.DataFrame(dataset['train'])
|
|
|
|
|
|
|
| 13 |
if 'Unnamed: 0' in data.columns:
|
| 14 |
data = data.drop('Unnamed: 0', axis=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
'NumberOfChildrenVisiting', 'MonthlyIncome']
|
| 19 |
-
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
|
| 20 |
'MaritalStatus', 'Designation', 'CityTier']
|
| 21 |
|
| 22 |
-
data[num_cols] = data[num_cols].fillna(data[num_cols].median())
|
| 23 |
-
data[cat_cols] = data[cat_cols].fillna('Unknown')
|
| 24 |
-
|
| 25 |
-
X = data.drop(columns=['ProdTaken'])
|
| 26 |
-
y = data['ProdTaken']
|
| 27 |
-
|
| 28 |
preprocessor = ColumnTransformer(
|
| 29 |
transformers=[
|
| 30 |
('num', StandardScaler(), num_cols),
|
| 31 |
('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
|
| 32 |
-
]
|
|
|
|
|
|
|
| 33 |
|
|
|
|
| 34 |
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
| 35 |
('classifier', RandomForestClassifier(random_state=42))])
|
| 36 |
|
|
|
|
| 37 |
pipeline.fit(X, y)
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
os.makedirs('/content/models', exist_ok=True)
|
| 42 |
joblib.dump(columns, '/content/models/columns.joblib')
|
| 43 |
joblib.dump(pipeline, '/content/models/best_rf_model.joblib')
|
|
|
|
from datasets import load_dataset
import os

# Load the data from the Hugging Face dataset
dataset = load_dataset("Shramik121/tourism-split-dataset")
data = pd.DataFrame(dataset['train'])

# Clean the data (same steps as in the EDA cell and data_prep)
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)
data_clean = data.dropna()
if 'CustomerID' in data_clean:
    data_clean = data_clean.drop(columns=['CustomerID'])
if 'Gender' in data_clean:
    # Fix the known 'Fe Male' typo in the source data.
    data_clean['Gender'] = data_clean['Gender'].replace('Fe Male', 'Female')

# Define features and target
X = data_clean.drop('ProdTaken', axis=1)
y = data_clean['ProdTaken']

# Define preprocessing steps (consistent with train.py)
num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
            'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
            'NumberOfChildrenVisiting', 'MonthlyIncome']
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
            'MaritalStatus', 'Designation', 'CityTier']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Train the pipeline
pipeline.fit(X, y)

# Extract and save column names AFTER preprocessing.
#
# BUG FIX: the previous code called `preprocessor.fit(dummy_df)` on an
# EMPTY DataFrame.  StandardScaler cannot be fitted on zero samples
# (raises ValueError), and because Pipeline fits this very same
# preprocessor object in place, re-fitting it here would also clobber
# the statistics learned during pipeline.fit().  The preprocessor is
# already fitted — just read the feature names from it.  The remainder
# ('passthrough') entry carries column *indices*, so map those back to
# the original names instead of appending raw integers.
feature_names = []
for name, transformer, cols in preprocessor.transformers_:
    if name == 'remainder':
        feature_names.extend(X.columns[cols])
    elif hasattr(transformer, 'get_feature_names_out'):
        feature_names.extend(transformer.get_feature_names_out(cols))
    else:
        feature_names.extend(cols)  # Fallback for name-preserving transformers

columns = list(feature_names)

os.makedirs('/content/models', exist_ok=True)
joblib.dump(columns, '/content/models/columns.joblib')
joblib.dump(pipeline, '/content/models/best_rf_model.joblib')
|
tests/test_model.py
CHANGED
|
@@ -12,5 +12,6 @@ def test_columns_exists():
|
|
| 12 |
assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
|
| 13 |
|
| 14 |
def test_model_loads():
|
| 15 |
-
|
|
|
|
| 16 |
assert model is not None, "Failed to load model"
|
|
|
|
| 12 |
assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
|
| 13 |
|
| 14 |
def test_model_loads():
    """The serialized model at MODEL_PATH (default models/model.joblib) deserializes."""
    path = os.getenv('MODEL_PATH', 'models/model.joblib')
    loaded = joblib.load(path)
    assert loaded is not None, "Failed to load model"