Shramik121 commited on
Commit
3768986
·
verified ·
1 Parent(s): 0551a30

Upload model and application files to Hugging Face Space

Browse files
app.py CHANGED
@@ -10,16 +10,26 @@ logger = logging.getLogger(__name__)
10
 
11
  app = Flask(__name__)
12
 
 
13
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
14
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
15
 
16
  try:
17
- model = joblib.load(os.path.join(base_dir, "model.joblib"))
18
- columns = joblib.load(os.path.join(base_dir, "columns.joblib"))
19
- logger.info("Model and columns loaded successfully")
 
 
 
 
 
 
 
 
20
  except Exception as e:
21
  logger.error(f"Failed to load model or columns: {e}")
22
- raise
 
23
 
24
  @app.route('/', methods=['GET'])
25
  def index():
@@ -29,20 +39,38 @@ def index():
29
  @app.route('/health', methods=['GET'])
30
  def health():
31
  logger.info("Health check endpoint called")
32
- return jsonify({'status': 'healthy'})
 
 
 
 
 
33
 
34
  @app.route('/predict', methods=['POST'])
35
  def predict():
36
  try:
37
  data = request.get_json(force=True)
38
  logger.info(f"Predict endpoint called with data: {data}")
 
 
 
 
39
  input_df = pd.DataFrame(data)
 
 
 
40
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
 
41
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
 
 
42
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
 
43
  prediction = model.predict(input_encoded)
44
  logger.info(f"Prediction made: {prediction.tolist()}")
45
  return jsonify({'prediction': prediction.tolist()})
46
  except Exception as e:
47
  logger.error(f"Prediction failed: {e}")
48
  return jsonify({'error': str(e)}), 400
 
 
 
10
 
11
  app = Flask(__name__)
12
 
13
+ # Determine the base directory based on the environment
14
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
15
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
16
 
17
  try:
18
+ model_path = os.path.join(base_dir, "model.joblib")
19
+ columns_path = os.path.join(base_dir, "columns.joblib")
20
+
21
+ if not os.path.exists(model_path):
22
+ raise FileNotFoundError(f"model.joblib not found at {model_path}")
23
+ if not os.path.exists(columns_path):
24
+ raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
25
+
26
+ model = joblib.load(model_path)
27
+ columns = joblib.load(columns_path)
28
+ logger.info(f"Model and columns loaded successfully from {base_dir}")
29
  except Exception as e:
30
  logger.error(f"Failed to load model or columns: {e}")
31
+ # In a real application, you might return an error response or have a fallback
32
+ raise # Re-raise the exception to indicate a critical startup failure
33
 
34
  @app.route('/', methods=['GET'])
35
  def index():
 
39
  @app.route('/health', methods=['GET'])
40
  def health():
41
  logger.info("Health check endpoint called")
42
+ # Check if the model and columns are loaded
43
+ if 'model' in globals() and 'columns' in globals():
44
+ return jsonify({'status': 'healthy', 'model_loaded': True})
45
+ else:
46
+ return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
47
+
48
 
49
  @app.route('/predict', methods=['POST'])
50
  def predict():
51
  try:
52
  data = request.get_json(force=True)
53
  logger.info(f"Predict endpoint called with data: {data}")
54
+
55
+ if not isinstance(data, dict) or not data:
56
+ return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
57
+
58
  input_df = pd.DataFrame(data)
59
+
60
+ # Ensure all expected columns are present and in the correct order
61
+ # This requires knowledge of the columns used during training preprocessing
62
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
63
+ # Apply the same one-hot encoding and reindexing as in training
64
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
65
+
66
+ # Reindex to match the training columns, filling missing with 0
67
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
68
+
69
  prediction = model.predict(input_encoded)
70
  logger.info(f"Prediction made: {prediction.tolist()}")
71
  return jsonify({'prediction': prediction.tolist()})
72
  except Exception as e:
73
  logger.error(f"Prediction failed: {e}")
74
  return jsonify({'error': str(e)}), 400
75
+
76
+ # Note: waitress runs this app in Docker; don't call app.run()
deploy_tourism_model_diagnostic.py CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import HfApi, login, upload_folder
6
  import subprocess
7
  import shutil
8
  import logging
 
9
 
10
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
 
@@ -19,7 +20,7 @@ def manage_dependencies():
19
  logging.warning("Installing required libraries...")
20
  subprocess.check_call(["pip", "install", "--no-cache-dir",
21
  "numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
22
- "joblib==1.4.2", "dill==0.3.8", "huggingface_hub==0.23.0",
23
  "flask==3.0.3", "waitress==3.0.0"])
24
  logging.info("Libraries installed successfully.")
25
  return True
@@ -75,39 +76,75 @@ def load_and_save_model(model_path):
75
  colab_model_path = "/content/models/best_rf_model.joblib"
76
  colab_columns_path = "/content/models/columns.joblib"
77
  model_path = model_path or default_model_path
78
-
79
- if os.path.exists(colab_model_path):
80
- shutil.copy(colab_model_path, "model.joblib")
81
- logging.info(f"Model copied from {colab_model_path} to model.joblib")
82
- elif os.path.exists(model_path):
83
- shutil.copy(model_path, "model.joblib")
84
- logging.info(f"Model copied from {model_path} to model.joblib")
 
 
 
85
  else:
86
- logging.error(f"Model not found at {colab_model_path} or {model_path}")
87
  return False
88
-
89
- if os.path.exists(colab_columns_path):
90
- shutil.copy(colab_columns_path, "columns.joblib")
91
- logging.info(f"Columns copied from {colab_columns_path} to columns.joblib")
92
- elif os.path.exists(default_columns_path):
93
- shutil.copy(default_columns_path, "columns.joblib")
94
- logging.info(f"Columns copied from {default_columns_path} to columns.joblib")
95
- else:
96
- logging.error(f"Columns file not found at {colab_columns_path} or {default_columns_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  return False
98
-
99
- if not os.path.exists("model.joblib") or not os.path.exists("columns.joblib"):
100
- logging.error("Model or columns files not found in deployment directory")
 
 
101
  return False
102
  return True
103
 
104
  def prepare_sample_data():
105
  from datasets import load_dataset
106
- dataset = load_dataset("Shramik121/tourism-split-dataset")
107
- sample_df = pd.DataFrame(dataset['test']).sample(3)
108
- sample_df.drop(columns=['ProdTaken'], inplace=True, errors='ignore')
109
- sample_df.to_csv("input_data.csv", index=False)
110
- logging.info("Input data saved to input_data.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  def create_hosting_script():
113
  hosting_script_content = """
@@ -122,16 +159,26 @@ logger = logging.getLogger(__name__)
122
 
123
  app = Flask(__name__)
124
 
 
125
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
126
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
127
 
128
  try:
129
- model = joblib.load(os.path.join(base_dir, "model.joblib"))
130
- columns = joblib.load(os.path.join(base_dir, "columns.joblib"))
131
- logger.info("Model and columns loaded successfully")
 
 
 
 
 
 
 
 
132
  except Exception as e:
133
  logger.error(f"Failed to load model or columns: {e}")
134
- raise
 
135
 
136
  @app.route('/', methods=['GET'])
137
  def index():
@@ -141,23 +188,41 @@ def index():
141
  @app.route('/health', methods=['GET'])
142
  def health():
143
  logger.info("Health check endpoint called")
144
- return jsonify({'status': 'healthy'})
 
 
 
 
 
145
 
146
  @app.route('/predict', methods=['POST'])
147
  def predict():
148
  try:
149
  data = request.get_json(force=True)
150
  logger.info(f"Predict endpoint called with data: {data}")
 
 
 
 
151
  input_df = pd.DataFrame(data)
 
 
 
152
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
 
153
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
 
 
154
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
 
155
  prediction = model.predict(input_encoded)
156
  logger.info(f"Prediction made: {prediction.tolist()}")
157
  return jsonify({'prediction': prediction.tolist()})
158
  except Exception as e:
159
  logger.error(f"Prediction failed: {e}")
160
  return jsonify({'error': str(e)}), 400
 
 
161
  """
162
  with open("app.py", "w") as f:
163
  f.write(hosting_script_content)
@@ -165,17 +230,28 @@ def predict():
165
 
166
  def upload_to_huggingface(space_name):
167
  try:
168
- api = HfApi()
169
- api.create_repo(repo_id=space_name, repo_type="space", space_sdk="docker", private=False, exist_ok=True)
 
 
 
 
 
 
 
170
  logging.info(f"Created or verified Space: {space_name}")
171
-
 
172
  required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
 
173
  for file in required_files:
174
  if not os.path.exists(file):
175
- logging.error(f"Required file {file} not found")
176
  return False
177
- logging.info(f"File {file} exists")
178
-
 
 
179
  upload_folder(
180
  folder_path=".",
181
  repo_id=space_name,
@@ -190,18 +266,32 @@ def upload_to_huggingface(space_name):
190
  return False
191
 
192
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
193
  if manage_dependencies():
194
- hf_token = os.getenv("HF_TOKEN")
195
  authenticated = authenticate(hf_token) if hf_token else False
196
  create_dockerfile()
197
  create_requirements()
198
- if load_and_save_model(os.getenv("MODEL_PATH")):
 
199
  prepare_sample_data()
200
  create_hosting_script()
201
  if authenticated:
202
  space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
203
  upload_to_huggingface(space_name)
204
  else:
205
- logging.warning("Skipping upload due to authentication failure")
206
  else:
207
- logging.warning("Skipping data preparation and upload due to model loading failure")
 
 
 
 
6
  import subprocess
7
  import shutil
8
  import logging
9
+ from google.colab import userdata # Import userdata for local testing if needed
10
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
 
 
20
  logging.warning("Installing required libraries...")
21
  subprocess.check_call(["pip", "install", "--no-cache-dir",
22
  "numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
23
+ "joblib==1.4.2", "dill==0.3.8", "huggingface-hub==0.23.0",
24
  "flask==3.0.3", "waitress==3.0.0"])
25
  logging.info("Libraries installed successfully.")
26
  return True
 
76
  colab_model_path = "/content/models/best_rf_model.joblib"
77
  colab_columns_path = "/content/models/columns.joblib"
78
  model_path = model_path or default_model_path
79
+
80
+ # Determine the source paths based on where the script is run
81
+ if os.path.exists(colab_model_path) and os.path.exists(colab_columns_path):
82
+ src_model = colab_model_path
83
+ src_columns = colab_columns_path
84
+ logging.info("Using model and columns from Colab specific path.")
85
+ elif os.path.exists(default_model_path) and os.path.exists(default_columns_path):
86
+ src_model = default_model_path
87
+ src_columns = default_columns_path
88
+ logging.info("Using model and columns from default models directory.")
89
  else:
90
+ logging.error(f"Model or columns files not found at {colab_model_path}, {colab_columns_path}, {default_model_path}, or {default_columns_path}")
91
  return False
92
+
93
+ # Define the destination paths in the current directory
94
+ dest_model = "model.joblib"
95
+ dest_columns = "columns.joblib"
96
+
97
+ try:
98
+ # Copy model file, avoiding SameFileError
99
+ if not os.path.exists(dest_model) or not os.path.samefile(src_model, dest_model):
100
+ shutil.copy(src_model, dest_model)
101
+ logging.info(f"Model copied from {src_model} to {dest_model}")
102
+ else:
103
+ logging.info(f"Model source and destination are the same ({src_model}), skipping copy.")
104
+
105
+ # Copy columns file, avoiding SameFileError
106
+ if not os.path.exists(dest_columns) or not os.path.samefile(src_columns, dest_columns):
107
+ shutil.copy(src_columns, dest_columns)
108
+ logging.info(f"Columns copied from {src_columns} to {dest_columns}")
109
+ else:
110
+ logging.info(f"Columns source and destination are the same ({src_columns}), skipping copy.")
111
+
112
+ except Exception as e:
113
+ logging.error(f"Error during file copy: {e}")
114
  return False
115
+
116
+
117
+ # Verify both files exist in the deployment directory
118
+ if not os.path.exists(dest_model) or not os.path.exists(dest_columns):
119
+ logging.error("Model or columns files not found in deployment directory after copy attempt.")
120
  return False
121
  return True
122
 
123
  def prepare_sample_data():
124
  from datasets import load_dataset
125
+ try:
126
+ dataset = load_dataset("Shramik121/tourism-split-dataset")
127
+ sample_df = pd.DataFrame(dataset['test']).sample(min(3, len(dataset['test'])))
128
+ sample_df.drop(columns=['ProdTaken', 'Unnamed: 0', '__index_level_0__'], inplace=True, errors='ignore') # Drop unnecessary columns
129
+ sample_df.to_csv("input_data.csv", index=False)
130
+ logging.info("Input data saved to input_data.csv")
131
+ except Exception as e:
132
+ logging.error(f"Failed to prepare sample data: {e}")
133
+ # Create a dummy sample data if loading fails
134
+ sample_inputs = {
135
+ 'Age': [41.0], 'TypeofContact': ['Self Enquiry'], 'CityTier': [3],
136
+ 'DurationOfPitch': [6.0], 'Occupation': ['Salaried'], 'Gender': ['Female'],
137
+ 'NumberOfPersonVisiting': [3], 'NumberOfFollowups': [3.0],
138
+ 'ProductPitched': ['Deluxe'], 'PreferredPropertyStar': [3.0],
139
+ 'MaritalStatus': ['Single'], 'NumberOfTrips': [1.0], 'Passport': [1],
140
+ 'PitchSatisfactionScore': [2], 'OwnCar': [1],
141
+ 'NumberOfChildrenVisiting': [0.0], 'Designation': ['Manager'],
142
+ 'MonthlyIncome': [20993.0]
143
+ }
144
+ input_df = pd.DataFrame(sample_inputs)
145
+ input_df.to_csv("input_data.csv", index=False)
146
+ logging.warning("Using dummy sample data due to loading failure.")
147
+
148
 
149
  def create_hosting_script():
150
  hosting_script_content = """
 
159
 
160
  app = Flask(__name__)
161
 
162
+ # Determine the base directory based on the environment
163
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
164
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
165
 
166
  try:
167
+ model_path = os.path.join(base_dir, "model.joblib")
168
+ columns_path = os.path.join(base_dir, "columns.joblib")
169
+
170
+ if not os.path.exists(model_path):
171
+ raise FileNotFoundError(f"model.joblib not found at {model_path}")
172
+ if not os.path.exists(columns_path):
173
+ raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
174
+
175
+ model = joblib.load(model_path)
176
+ columns = joblib.load(columns_path)
177
+ logger.info(f"Model and columns loaded successfully from {base_dir}")
178
  except Exception as e:
179
  logger.error(f"Failed to load model or columns: {e}")
180
+ # In a real application, you might return an error response or have a fallback
181
+ raise # Re-raise the exception to indicate a critical startup failure
182
 
183
  @app.route('/', methods=['GET'])
184
  def index():
 
188
  @app.route('/health', methods=['GET'])
189
  def health():
190
  logger.info("Health check endpoint called")
191
+ # Check if the model and columns are loaded
192
+ if 'model' in globals() and 'columns' in globals():
193
+ return jsonify({'status': 'healthy', 'model_loaded': True})
194
+ else:
195
+ return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
196
+
197
 
198
  @app.route('/predict', methods=['POST'])
199
  def predict():
200
  try:
201
  data = request.get_json(force=True)
202
  logger.info(f"Predict endpoint called with data: {data}")
203
+
204
+ if not isinstance(data, dict) or not data:
205
+ return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
206
+
207
  input_df = pd.DataFrame(data)
208
+
209
+ # Ensure all expected columns are present and in the correct order
210
+ # This requires knowledge of the columns used during training preprocessing
211
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
212
+ # Apply the same one-hot encoding and reindexing as in training
213
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
214
+
215
+ # Reindex to match the training columns, filling missing with 0
216
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
217
+
218
  prediction = model.predict(input_encoded)
219
  logger.info(f"Prediction made: {prediction.tolist()}")
220
  return jsonify({'prediction': prediction.tolist()})
221
  except Exception as e:
222
  logger.error(f"Prediction failed: {e}")
223
  return jsonify({'error': str(e)}), 400
224
+
225
+ # Note: waitress runs this app in Docker; don't call app.run()
226
  """
227
  with open("app.py", "w") as f:
228
  f.write(hosting_script_content)
 
230
 
231
  def upload_to_huggingface(space_name):
232
  try:
233
+ api = HfApi(token=os.getenv("HF_TOKEN"))
234
+ # Create the Space if it doesn't exist
235
+ api.create_repo(
236
+ repo_id=space_name,
237
+ repo_type="space",
238
+ space_sdk="docker",
239
+ private=False,
240
+ exist_ok=True
241
+ )
242
  logging.info(f"Created or verified Space: {space_name}")
243
+
244
+ # Verify files to be uploaded
245
  required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
246
+ logging.info("Checking required files for upload: %s", required_files)
247
  for file in required_files:
248
  if not os.path.exists(file):
249
+ logging.error(f"Required file {file} not found in deployment directory")
250
  return False
251
+ else:
252
+ logging.info(f"File {file} exists in deployment directory")
253
+
254
+ # Upload files to the Space
255
  upload_folder(
256
  folder_path=".",
257
  repo_id=space_name,
 
266
  return False
267
 
268
  if __name__ == "__main__":
269
+ # Get HF_TOKEN from environment or Colab secrets for local testing
270
+ hf_token = os.getenv("HF_TOKEN")
271
+ if not hf_token:
272
+ try:
273
+ hf_token = userdata.get('HF_TOKEN')
274
+ logging.info("Retrieved HF_TOKEN from Colab secrets.")
275
+ except:
276
+ logging.warning("HF_TOKEN not found in environment variables or Colab secrets.")
277
+ hf_token = None
278
+
279
+
280
  if manage_dependencies():
 
281
  authenticated = authenticate(hf_token) if hf_token else False
282
  create_dockerfile()
283
  create_requirements()
284
+ # No need to pass model_path from env here, load_and_save_model handles finding it
285
+ if load_and_save_model(None): # Pass None to let the function find the model
286
  prepare_sample_data()
287
  create_hosting_script()
288
  if authenticated:
289
  space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
290
  upload_to_huggingface(space_name)
291
  else:
292
+ logging.warning("Skipping upload to Hugging Face due to authentication failure.")
293
  else:
294
+ logging.warning("Skipping data preparation, hosting script, and upload due to model loading failure.")
295
+ else:
296
+ logging.warning("Skipping execution due to dependency issues.")
297
+
input_data.csv CHANGED
@@ -1,4 +1,4 @@
1
- Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,__index_level_0__
2
- 3430,40.0,Self Enquiry,3,28.0,Salaried,Male,3,5.0,Deluxe,3.0,Divorced,5.0,1,1,0,2.0,Manager,24798.0,2803
3
- 3221,22.0,Self Enquiry,1,17.0,Salaried,Female,3,5.0,Basic,5.0,Single,3.0,0,3,1,1.0,Executive,20094.0,2618
4
- 1839,31.0,Self Enquiry,1,15.0,Salaried,Male,2,4.0,Basic,4.0,Single,1.0,1,3,1,0.0,Executive,17657.0,1451
 
1
+ Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
2
+ 37.0,Company Invited,2,9.0,Salaried,Male,4,5.0,Deluxe,3.0,Unmarried,3.0,0,3,1,1.0,Manager,22564.0
3
+ 34.0,Self Enquiry,1,21.0,Small Business,Female,4,4.0,Basic,4.0,Divorced,3.0,0,5,0,1.0,Executive,21434.0
4
+ 36.0,Self Enquiry,1,8.0,Salaried,Female,3,3.0,Basic,3.0,Married,5.0,0,5,1,0.0,Executive,17543.0
models/best_rf_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ac2e0071a5fe49d998eef5a5d0bdf4a41859d405c9c7f6646385d56ecf9c72a
3
+ size 8651122
src/data_prep.py CHANGED
@@ -8,15 +8,31 @@ def prepare_data():
8
  data = pd.DataFrame(dataset['train'])
9
  if 'Unnamed: 0' in data.columns:
10
  data = data.drop('Unnamed: 0', axis=1)
 
11
  data = data.dropna()
12
  if 'CustomerID' in data:
13
  data = data.drop('CustomerID', axis=1)
14
  if 'Gender' in data:
15
  data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
16
- os.makedirs('data', exist_ok=True)
17
- data.to_csv('data/processed.csv', index=False)
18
- data.to_csv('data/test.csv', index=False)
19
- print("Data prepared and saved to data/processed.csv and data/test.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  if __name__ == "__main__":
22
  prepare_data()
 
8
  data = pd.DataFrame(dataset['train'])
9
  if 'Unnamed: 0' in data.columns:
10
  data = data.drop('Unnamed: 0', axis=1)
11
+ # Drop rows with missing values for simplicity as done in EDA
12
  data = data.dropna()
13
  if 'CustomerID' in data:
14
  data = data.drop('CustomerID', axis=1)
15
  if 'Gender' in data:
16
  data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
17
+
18
+ # Save processed data to a specific location within the repo
19
+ output_dir = os.getenv('OUTPUT_DIR', 'data')
20
+ os.makedirs(output_dir, exist_ok=True)
21
+ data.to_csv(os.path.join(output_dir, 'processed.csv'), index=False)
22
+
23
+ # Save test data to a specific location within the repo
24
+ test_data = pd.DataFrame(dataset['test'])
25
+ if 'Unnamed: 0' in test_data.columns:
26
+ test_data = test_data.drop('Unnamed: 0', axis=1)
27
+ test_data = test_data.dropna() # Also drop missing for consistency with train
28
+ if 'CustomerID' in test_data:
29
+ test_data = test_data.drop('CustomerID', axis=1)
30
+ if 'Gender' in test_data:
31
+ test_data['Gender'] = test_data['Gender'].replace('Fe Male', 'Female')
32
+
33
+ test_data.to_csv(os.path.join(output_dir, 'test.csv'), index=False)
34
+
35
+ print("Data prepared and saved to", output_dir)
36
 
37
  if __name__ == "__main__":
38
  prepare_data()
src/evaluate.py CHANGED
@@ -3,33 +3,77 @@ import os
3
  import pandas as pd
4
  import joblib
5
  import json
6
- from sklearn.metrics import accuracy_score
7
  from datasets import load_dataset
 
 
 
8
 
9
  def evaluate_model():
10
- model = joblib.load(os.getenv('MODEL_PATH', 'models/model.joblib'))
11
- data = pd.DataFrame(load_dataset("Shramik121/tourism-split-dataset")['test'])
12
- if 'Unnamed: 0' in data.columns:
13
- data = data.drop('Unnamed: 0', axis=1)
14
-
15
- num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
16
- 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  'NumberOfChildrenVisiting', 'MonthlyIncome']
18
- cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
19
  'MaritalStatus', 'Designation', 'CityTier']
20
-
 
21
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
22
  data[cat_cols] = data[cat_cols].fillna('Unknown')
23
-
24
- X = data.drop(columns=['ProdTaken'])
25
- y = data['ProdTaken']
26
-
27
- predictions = model.predict(X)
28
- accuracy = accuracy_score(y, predictions)
29
- results = {'accuracy': accuracy}
30
- with open('evaluation_results.json', 'w') as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  json.dump(results, f)
32
- print(f"Model accuracy: {accuracy}")
 
 
33
 
34
  if __name__ == "__main__":
35
  evaluate_model()
 
3
  import pandas as pd
4
  import joblib
5
  import json
6
+ from sklearn.metrics import accuracy_score, f1_score
7
  from datasets import load_dataset
8
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.pipeline import Pipeline # Import Pipeline
11
 
12
  def evaluate_model():
13
+ model_path = os.getenv('MODEL_PATH', 'models/model.joblib')
14
+ test_data_path = os.getenv('TEST_DATA', 'data/test.csv')
15
+ evaluation_output_path = 'evaluation_results.json' # Define output path
16
+
17
+ if not os.path.exists(model_path):
18
+ print(f"Error: Model file not found at {model_path}")
19
+ results = {'error': f'Model file not found at {model_path}'}
20
+ with open(evaluation_output_path, 'w') as f:
21
+ json.dump(results, f)
22
+ return
23
+
24
+ if not os.path.exists(test_data_path):
25
+ print(f"Error: Test data file not found at {test_data_path}")
26
+ results = {'error': f'Test data file not found at {test_data_path}'}
27
+ with open(evaluation_output_path, 'w') as f:
28
+ json.dump(results, f)
29
+ return
30
+
31
+ model = joblib.load(model_path)
32
+ data = pd.read_csv(test_data_path)
33
+
34
+ # Apply the same preprocessing steps as in train.py to the test data
35
+ num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
36
+ 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
37
  'NumberOfChildrenVisiting', 'MonthlyIncome']
38
+ cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
39
  'MaritalStatus', 'Designation', 'CityTier']
40
+
41
+ # Handle missing values (consistent with training)
42
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
43
  data[cat_cols] = data[cat_cols].fillna('Unknown')
44
+
45
+ X_test = data.drop(columns=['ProdTaken'])
46
+ y_test = data['ProdTaken']
47
+
48
+ # Ensure the loaded model is a pipeline and can process the raw X_test
49
+ if isinstance(model, Pipeline):
50
+ predictions = model.predict(X_test)
51
+ else:
52
+ # If the loaded model is just the classifier, apply preprocessing manually
53
+ print("Warning: Loaded model is not a pipeline. Applying preprocessing manually.")
54
+ preprocessor = ColumnTransformer(
55
+ transformers=[
56
+ ('num', StandardScaler(), num_cols),
57
+ ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
58
+ ],
59
+ remainder='passthrough'
60
+ )
61
+ X_test_processed = preprocessor.fit_transform(X_test) # Use fit_transform for consistency
62
+ predictions = model.predict(X_test_processed)
63
+
64
+
65
+ accuracy = accuracy_score(y_test, predictions)
66
+ f1 = f1_score(y_test, predictions)
67
+
68
+ results = {
69
+ 'accuracy': accuracy,
70
+ 'f1_score': f1
71
+ }
72
+ with open(evaluation_output_path, 'w') as f:
73
  json.dump(results, f)
74
+
75
+ print(f"Model Accuracy: {accuracy}")
76
+ print(f"Model F1 Score: {f1}")
77
 
78
  if __name__ == "__main__":
79
  evaluate_model()
src/train.py CHANGED
@@ -9,36 +9,71 @@ import os
9
  from datasets import load_dataset
10
 
11
  def train_model():
12
- dataset = load_dataset("Shramik121/tourism-split-dataset")
13
- data = pd.DataFrame(dataset['train'])
14
- if 'Unnamed: 0' in data.columns:
15
- data = data.drop('Unnamed: 0', axis=1)
16
-
17
- num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
18
- 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  'NumberOfChildrenVisiting', 'MonthlyIncome']
20
- cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
21
  'MaritalStatus', 'Designation', 'CityTier']
22
-
 
23
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
24
  data[cat_cols] = data[cat_cols].fillna('Unknown')
25
-
26
  X = data.drop(columns=['ProdTaken'])
27
  y = data['ProdTaken']
28
-
29
  preprocessor = ColumnTransformer(
30
  transformers=[
31
  ('num', StandardScaler(), num_cols),
32
  ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
33
- ])
34
-
 
 
35
  pipeline = Pipeline(steps=[('preprocessor', preprocessor),
36
  ('classifier', RandomForestClassifier(random_state=42))])
37
-
38
  pipeline.fit(X, y)
39
-
40
- X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)
41
- columns = X_encoded.columns.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  os.makedirs('models', exist_ok=True)
43
  joblib.dump(columns, 'models/columns.joblib')
44
  joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
 
9
  from datasets import load_dataset
10
 
11
  def train_model():
12
+ # Load processed data
13
+ data_path = os.getenv('DATA_PATH', 'data/processed.csv')
14
+ if not os.path.exists(data_path):
15
+ print(f"Error: Data file not found at {data_path}")
16
+ # Fallback to loading from dataset if file not found (e.g., in initial run)
17
+ try:
18
+ dataset = load_dataset("Shramik121/tourism-split-dataset")
19
+ data = pd.DataFrame(dataset['train'])
20
+ if 'Unnamed: 0' in data.columns:
21
+ data = data.drop('Unnamed: 0', axis=1)
22
+ data = data.dropna()
23
+ if 'CustomerID' in data:
24
+ data = data.drop('CustomerID', axis=1)
25
+ if 'Gender' in data:
26
+ data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
27
+ print("Loaded data from Hugging Face dataset.")
28
+ except Exception as e:
29
+ print(f"Failed to load data from file or Hugging Face: {e}")
30
+ return # Exit if data cannot be loaded
31
+ else:
32
+ data = pd.read_csv(data_path)
33
+ print(f"Loaded data from {data_path}")
34
+
35
+ num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
36
+ 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
37
  'NumberOfChildrenVisiting', 'MonthlyIncome']
38
+ cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
39
  'MaritalStatus', 'Designation', 'CityTier']
40
+
41
+ # Handle missing values (should be minimal after data_prep, but for robustness)
42
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
43
  data[cat_cols] = data[cat_cols].fillna('Unknown')
44
+
45
  X = data.drop(columns=['ProdTaken'])
46
  y = data['ProdTaken']
47
+
48
  preprocessor = ColumnTransformer(
49
  transformers=[
50
  ('num', StandardScaler(), num_cols),
51
  ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
52
+ ],
53
+ remainder='passthrough' # Keep other columns (like Passport, OwnCar)
54
+ )
55
+
56
  pipeline = Pipeline(steps=[('preprocessor', preprocessor),
57
  ('classifier', RandomForestClassifier(random_state=42))])
58
+
59
  pipeline.fit(X, y)
60
+
61
+ # Extract and save the list of columns *after* preprocessing
62
+ # This is crucial for the prediction script
63
+ # We can create a dummy dataframe processed by the preprocessor to get column names
64
+ dummy_df = pd.DataFrame(columns=X.columns)
65
+ dummy_processed = preprocessor.transform(dummy_df)
66
+
67
+ # Get feature names from preprocessor
68
+ feature_names = []
69
+ for name, transformer, cols in preprocessor.transformers_:
70
+ if hasattr(transformer, 'get_feature_names_out'):
71
+ feature_names.extend(transformer.get_feature_names_out(cols))
72
+ else:
73
+ feature_names.extend(cols) # For transformers without get_feature_names_out
74
+
75
+ columns = feature_names
76
+
77
  os.makedirs('models', exist_ok=True)
78
  joblib.dump(columns, 'models/columns.joblib')
79
  joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
# Load the data from the Hugging Face dataset
dataset = load_dataset("Shramik121/tourism-split-dataset")
data = pd.DataFrame(dataset['train'])

# Clean the data (same steps as in the EDA cell and data_prep)
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)
data_clean = data.dropna()
if 'CustomerID' in data_clean:
    data_clean = data_clean.drop(columns=['CustomerID'])
if 'Gender' in data_clean:
    # Normalize the known 'Fe Male' typo in the raw data
    data_clean['Gender'] = data_clean['Gender'].replace('Fe Male', 'Female')

# Define features and target
X = data_clean.drop('ProdTaken', axis=1)
y = data_clean['ProdTaken']

# Define preprocessing steps (consistent with train.py)
num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
            'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
            'NumberOfChildrenVisiting', 'MonthlyIncome']
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
            'MaritalStatus', 'Designation', 'CityTier']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Train the pipeline
pipeline.fit(X, y)

# Extract the post-preprocessing column names AFTER training.
# Pipeline.fit fits its steps in place, so `preprocessor` is already fitted.
# Do NOT re-fit it on an empty dummy DataFrame: that either raises
# (StandardScaler cannot fit 0 samples) or desynchronizes the preprocessor
# from the pipeline being saved. Likewise, a manual loop over
# preprocessor.transformers_ would mishandle the 'remainder' entry, whose
# column list holds integer indices rather than names.
columns = preprocessor.get_feature_names_out().tolist()

os.makedirs('/content/models', exist_ok=True)
joblib.dump(columns, '/content/models/columns.joblib')
joblib.dump(pipeline, '/content/models/best_rf_model.joblib')
tests/test_model.py CHANGED
@@ -12,5 +12,6 @@ def test_columns_exists():
12
  assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
13
 
14
  def test_model_loads():
15
- model = joblib.load(os.getenv('MODEL_PATH', 'models/model.joblib'))
 
16
  assert model is not None, "Failed to load model"
 
12
  assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
13
 
14
def test_model_loads():
    """The persisted model artifact exists and can be deserialized.

    Checks existence first (matching the style of test_columns_exists) so a
    missing artifact fails with an actionable message instead of a raw
    FileNotFoundError from joblib.load.
    """
    model_path = os.getenv('MODEL_PATH', 'models/model.joblib')
    assert os.path.exists(model_path), f"Model file not found at {model_path}"
    model = joblib.load(model_path)
    assert model is not None, "Failed to load model"