Shramik121 commited on
Commit
3768986
·
verified ·
1 Parent(s): 0551a30

Upload model and application files to Hugging Face Space

Browse files
app.py CHANGED
@@ -10,16 +10,26 @@ logger = logging.getLogger(__name__)
10
 
11
  app = Flask(__name__)
12
 
 
13
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
14
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
15
 
16
  try:
17
- model = joblib.load(os.path.join(base_dir, "model.joblib"))
18
- columns = joblib.load(os.path.join(base_dir, "columns.joblib"))
19
- logger.info("Model and columns loaded successfully")
 
 
 
 
 
 
 
 
20
  except Exception as e:
21
  logger.error(f"Failed to load model or columns: {e}")
22
- raise
 
23
 
24
  @app.route('/', methods=['GET'])
25
  def index():
@@ -29,20 +39,38 @@ def index():
29
  @app.route('/health', methods=['GET'])
30
  def health():
31
  logger.info("Health check endpoint called")
32
- return jsonify({'status': 'healthy'})
 
 
 
 
 
33
 
34
  @app.route('/predict', methods=['POST'])
35
  def predict():
36
  try:
37
  data = request.get_json(force=True)
38
  logger.info(f"Predict endpoint called with data: {data}")
 
 
 
 
39
  input_df = pd.DataFrame(data)
 
 
 
40
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
 
41
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
 
 
42
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
 
43
  prediction = model.predict(input_encoded)
44
  logger.info(f"Prediction made: {prediction.tolist()}")
45
  return jsonify({'prediction': prediction.tolist()})
46
  except Exception as e:
47
  logger.error(f"Prediction failed: {e}")
48
  return jsonify({'error': str(e)}), 400
 
 
 
10
 
11
  app = Flask(__name__)
12
 
13
+ # Determine the base directory based on the environment
14
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
15
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
16
 
17
  try:
18
+ model_path = os.path.join(base_dir, "model.joblib")
19
+ columns_path = os.path.join(base_dir, "columns.joblib")
20
+
21
+ if not os.path.exists(model_path):
22
+ raise FileNotFoundError(f"model.joblib not found at {model_path}")
23
+ if not os.path.exists(columns_path):
24
+ raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
25
+
26
+ model = joblib.load(model_path)
27
+ columns = joblib.load(columns_path)
28
+ logger.info(f"Model and columns loaded successfully from {base_dir}")
29
  except Exception as e:
30
  logger.error(f"Failed to load model or columns: {e}")
31
+ # In a real application, you might return an error response or have a fallback
32
+ raise # Re-raise the exception to indicate a critical startup failure
33
 
34
  @app.route('/', methods=['GET'])
35
  def index():
 
39
  @app.route('/health', methods=['GET'])
40
  def health():
41
  logger.info("Health check endpoint called")
42
+ # Check if the model and columns are loaded
43
+ if 'model' in globals() and 'columns' in globals():
44
+ return jsonify({'status': 'healthy', 'model_loaded': True})
45
+ else:
46
+ return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
47
+
48
 
49
  @app.route('/predict', methods=['POST'])
50
  def predict():
51
  try:
52
  data = request.get_json(force=True)
53
  logger.info(f"Predict endpoint called with data: {data}")
54
+
55
+ if not isinstance(data, dict) or not data:
56
+ return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
57
+
58
  input_df = pd.DataFrame(data)
59
+
60
+ # Ensure all expected columns are present and in the correct order
61
+ # This requires knowledge of the columns used during training preprocessing
62
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
63
+ # Apply the same one-hot encoding and reindexing as in training
64
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
65
+
66
+ # Reindex to match the training columns, filling missing with 0
67
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
68
+
69
  prediction = model.predict(input_encoded)
70
  logger.info(f"Prediction made: {prediction.tolist()}")
71
  return jsonify({'prediction': prediction.tolist()})
72
  except Exception as e:
73
  logger.error(f"Prediction failed: {e}")
74
  return jsonify({'error': str(e)}), 400
75
+
76
+ # Note: waitress runs this app in Docker; don't call app.run()
deploy_tourism_model_diagnostic.py CHANGED
@@ -6,6 +6,7 @@ from huggingface_hub import HfApi, login, upload_folder
6
  import subprocess
7
  import shutil
8
  import logging
 
9
 
10
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11
 
@@ -19,7 +20,7 @@ def manage_dependencies():
19
  logging.warning("Installing required libraries...")
20
  subprocess.check_call(["pip", "install", "--no-cache-dir",
21
  "numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
22
- "joblib==1.4.2", "dill==0.3.8", "huggingface_hub==0.23.0",
23
  "flask==3.0.3", "waitress==3.0.0"])
24
  logging.info("Libraries installed successfully.")
25
  return True
@@ -75,39 +76,75 @@ def load_and_save_model(model_path):
75
  colab_model_path = "/content/models/best_rf_model.joblib"
76
  colab_columns_path = "/content/models/columns.joblib"
77
  model_path = model_path or default_model_path
78
-
79
- if os.path.exists(colab_model_path):
80
- shutil.copy(colab_model_path, "model.joblib")
81
- logging.info(f"Model copied from {colab_model_path} to model.joblib")
82
- elif os.path.exists(model_path):
83
- shutil.copy(model_path, "model.joblib")
84
- logging.info(f"Model copied from {model_path} to model.joblib")
 
 
 
85
  else:
86
- logging.error(f"Model not found at {colab_model_path} or {model_path}")
87
  return False
88
-
89
- if os.path.exists(colab_columns_path):
90
- shutil.copy(colab_columns_path, "columns.joblib")
91
- logging.info(f"Columns copied from {colab_columns_path} to columns.joblib")
92
- elif os.path.exists(default_columns_path):
93
- shutil.copy(default_columns_path, "columns.joblib")
94
- logging.info(f"Columns copied from {default_columns_path} to columns.joblib")
95
- else:
96
- logging.error(f"Columns file not found at {colab_columns_path} or {default_columns_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  return False
98
-
99
- if not os.path.exists("model.joblib") or not os.path.exists("columns.joblib"):
100
- logging.error("Model or columns files not found in deployment directory")
 
 
101
  return False
102
  return True
103
 
104
  def prepare_sample_data():
105
  from datasets import load_dataset
106
- dataset = load_dataset("Shramik121/tourism-split-dataset")
107
- sample_df = pd.DataFrame(dataset['test']).sample(3)
108
- sample_df.drop(columns=['ProdTaken'], inplace=True, errors='ignore')
109
- sample_df.to_csv("input_data.csv", index=False)
110
- logging.info("Input data saved to input_data.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  def create_hosting_script():
113
  hosting_script_content = """
@@ -122,16 +159,26 @@ logger = logging.getLogger(__name__)
122
 
123
  app = Flask(__name__)
124
 
 
125
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
126
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
127
 
128
  try:
129
- model = joblib.load(os.path.join(base_dir, "model.joblib"))
130
- columns = joblib.load(os.path.join(base_dir, "columns.joblib"))
131
- logger.info("Model and columns loaded successfully")
 
 
 
 
 
 
 
 
132
  except Exception as e:
133
  logger.error(f"Failed to load model or columns: {e}")
134
- raise
 
135
 
136
  @app.route('/', methods=['GET'])
137
  def index():
@@ -141,23 +188,41 @@ def index():
141
  @app.route('/health', methods=['GET'])
142
  def health():
143
  logger.info("Health check endpoint called")
144
- return jsonify({'status': 'healthy'})
 
 
 
 
 
145
 
146
  @app.route('/predict', methods=['POST'])
147
  def predict():
148
  try:
149
  data = request.get_json(force=True)
150
  logger.info(f"Predict endpoint called with data: {data}")
 
 
 
 
151
  input_df = pd.DataFrame(data)
 
 
 
152
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
 
153
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
 
 
154
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
 
155
  prediction = model.predict(input_encoded)
156
  logger.info(f"Prediction made: {prediction.tolist()}")
157
  return jsonify({'prediction': prediction.tolist()})
158
  except Exception as e:
159
  logger.error(f"Prediction failed: {e}")
160
  return jsonify({'error': str(e)}), 400
 
 
161
  """
162
  with open("app.py", "w") as f:
163
  f.write(hosting_script_content)
@@ -165,17 +230,28 @@ def predict():
165
 
166
  def upload_to_huggingface(space_name):
167
  try:
168
- api = HfApi()
169
- api.create_repo(repo_id=space_name, repo_type="space", space_sdk="docker", private=False, exist_ok=True)
 
 
 
 
 
 
 
170
  logging.info(f"Created or verified Space: {space_name}")
171
-
 
172
  required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
 
173
  for file in required_files:
174
  if not os.path.exists(file):
175
- logging.error(f"Required file {file} not found")
176
  return False
177
- logging.info(f"File {file} exists")
178
-
 
 
179
  upload_folder(
180
  folder_path=".",
181
  repo_id=space_name,
@@ -190,18 +266,32 @@ def upload_to_huggingface(space_name):
190
  return False
191
 
192
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
193
  if manage_dependencies():
194
- hf_token = os.getenv("HF_TOKEN")
195
  authenticated = authenticate(hf_token) if hf_token else False
196
  create_dockerfile()
197
  create_requirements()
198
- if load_and_save_model(os.getenv("MODEL_PATH")):
 
199
  prepare_sample_data()
200
  create_hosting_script()
201
  if authenticated:
202
  space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
203
  upload_to_huggingface(space_name)
204
  else:
205
- logging.warning("Skipping upload due to authentication failure")
206
  else:
207
- logging.warning("Skipping data preparation and upload due to model loading failure")
 
 
 
 
6
  import subprocess
7
  import shutil
8
  import logging
9
+ from google.colab import userdata # Import userdata for local testing if needed
10
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
 
 
20
  logging.warning("Installing required libraries...")
21
  subprocess.check_call(["pip", "install", "--no-cache-dir",
22
  "numpy==1.26.4", "pandas==2.2.2", "scikit-learn==1.6.1",
23
+ "joblib==1.4.2", "dill==0.3.8", "huggingface-hub==0.23.0",
24
  "flask==3.0.3", "waitress==3.0.0"])
25
  logging.info("Libraries installed successfully.")
26
  return True
 
76
  colab_model_path = "/content/models/best_rf_model.joblib"
77
  colab_columns_path = "/content/models/columns.joblib"
78
  model_path = model_path or default_model_path
79
+
80
+ # Determine the source paths based on where the script is run
81
+ if os.path.exists(colab_model_path) and os.path.exists(colab_columns_path):
82
+ src_model = colab_model_path
83
+ src_columns = colab_columns_path
84
+ logging.info("Using model and columns from Colab specific path.")
85
+ elif os.path.exists(default_model_path) and os.path.exists(default_columns_path):
86
+ src_model = default_model_path
87
+ src_columns = default_columns_path
88
+ logging.info("Using model and columns from default models directory.")
89
  else:
90
+ logging.error(f"Model or columns files not found at {colab_model_path}, {colab_columns_path}, {default_model_path}, or {default_columns_path}")
91
  return False
92
+
93
+ # Define the destination paths in the current directory
94
+ dest_model = "model.joblib"
95
+ dest_columns = "columns.joblib"
96
+
97
+ try:
98
+ # Copy model file, avoiding SameFileError
99
+ if not os.path.exists(dest_model) or not os.path.samefile(src_model, dest_model):
100
+ shutil.copy(src_model, dest_model)
101
+ logging.info(f"Model copied from {src_model} to {dest_model}")
102
+ else:
103
+ logging.info(f"Model source and destination are the same ({src_model}), skipping copy.")
104
+
105
+ # Copy columns file, avoiding SameFileError
106
+ if not os.path.exists(dest_columns) or not os.path.samefile(src_columns, dest_columns):
107
+ shutil.copy(src_columns, dest_columns)
108
+ logging.info(f"Columns copied from {src_columns} to {dest_columns}")
109
+ else:
110
+ logging.info(f"Columns source and destination are the same ({src_columns}), skipping copy.")
111
+
112
+ except Exception as e:
113
+ logging.error(f"Error during file copy: {e}")
114
  return False
115
+
116
+
117
+ # Verify both files exist in the deployment directory
118
+ if not os.path.exists(dest_model) or not os.path.exists(dest_columns):
119
+ logging.error("Model or columns files not found in deployment directory after copy attempt.")
120
  return False
121
  return True
122
 
123
  def prepare_sample_data():
124
  from datasets import load_dataset
125
+ try:
126
+ dataset = load_dataset("Shramik121/tourism-split-dataset")
127
+ sample_df = pd.DataFrame(dataset['test']).sample(min(3, len(dataset['test'])))
128
+ sample_df.drop(columns=['ProdTaken', 'Unnamed: 0', '__index_level_0__'], inplace=True, errors='ignore') # Drop unnecessary columns
129
+ sample_df.to_csv("input_data.csv", index=False)
130
+ logging.info("Input data saved to input_data.csv")
131
+ except Exception as e:
132
+ logging.error(f"Failed to prepare sample data: {e}")
133
+ # Create a dummy sample data if loading fails
134
+ sample_inputs = {
135
+ 'Age': [41.0], 'TypeofContact': ['Self Enquiry'], 'CityTier': [3],
136
+ 'DurationOfPitch': [6.0], 'Occupation': ['Salaried'], 'Gender': ['Female'],
137
+ 'NumberOfPersonVisiting': [3], 'NumberOfFollowups': [3.0],
138
+ 'ProductPitched': ['Deluxe'], 'PreferredPropertyStar': [3.0],
139
+ 'MaritalStatus': ['Single'], 'NumberOfTrips': [1.0], 'Passport': [1],
140
+ 'PitchSatisfactionScore': [2], 'OwnCar': [1],
141
+ 'NumberOfChildrenVisiting': [0.0], 'Designation': ['Manager'],
142
+ 'MonthlyIncome': [20993.0]
143
+ }
144
+ input_df = pd.DataFrame(sample_inputs)
145
+ input_df.to_csv("input_data.csv", index=False)
146
+ logging.warning("Using dummy sample data due to loading failure.")
147
+
148
 
149
  def create_hosting_script():
150
  hosting_script_content = """
 
159
 
160
  app = Flask(__name__)
161
 
162
+ # Determine the base directory based on the environment
163
  base_dir = '/app' if os.path.exists('/app') else os.getcwd()
164
  logger.info(f"Using base directory: {base_dir}, contents: {os.listdir(base_dir)}")
165
 
166
  try:
167
+ model_path = os.path.join(base_dir, "model.joblib")
168
+ columns_path = os.path.join(base_dir, "columns.joblib")
169
+
170
+ if not os.path.exists(model_path):
171
+ raise FileNotFoundError(f"model.joblib not found at {model_path}")
172
+ if not os.path.exists(columns_path):
173
+ raise FileNotFoundError(f"columns.joblib not found at {columns_path}")
174
+
175
+ model = joblib.load(model_path)
176
+ columns = joblib.load(columns_path)
177
+ logger.info(f"Model and columns loaded successfully from {base_dir}")
178
  except Exception as e:
179
  logger.error(f"Failed to load model or columns: {e}")
180
+ # In a real application, you might return an error response or have a fallback
181
+ raise # Re-raise the exception to indicate a critical startup failure
182
 
183
  @app.route('/', methods=['GET'])
184
  def index():
 
188
  @app.route('/health', methods=['GET'])
189
  def health():
190
  logger.info("Health check endpoint called")
191
+ # Check if the model and columns are loaded
192
+ if 'model' in globals() and 'columns' in globals():
193
+ return jsonify({'status': 'healthy', 'model_loaded': True})
194
+ else:
195
+ return jsonify({'status': 'unhealthy', 'model_loaded': False}), 500
196
+
197
 
198
  @app.route('/predict', methods=['POST'])
199
  def predict():
200
  try:
201
  data = request.get_json(force=True)
202
  logger.info(f"Predict endpoint called with data: {data}")
203
+
204
+ if not isinstance(data, dict) or not data:
205
+ return jsonify({'error': 'Invalid input data format. Expected a dictionary with list values.'}), 400
206
+
207
  input_df = pd.DataFrame(data)
208
+
209
+ # Ensure all expected columns are present and in the correct order
210
+ # This requires knowledge of the columns used during training preprocessing
211
  categorical_columns = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation', 'CityTier']
212
+ # Apply the same one-hot encoding and reindexing as in training
213
  input_encoded = pd.get_dummies(input_df, columns=categorical_columns, drop_first=True)
214
+
215
+ # Reindex to match the training columns, filling missing with 0
216
  input_encoded = input_encoded.reindex(columns=columns, fill_value=0)
217
+
218
  prediction = model.predict(input_encoded)
219
  logger.info(f"Prediction made: {prediction.tolist()}")
220
  return jsonify({'prediction': prediction.tolist()})
221
  except Exception as e:
222
  logger.error(f"Prediction failed: {e}")
223
  return jsonify({'error': str(e)}), 400
224
+
225
+ # Note: waitress runs this app in Docker; don't call app.run()
226
  """
227
  with open("app.py", "w") as f:
228
  f.write(hosting_script_content)
 
230
 
231
  def upload_to_huggingface(space_name):
232
  try:
233
+ api = HfApi(token=os.getenv("HF_TOKEN"))
234
+ # Create the Space if it doesn't exist
235
+ api.create_repo(
236
+ repo_id=space_name,
237
+ repo_type="space",
238
+ space_sdk="docker",
239
+ private=False,
240
+ exist_ok=True
241
+ )
242
  logging.info(f"Created or verified Space: {space_name}")
243
+
244
+ # Verify files to be uploaded
245
  required_files = ['app.py', 'model.joblib', 'columns.joblib', 'input_data.csv', 'requirements.txt', 'Dockerfile']
246
+ logging.info("Checking required files for upload: %s", required_files)
247
  for file in required_files:
248
  if not os.path.exists(file):
249
+ logging.error(f"Required file {file} not found in deployment directory")
250
  return False
251
+ else:
252
+ logging.info(f"File {file} exists in deployment directory")
253
+
254
+ # Upload files to the Space
255
  upload_folder(
256
  folder_path=".",
257
  repo_id=space_name,
 
266
  return False
267
 
268
  if __name__ == "__main__":
269
+ # Get HF_TOKEN from environment or Colab secrets for local testing
270
+ hf_token = os.getenv("HF_TOKEN")
271
+ if not hf_token:
272
+ try:
273
+ hf_token = userdata.get('HF_TOKEN')
274
+ logging.info("Retrieved HF_TOKEN from Colab secrets.")
275
+ except:
276
+ logging.warning("HF_TOKEN not found in environment variables or Colab secrets.")
277
+ hf_token = None
278
+
279
+
280
  if manage_dependencies():
 
281
  authenticated = authenticate(hf_token) if hf_token else False
282
  create_dockerfile()
283
  create_requirements()
284
+ # No need to pass model_path from env here, load_and_save_model handles finding it
285
+ if load_and_save_model(None): # Pass None to let the function find the model
286
  prepare_sample_data()
287
  create_hosting_script()
288
  if authenticated:
289
  space_name = os.getenv("SPACE_NAME", "Shramik121/tourism-rf-model")
290
  upload_to_huggingface(space_name)
291
  else:
292
+ logging.warning("Skipping upload to Hugging Face due to authentication failure.")
293
  else:
294
+ logging.warning("Skipping data preparation, hosting script, and upload due to model loading failure.")
295
+ else:
296
+ logging.warning("Skipping execution due to dependency issues.")
297
+
input_data.csv CHANGED
@@ -1,4 +1,4 @@
1
- Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,__index_level_0__
2
- 3430,40.0,Self Enquiry,3,28.0,Salaried,Male,3,5.0,Deluxe,3.0,Divorced,5.0,1,1,0,2.0,Manager,24798.0,2803
3
- 3221,22.0,Self Enquiry,1,17.0,Salaried,Female,3,5.0,Basic,5.0,Single,3.0,0,3,1,1.0,Executive,20094.0,2618
4
- 1839,31.0,Self Enquiry,1,15.0,Salaried,Male,2,4.0,Basic,4.0,Single,1.0,1,3,1,0.0,Executive,17657.0,1451
 
1
+ Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
2
+ 37.0,Company Invited,2,9.0,Salaried,Male,4,5.0,Deluxe,3.0,Unmarried,3.0,0,3,1,1.0,Manager,22564.0
3
+ 34.0,Self Enquiry,1,21.0,Small Business,Female,4,4.0,Basic,4.0,Divorced,3.0,0,5,0,1.0,Executive,21434.0
4
+ 36.0,Self Enquiry,1,8.0,Salaried,Female,3,3.0,Basic,3.0,Married,5.0,0,5,1,0.0,Executive,17543.0
models/best_rf_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ac2e0071a5fe49d998eef5a5d0bdf4a41859d405c9c7f6646385d56ecf9c72a
3
+ size 8651122
src/data_prep.py CHANGED
@@ -8,15 +8,31 @@ def prepare_data():
8
  data = pd.DataFrame(dataset['train'])
9
  if 'Unnamed: 0' in data.columns:
10
  data = data.drop('Unnamed: 0', axis=1)
 
11
  data = data.dropna()
12
  if 'CustomerID' in data:
13
  data = data.drop('CustomerID', axis=1)
14
  if 'Gender' in data:
15
  data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
16
- os.makedirs('data', exist_ok=True)
17
- data.to_csv('data/processed.csv', index=False)
18
- data.to_csv('data/test.csv', index=False)
19
- print("Data prepared and saved to data/processed.csv and data/test.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  if __name__ == "__main__":
22
  prepare_data()
 
8
  data = pd.DataFrame(dataset['train'])
9
  if 'Unnamed: 0' in data.columns:
10
  data = data.drop('Unnamed: 0', axis=1)
11
+ # Drop rows with missing values for simplicity as done in EDA
12
  data = data.dropna()
13
  if 'CustomerID' in data:
14
  data = data.drop('CustomerID', axis=1)
15
  if 'Gender' in data:
16
  data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
17
+
18
+ # Save processed data to a specific location within the repo
19
+ output_dir = os.getenv('OUTPUT_DIR', 'data')
20
+ os.makedirs(output_dir, exist_ok=True)
21
+ data.to_csv(os.path.join(output_dir, 'processed.csv'), index=False)
22
+
23
+ # Save test data to a specific location within the repo
24
+ test_data = pd.DataFrame(dataset['test'])
25
+ if 'Unnamed: 0' in test_data.columns:
26
+ test_data = test_data.drop('Unnamed: 0', axis=1)
27
+ test_data = test_data.dropna() # Also drop missing for consistency with train
28
+ if 'CustomerID' in test_data:
29
+ test_data = test_data.drop('CustomerID', axis=1)
30
+ if 'Gender' in test_data:
31
+ test_data['Gender'] = test_data['Gender'].replace('Fe Male', 'Female')
32
+
33
+ test_data.to_csv(os.path.join(output_dir, 'test.csv'), index=False)
34
+
35
+ print("Data prepared and saved to", output_dir)
36
 
37
  if __name__ == "__main__":
38
  prepare_data()
src/evaluate.py CHANGED
@@ -3,33 +3,77 @@ import os
3
  import pandas as pd
4
  import joblib
5
  import json
6
- from sklearn.metrics import accuracy_score
7
  from datasets import load_dataset
 
 
 
8
 
9
  def evaluate_model():
10
- model = joblib.load(os.getenv('MODEL_PATH', 'models/model.joblib'))
11
- data = pd.DataFrame(load_dataset("Shramik121/tourism-split-dataset")['test'])
12
- if 'Unnamed: 0' in data.columns:
13
- data = data.drop('Unnamed: 0', axis=1)
14
-
15
- num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
16
- 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  'NumberOfChildrenVisiting', 'MonthlyIncome']
18
- cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
19
  'MaritalStatus', 'Designation', 'CityTier']
20
-
 
21
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
22
  data[cat_cols] = data[cat_cols].fillna('Unknown')
23
-
24
- X = data.drop(columns=['ProdTaken'])
25
- y = data['ProdTaken']
26
-
27
- predictions = model.predict(X)
28
- accuracy = accuracy_score(y, predictions)
29
- results = {'accuracy': accuracy}
30
- with open('evaluation_results.json', 'w') as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  json.dump(results, f)
32
- print(f"Model accuracy: {accuracy}")
 
 
33
 
34
  if __name__ == "__main__":
35
  evaluate_model()
 
3
  import pandas as pd
4
  import joblib
5
  import json
6
+ from sklearn.metrics import accuracy_score, f1_score
7
  from datasets import load_dataset
8
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
9
+ from sklearn.compose import ColumnTransformer
10
+ from sklearn.pipeline import Pipeline # Import Pipeline
11
 
12
  def evaluate_model():
13
+ model_path = os.getenv('MODEL_PATH', 'models/model.joblib')
14
+ test_data_path = os.getenv('TEST_DATA', 'data/test.csv')
15
+ evaluation_output_path = 'evaluation_results.json' # Define output path
16
+
17
+ if not os.path.exists(model_path):
18
+ print(f"Error: Model file not found at {model_path}")
19
+ results = {'error': f'Model file not found at {model_path}'}
20
+ with open(evaluation_output_path, 'w') as f:
21
+ json.dump(results, f)
22
+ return
23
+
24
+ if not os.path.exists(test_data_path):
25
+ print(f"Error: Test data file not found at {test_data_path}")
26
+ results = {'error': f'Test data file not found at {test_data_path}'}
27
+ with open(evaluation_output_path, 'w') as f:
28
+ json.dump(results, f)
29
+ return
30
+
31
+ model = joblib.load(model_path)
32
+ data = pd.read_csv(test_data_path)
33
+
34
+ # Apply the same preprocessing steps as in train.py to the test data
35
+ num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
36
+ 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
37
  'NumberOfChildrenVisiting', 'MonthlyIncome']
38
+ cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
39
  'MaritalStatus', 'Designation', 'CityTier']
40
+
41
+ # Handle missing values (consistent with training)
42
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
43
  data[cat_cols] = data[cat_cols].fillna('Unknown')
44
+
45
+ X_test = data.drop(columns=['ProdTaken'])
46
+ y_test = data['ProdTaken']
47
+
48
+ # Ensure the loaded model is a pipeline and can process the raw X_test
49
+ if isinstance(model, Pipeline):
50
+ predictions = model.predict(X_test)
51
+ else:
52
+ # If the loaded model is just the classifier, apply preprocessing manually
53
+ print("Warning: Loaded model is not a pipeline. Applying preprocessing manually.")
54
+ preprocessor = ColumnTransformer(
55
+ transformers=[
56
+ ('num', StandardScaler(), num_cols),
57
+ ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
58
+ ],
59
+ remainder='passthrough'
60
+ )
61
+ X_test_processed = preprocessor.fit_transform(X_test) # Use fit_transform for consistency
62
+ predictions = model.predict(X_test_processed)
63
+
64
+
65
+ accuracy = accuracy_score(y_test, predictions)
66
+ f1 = f1_score(y_test, predictions)
67
+
68
+ results = {
69
+ 'accuracy': accuracy,
70
+ 'f1_score': f1
71
+ }
72
+ with open(evaluation_output_path, 'w') as f:
73
  json.dump(results, f)
74
+
75
+ print(f"Model Accuracy: {accuracy}")
76
+ print(f"Model F1 Score: {f1}")
77
 
78
  if __name__ == "__main__":
79
  evaluate_model()
src/train.py CHANGED
@@ -9,36 +9,71 @@ import os
9
  from datasets import load_dataset
10
 
11
  def train_model():
12
- dataset = load_dataset("Shramik121/tourism-split-dataset")
13
- data = pd.DataFrame(dataset['train'])
14
- if 'Unnamed: 0' in data.columns:
15
- data = data.drop('Unnamed: 0', axis=1)
16
-
17
- num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
18
- 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  'NumberOfChildrenVisiting', 'MonthlyIncome']
20
- cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
21
  'MaritalStatus', 'Designation', 'CityTier']
22
-
 
23
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
24
  data[cat_cols] = data[cat_cols].fillna('Unknown')
25
-
26
  X = data.drop(columns=['ProdTaken'])
27
  y = data['ProdTaken']
28
-
29
  preprocessor = ColumnTransformer(
30
  transformers=[
31
  ('num', StandardScaler(), num_cols),
32
  ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
33
- ])
34
-
 
 
35
  pipeline = Pipeline(steps=[('preprocessor', preprocessor),
36
  ('classifier', RandomForestClassifier(random_state=42))])
37
-
38
  pipeline.fit(X, y)
39
-
40
- X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)
41
- columns = X_encoded.columns.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  os.makedirs('models', exist_ok=True)
43
  joblib.dump(columns, 'models/columns.joblib')
44
  joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
 
9
  from datasets import load_dataset
10
 
11
  def train_model():
12
+ # Load processed data
13
+ data_path = os.getenv('DATA_PATH', 'data/processed.csv')
14
+ if not os.path.exists(data_path):
15
+ print(f"Error: Data file not found at {data_path}")
16
+ # Fallback to loading from dataset if file not found (e.g., in initial run)
17
+ try:
18
+ dataset = load_dataset("Shramik121/tourism-split-dataset")
19
+ data = pd.DataFrame(dataset['train'])
20
+ if 'Unnamed: 0' in data.columns:
21
+ data = data.drop('Unnamed: 0', axis=1)
22
+ data = data.dropna()
23
+ if 'CustomerID' in data:
24
+ data = data.drop('CustomerID', axis=1)
25
+ if 'Gender' in data:
26
+ data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
27
+ print("Loaded data from Hugging Face dataset.")
28
+ except Exception as e:
29
+ print(f"Failed to load data from file or Hugging Face: {e}")
30
+ return # Exit if data cannot be loaded
31
+ else:
32
+ data = pd.read_csv(data_path)
33
+ print(f"Loaded data from {data_path}")
34
+
35
+ num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
36
+ 'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
37
  'NumberOfChildrenVisiting', 'MonthlyIncome']
38
+ cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
39
  'MaritalStatus', 'Designation', 'CityTier']
40
+
41
+ # Handle missing values (should be minimal after data_prep, but for robustness)
42
  data[num_cols] = data[num_cols].fillna(data[num_cols].median())
43
  data[cat_cols] = data[cat_cols].fillna('Unknown')
44
+
45
  X = data.drop(columns=['ProdTaken'])
46
  y = data['ProdTaken']
47
+
48
  preprocessor = ColumnTransformer(
49
  transformers=[
50
  ('num', StandardScaler(), num_cols),
51
  ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
52
+ ],
53
+ remainder='passthrough' # Keep other columns (like Passport, OwnCar)
54
+ )
55
+
56
  pipeline = Pipeline(steps=[('preprocessor', preprocessor),
57
  ('classifier', RandomForestClassifier(random_state=42))])
58
+
59
  pipeline.fit(X, y)
60
+
61
+ # Extract and save the list of columns *after* preprocessing
62
+ # This is crucial for the prediction script
63
+ # We can create a dummy dataframe processed by the preprocessor to get column names
64
+ dummy_df = pd.DataFrame(columns=X.columns)
65
+ dummy_processed = preprocessor.transform(dummy_df)
66
+
67
+ # Get feature names from preprocessor
68
+ feature_names = []
69
+ for name, transformer, cols in preprocessor.transformers_:
70
+ if hasattr(transformer, 'get_feature_names_out'):
71
+ feature_names.extend(transformer.get_feature_names_out(cols))
72
+ else:
73
+ feature_names.extend(cols) # For transformers without get_feature_names_out
74
+
75
+ columns = feature_names
76
+
77
  os.makedirs('models', exist_ok=True)
78
  joblib.dump(columns, 'models/columns.joblib')
79
  joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
# Load the data from the Hugging Face dataset
dataset = load_dataset("Shramik121/tourism-split-dataset")
data = pd.DataFrame(dataset['train'])

# Clean the data (same steps as in the EDA cell and data_prep)
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)
data_clean = data.dropna()
if 'CustomerID' in data_clean:
    data_clean = data_clean.drop(columns=['CustomerID'])
if 'Gender' in data_clean:
    # Normalize the known 'Fe Male' typo in the raw data
    data_clean['Gender'] = data_clean['Gender'].replace('Fe Male', 'Female')

# Define features and target
X = data_clean.drop('ProdTaken', axis=1)
y = data_clean['ProdTaken']

# Define preprocessing steps (consistent with train.py)
num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
            'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
            'NumberOfChildrenVisiting', 'MonthlyIncome']
cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
            'MaritalStatus', 'Designation', 'CityTier']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

# Create the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Train the pipeline
pipeline.fit(X, y)

# Extract the post-preprocessing column names AFTER training.
# Pipeline.fit fits its steps in place, so `preprocessor` is already fitted.
# Do NOT re-fit it on an empty dummy DataFrame: that either raises
# (StandardScaler cannot fit 0 samples) or desynchronizes the preprocessor
# from the pipeline being saved. Likewise, a manual loop over
# preprocessor.transformers_ would mishandle the 'remainder' entry, whose
# column list holds integer indices rather than names.
columns = preprocessor.get_feature_names_out().tolist()

os.makedirs('/content/models', exist_ok=True)
joblib.dump(columns, '/content/models/columns.joblib')
joblib.dump(pipeline, '/content/models/best_rf_model.joblib')
tests/test_model.py CHANGED
@@ -12,5 +12,6 @@ def test_columns_exists():
12
  assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
13
 
14
  def test_model_loads():
15
- model = joblib.load(os.getenv('MODEL_PATH', 'models/model.joblib'))
 
16
  assert model is not None, "Failed to load model"
 
12
  assert os.path.exists(columns_path), f"Columns file not found at {columns_path}"
13
 
14
def test_model_loads():
    """The persisted model artifact exists and can be deserialized.

    Checks existence first (matching the style of test_columns_exists) so a
    missing artifact fails with an actionable message instead of a raw
    FileNotFoundError from joblib.load.
    """
    model_path = os.getenv('MODEL_PATH', 'models/model.joblib')
    assert os.path.exists(model_path), f"Model file not found at {model_path}"
    model = joblib.load(model_path)
    assert model is not None, "Failed to load model"