# tourism-rf-model/src/train.py
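"""Training script for the tourism-rf-model Space: fits a RandomForest
pipeline on the tourism dataset and saves the artifacts under models/."""
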
import os

import joblib
import pandas as pd
from datasets import load_dataset
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


def train_model():
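    """Load the data, fit a preprocessing + RandomForest pipeline, and save it."""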
    # Feature groups, shared by both data sources below.
    num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
                'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
                'NumberOfChildrenVisiting', 'MonthlyIncome']
    cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
                'MaritalStatus', 'Designation', 'CityTier']

    # Prefer a local CSV; fall back to the Hugging Face dataset when it is absent.
    data_path = os.getenv('DATA_PATH', 'data/processed.csv')
    if not os.path.exists(data_path):
        print(f"Data file not found at {data_path}; falling back to the Hugging Face dataset.")
        try:
            dataset = load_dataset("Shramik121/tourism-split-dataset")
            data = dataset['train'].to_pandas()
            # Drop the stray index column that to_csv leaves behind.
            if 'Unnamed: 0' in data.columns:
                data = data.drop('Unnamed: 0', axis=1)
            # CustomerID is an identifier, not a feature.
            if 'CustomerID' in data.columns:
                data = data.drop('CustomerID', axis=1)
            # Normalise the mislabelled 'Fe Male' category.
            if 'Gender' in data.columns:
                data['Gender'] = data['Gender'].replace('Fe Male', 'Female')
            print("Loaded data from Hugging Face dataset.")
        except Exception as e:
            print(f"Failed to load data: {e}")
            return
    else:
        data = pd.read_csv(data_path)
        print(f"Loaded data from {data_path}")

    # Impute missing values: column medians for numeric features,
    # an explicit 'Unknown' level for categorical features.
    data[num_cols] = data[num_cols].fillna(data[num_cols].median())
    data[cat_cols] = data[cat_cols].fillna('Unknown')

    # ProdTaken is the prediction target; everything else is a feature.
    X = data.drop(columns=['ProdTaken'])
    y = data['ProdTaken']
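
    # Scale numeric features, one-hot encode categoricals (ignoring levels
    # unseen during training), and pass any remaining columns through.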
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ],
        remainder='passthrough'
    )
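
    # Chain preprocessing and the classifier so a single saved object can
    # later be fed raw, untransformed rows.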
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Fit the pipeline on the training data.
    pipeline.fit(X, y)

    # Recover post-transform feature names from the fitted preprocessor.
    # The 'remainder' entry stores positional column indices, not names,
    # so map them back through X.columns.
    feature_names = []
    for name, transformer, cols in pipeline.named_steps['preprocessor'].transformers_:
        if name == 'remainder':
            feature_names.extend(X.columns[cols])
        elif hasattr(transformer, 'get_feature_names_out'):
            feature_names.extend(transformer.get_feature_names_out(cols))
        else:
            feature_names.extend(cols)
    columns = feature_names

    # Persist the fitted pipeline and the transformed column names.
    os.makedirs('models', exist_ok=True)
    joblib.dump(columns, 'models/columns.joblib')
    joblib.dump(pipeline, os.getenv('MODEL_OUTPUT', 'models/model.joblib'))
    print("Model and columns saved to models/")


if __name__ == "__main__":
    train_model()
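

# Consumer sketch (assumed usage, not part of training): load the saved
# pipeline and score raw rows with the same columns used for training.
#   model = joblib.load('models/model.joblib')
#   new_rows = pd.read_csv('data/processed.csv').drop(columns=['ProdTaken'])
#   print(model.predict_proba(new_rows)[:, 1])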