tourism-rf-model / src / data_prep.py
Shramik121's picture
Upload model and application files to Hugging Face Space
343148c verified
import pandas as pd
from datasets import load_dataset
import os
def _clean_split(frame, num_cols, cat_cols, medians):
    """Apply the shared cleaning steps to one dataset split and return it.

    Args:
        frame: DataFrame holding one split. Every column named in
            ``num_cols`` and ``cat_cols`` must be present, otherwise
            pandas raises ``KeyError``.
        num_cols: Names of the numeric columns to impute.
        cat_cols: Names of the categorical columns to impute.
        medians: Per-column fill values for ``num_cols`` (a Series indexed
            by column name).

    Returns:
        The cleaned DataFrame.
    """
    # 'Unnamed: 0' is presumably a saved-index artefact; neither it nor
    # 'CustomerID' is guaranteed to exist, so missing ones are ignored.
    frame = frame.drop(columns=['Unnamed: 0', 'CustomerID'], errors='ignore')
    frame[num_cols] = frame[num_cols].fillna(medians)
    frame[cat_cols] = frame[cat_cols].fillna('Unknown')
    if 'Gender' in frame.columns:
        # Fold the misspelled category into the regular one.
        frame['Gender'] = frame['Gender'].replace('Fe Male', 'Female')
    return frame


def prepare_data() -> None:
    """Download the tourism dataset, clean both splits and save them as CSV.

    The train and test splits of ``Shramik121/tourism-split-dataset`` are
    loaded, cleaned with identical steps, and written to ``processed.csv``
    (train) and ``test.csv`` (test) inside the directory named by the
    ``OUTPUT_DIR`` environment variable (default ``data``). The directory is
    created when it does not exist.

    Raises:
        KeyError: If a split lacks one of the expected numeric or
            categorical columns.
    """
    dataset = load_dataset("Shramik121/tourism-split-dataset")

    num_cols = ['Age', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups',
                'PreferredPropertyStar', 'NumberOfTrips', 'PitchSatisfactionScore',
                'NumberOfChildrenVisiting', 'MonthlyIncome']
    cat_cols = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
                'MaritalStatus', 'Designation', 'CityTier']

    train = pd.DataFrame(dataset['train'])
    test = pd.DataFrame(dataset['test'])

    # Medians come from the train split only and are reused for the test
    # split, so no statistic is derived from the test data.
    train_medians = train[num_cols].median()

    train = _clean_split(train, num_cols, cat_cols, train_medians)
    test = _clean_split(test, num_cols, cat_cols, train_medians)

    # Write only after both splits were cleaned successfully, so a failure
    # does not leave a partial set of output files behind.
    output_dir = os.getenv('OUTPUT_DIR', 'data')
    os.makedirs(output_dir, exist_ok=True)
    train.to_csv(os.path.join(output_dir, 'processed.csv'), index=False)
    test.to_csv(os.path.join(output_dir, 'test.csv'), index=False)
    print("Data prepared and saved to", output_dir)
if __name__ == "__main__":
    # Run the preparation step only when executed as a script, not on import.
    prepare_data()