# NOTE: the next three lines are page-header text captured with the file, not code.
# Debashre2824's picture
# Upload Streamlit app and model
# 65f82fd verified
# for data manipulation
import pandas as pd
import sklearn
# for creating a folder
import os
# for data preprocessing and pipeline creation
from sklearn.model_selection import train_test_split
# for converting text data into a numerical representation
from sklearn.preprocessing import LabelEncoder
# for hugging face space authentication to upload files
from huggingface_hub import login, HfApi
# Location of the raw tourism CSV on the Hugging Face Hub.
DATASET_PATH = "hf://datasets/Debashre2824/tourism_predicton/tourism.csv"

# Hub client; the access token is read from the HF_TOKEN environment variable
# (None is passed when the variable is unset).
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Read the CSV from the Hub path into a DataFrame.
df = pd.read_csv(DATASET_PATH)
print("Dataset loaded successfully.")
# --- Data clean-up ---
# Remove the columns that are not kept as features:
#   'Unnamed: 0', 'CustomerID' - identifier columns
#   'Designation'              - intentionally excluded as well
# 'Designation' used to be label-encoded right before being dropped; that
# encoding was discarded immediately, so the step is omitted here. The
# resulting DataFrame is the same.
# drop() raises KeyError if any of the listed columns is missing.
df = df.drop(columns=["Unnamed: 0", "CustomerID", "Designation"])
# Name of the label column.
target_col = 'ProdTaken'

# Separate the label series from the feature frame.
y = df[target_col]
X = df.drop(target_col, axis="columns")

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42, test_size=0.2)

# Write each split to a CSV file in the working directory, without the index.
for _csv_name, _split in (
    ("Xtrain.csv", Xtrain),
    ("Xtest.csv", Xtest),
    ("ytrain.csv", ytrain),
    ("ytest.csv", ytest),
):
    _split.to_csv(_csv_name, index=False)
# Upload every split file to the dataset repository on the Hugging Face Hub.
files = ["Xtrain.csv", "Xtest.csv", "ytrain.csv", "ytest.csv"]
for file_path in files:
    # Keep only the text after the last "/" so the upload uses the bare
    # file name as its path inside the repository.
    repo_filename = file_path.rpartition("/")[2]
    api.upload_file(
        repo_type="dataset",
        repo_id="Debashre2824/tourism_predicton",
        path_in_repo=repo_filename,
        path_or_fileobj=file_path,
    )