File size: 1,768 Bytes
281128a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import login, HfApi

# Runtime configuration, all read from environment variables.
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = os.environ.get("DATASET_REPO", "dhani10/tourism-app-dataset")
CSV = os.environ.get("CSV_PATH", "tourism.csv")

# Abort before doing any work when the token is unset or empty.
if HF_TOKEN is None or HF_TOKEN == "":
    raise SystemExit("HF_TOKEN is required")

print(f"[data_prep] reading {CSV}")
df = pd.read_csv(CSV)

# Drop these columns when present; errors="ignore" skips any that are absent.
# NOTE(review): presumably a row identifier and a saved index rather than
# features -- confirm against the dataset.
df = df.drop(columns=["CustomerID", "Unnamed: 0"], errors="ignore")

# Fold the "Fe Male" spelling into "Female". Guarded like every other column
# below, so a CSV without a Gender column does not fail with a KeyError.
if "Gender" in df.columns:
    df["Gender"] = df["Gender"].replace("Fe Male", "Female")

# Columns whose missing values are filled with the column median.
for col in (
    "Age",
    "DurationOfPitch",
    "NumberOfTrips",
    "NumberOfFollowups",
    "MonthlyIncome",
    "PreferredPropertyStar",
):
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Columns whose missing values are filled with the most frequent value.
for col in (
    "TypeofContact",
    "Occupation",
    "Gender",
    "ProductPitched",
    "MaritalStatus",
    "Designation",
):
    if col not in df.columns:
        continue
    modes = df[col].mode()
    # mode() is empty when every value in the column is missing; indexing it
    # would raise, so such a column is left untouched.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Separate the features from the target; the target is cast to integers.
X = df.drop(columns=["ProdTaken"])
y = df["ProdTaken"].astype(int)

# Re-attach the target as the last column so each split is a single table,
# then take a seeded 80/20 split stratified on the target.
labelled = pd.concat([X, y], axis=1)
train_df, test_df = train_test_split(
    labelled,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Write both splits under ./artifacts (created on demand), without the index.
out = Path("artifacts")
out.mkdir(parents=True, exist_ok=True)
for frame, filename in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.to_csv(out / filename, index=False)

print("[data_prep] login + upload to HF dataset hub")
# NOTE(review): HfApi below is also given the token explicitly; confirm
# whether this separate login call is still needed.
login(HF_TOKEN)
api = HfApi(token=HF_TOKEN)

# Create the dataset repo; exist_ok=True tolerates one that already exists.
api.create_repo(repo_id=DATASET_REPO, repo_type="dataset", exist_ok=True)

# (local file name under `out`, destination path in the repo), in upload order.
uploads = (
    ("train.csv", "data/train.csv"),
    ("test.csv", "data/test.csv"),
)
for local_name, repo_path in uploads:
    api.upload_file(
        path_or_fileobj=str(out / local_name),
        path_in_repo=repo_path,
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

print("[data_prep] done")