Spaces:
Sleeping
Sleeping
File size: 915 Bytes
1e5b98a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# src/data_prep.py
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from .config import RAW_DATA_PATH, PROCESSED_DATA_DIR, TARGET_COLUMN, RANDOM_STATE
def main() -> None:
    """Split the raw dataset into stratified train/validation CSV files.

    Reads the CSV at ``RAW_DATA_PATH`` (parsing ``application_date`` as a
    date column), holds out 20% of the rows for validation using a split
    stratified on ``TARGET_COLUMN`` and seeded with ``RANDOM_STATE``, then
    writes ``train.csv`` and ``valid.csv`` into ``PROCESSED_DATA_DIR`` and
    prints where each file was saved along with its row count.
    """
    # Create the output directory (and any missing parents) up front.
    PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

    raw = pd.read_csv(RAW_DATA_PATH, parse_dates=["application_date"])

    # Stratifying on the target keeps its class proportions the same in
    # both partitions; the fixed seed makes the split reproducible.
    split_options = {
        "test_size": 0.2,
        "random_state": RANDOM_STATE,
        "stratify": raw[TARGET_COLUMN],
    }
    train_df, valid_df = train_test_split(raw, **split_options)

    train_path = PROCESSED_DATA_DIR / "train.csv"
    valid_path = PROCESSED_DATA_DIR / "valid.csv"

    # Persist both partitions before reporting, so nothing is printed
    # unless every write succeeded.
    for frame, destination in ((train_df, train_path), (valid_df, valid_path)):
        frame.to_csv(destination, index=False)

    print(f"Train guardado en: {train_path} ({len(train_df)} filas)")
    print(f"Valid guardado en: {valid_path} ({len(valid_df)} filas)")
# Script entry point: perform the split only when this module is executed
# directly, never as a side effect of being imported.
# NOTE(review): the module uses a relative import (`from .config import ...`),
# so it presumably has to be launched as part of its package
# (e.g. `python -m <package>.data_prep`) — confirm the intended invocation.
if __name__ == "__main__":
    main()
|