Spaces:

Canstralian
/

PyLintPro

Paused

App Files Files Community

Canstralian committed on Feb 2, 2025

Commit

d58fb21

verified ·

1 Parent(s): 4c5305b

Synced repo using 'sync_with_huggingface' Github Action

Browse files

Files changed (1) hide show

data/data_preprocessing.py +41 -0

data/data_preprocessing.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import pandas as pd
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import LabelEncoder
+def load_data(file_path):
+    """Load dataset from a CSV file."""
+    return pd.read_csv(file_path)
+def handle_missing_values(df):
+    """Handle missing values in the dataset."""
+    # Impute numerical columns with the median
+    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
+    imputer = SimpleImputer(strategy='median')
+    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
+    # Impute categorical columns with the most frequent value
+    categorical_cols = df.select_dtypes(include=['object']).columns
+    imputer = SimpleImputer(strategy='most_frequent')
+    df[categorical_cols] = imputer.fit_transform(df[categorical_cols])
+    return df
+def encode_categorical_variables(df):
+    """Encode categorical variables using Label Encoding."""
+    categorical_cols = df.select_dtypes(include=['object']).columns
+    label_encoder = LabelEncoder()
+    for col in categorical_cols:
+        df[col] = label_encoder.fit_transform(df[col])
+    return df
+def preprocess_data(file_path):
+    """Load, preprocess, and return the dataset."""
+    df = load_data(file_path)
+    df = handle_missing_values(df)
+    df = encode_categorical_variables(df)
+    return df
+if __name__ == "__main__":
+    file_path = 'path_to_your_data.csv'  # Replace with your actual file path
+    processed_data = preprocess_data(file_path)
+    processed_data.to_csv('processed_data.csv', index=False)