Spaces:
Sleeping
Sleeping
File size: 3,125 Bytes
12161ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import pandas as pd
import os
# Step 1: Extract
def extract_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails. The failure
        is reported on stdout rather than raised, so callers must check
        for ``None``.
    """
    try:
        frame = pd.read_csv(file_path)
        print("Data extraction successful.")
    except Exception as err:
        # Best-effort contract: report the problem and signal it with None.
        print(f"Error in data extraction: {err}")
        frame = None
    return frame
# Step 2: Transform
def transform_data(data, tax_rate=0.1):
    """Clean the data and derive the ``tax`` and ``net_salary`` columns.

    Rows containing any missing value are dropped. Two columns are then
    added to the cleaned copy: ``tax`` (``salary * tax_rate``) and
    ``net_salary`` (``salary - tax``). The input frame is left unmodified.

    Args:
        data: DataFrame that must contain a ``salary`` column supporting
            arithmetic.
        tax_rate: Flat tax rate applied to ``salary``. Defaults to ``0.1``
            (10%), the rate that was previously hard-coded.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation
        fails (for example, the ``salary`` column is missing). The failure
        is reported on stdout rather than raised.
    """
    try:
        # Work on an explicit copy so the caller's frame is never mutated.
        data_cleaned = data.dropna().copy()
        data_cleaned.loc[:, "tax"] = data_cleaned["salary"] * tax_rate
        data_cleaned.loc[:, "net_salary"] = (
            data_cleaned["salary"] - data_cleaned["tax"]
        )
        print("Data transformation successful.")
        return data_cleaned
    except Exception as e:
        # Best-effort contract: report the problem and signal it with None.
        print(f"Error in data transformation: {e}")
        return None
# # Step 3: Load
# def load_data(data, output_file_path):
# """Loads the transformed data into a new CSV file."""
# try:
# data.to_csv(output_file_path, index=False)
# print(f"Data loaded successfully to {output_file_path}.")
# except Exception as e:
# print(f"Error in data loading: {e}")
# # Main ETL function
# def etl_process(input_file, output_file):
# data = extract_data(input_file)
# if data is not None:
# transformed_data = transform_data(data)
# if transformed_data is not None:
# load_data(transformed_data, output_file)
# if __name__ == "__main__":
# input_file = "input_data.csv"
# output_file = "output_data.csv"
# etl_process(input_file, output_file)
# Step 3: Load
def load_data(data, output_file_path):
    """Write the transformed data to a CSV file, creating its directory.

    Args:
        data: DataFrame to save; written without the index column.
        output_file_path: Destination CSV path. May be a bare filename
            (written to the current working directory) or include parent
            directories, which are created when missing.

    Returns:
        None. Any failure is reported on stdout rather than raised.
    """
    try:
        # Make sure the destination directory exists. A bare filename has an
        # empty dirname: there is nothing to create, and os.makedirs("")
        # would raise FileNotFoundError and abort the save.
        output_dir = os.path.dirname(output_file_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)
            print(f"📂 Created missing directory: {output_dir}")
        # Save the file.
        data.to_csv(output_file_path, index=False)
        print(f"✅ Data loaded successfully to {output_file_path}.")
    except Exception as e:
        print(f"❌ Error in data loading: {e}")
# Main ETL function
def etl_process(input_file, output_file):
    """Run the extract -> transform -> load pipeline end to end.

    The pipeline stops quietly as soon as a stage returns ``None``; each
    stage prints its own error message.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
    """
    print("🚀 Starting ETL Process...")

    raw = extract_data(input_file)
    if raw is None:
        return

    transformed = transform_data(raw)
    if transformed is None:
        return

    load_data(transformed, output_file)
    print("✅ ETL Process Completed!")
if __name__ == "__main__":
    # Both paths live under `data/`: the input CSV must already be there,
    # and the result is saved alongside it.
    input_file, output_file = "data/input_data.csv", "data/output_data.csv"
    etl_process(input_file, output_file)
|