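# Scraped page header (Hugging Face Spaces) -- not part of the program:
#   Space: p2ov/streamlit_app (status: Sleeping)
#   File size: 3,125 Bytes
#   Revision shown on the page: 12161ea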

import pandas as pd
import os


# Step 1: Extract
def extract_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Returns:
        The parsed DataFrame on success. If anything goes wrong while
        reading, the error is printed to stdout and ``None`` is returned
        instead of raising.
    """
    try:
        frame = pd.read_csv(file_path)
        print("Data extraction successful.")
    except Exception as exc:
        # Best-effort contract: callers check for None rather than catching.
        print(f"Error in data extraction: {exc}")
        frame = None
    return frame


# Step 2: Transform
def transform_data(data, tax_rate=0.1):
    """Clean the data and derive the ``tax`` and ``net_salary`` columns.

    Rows containing any missing value are dropped. Two columns are then
    added to the cleaned copy: ``tax`` (``salary * tax_rate``) and
    ``net_salary`` (``salary - tax``). The input frame is left untouched.

    Args:
        data: DataFrame that must contain a ``salary`` column.
        tax_rate: Flat tax rate applied to ``salary``. Defaults to ``0.1``
            (10%), the rate that was previously hard-coded.

    Returns:
        A new DataFrame with the added columns, or ``None`` if the
        transformation fails (for example when ``salary`` is missing);
        the error is printed to stdout in that case.
    """
    try:
        # dropna() already returns a new object; the explicit copy makes it
        # unambiguous that the assignments below never touch ``data``.
        data_cleaned = data.dropna().copy()

        data_cleaned["tax"] = data_cleaned["salary"] * tax_rate
        data_cleaned["net_salary"] = data_cleaned["salary"] - data_cleaned["tax"]

        print("Data transformation successful.")
        return data_cleaned
    except Exception as e:
        # Best-effort contract: callers check for None rather than catching.
        print(f"Error in data transformation: {e}")
        return None


# # Step 3: Load
# def load_data(data, output_file_path):
#     """Loads the transformed data into a new CSV file."""
#     try:
#         data.to_csv(output_file_path, index=False)
#         print(f"Data loaded successfully to {output_file_path}.")
#     except Exception as e:
#         print(f"Error in data loading: {e}")


# # Main ETL function
# def etl_process(input_file, output_file):
#     data = extract_data(input_file)
#     if data is not None:
#         transformed_data = transform_data(data)
#         if transformed_data is not None:
#             load_data(transformed_data, output_file)


# if __name__ == "__main__":
#     input_file = "input_data.csv"
#     output_file = "output_data.csv"
#     etl_process(input_file, output_file)


# Step 3: Load
def load_data(data, output_file_path):
    """Write ``data`` to ``output_file_path`` as CSV (without the index).

    The parent directory of the output file is created when it does not
    exist yet. A path with no directory component (e.g. ``"out.csv"``) is
    written to the current working directory.

    Args:
        data: Object exposing ``to_csv`` (a pandas DataFrame).
        output_file_path: Destination path of the CSV file.

    Returns:
        ``None``. Failures are not raised; the error is printed to stdout.
    """
    try:
        # Make sure the destination directory exists. dirname() is "" for a
        # bare filename, and os.makedirs("") raises, so skip that case.
        output_dir = os.path.dirname(output_file_path)
        if output_dir and not os.path.isdir(output_dir):
            # exist_ok guards against the directory appearing between the
            # check above and the creation.
            os.makedirs(output_dir, exist_ok=True)
            print(f"📂 Created missing directory: {output_dir}")

        # Save the file.
        data.to_csv(output_file_path, index=False)
        print(f"✅ Data loaded successfully to {output_file_path}.")
    except Exception as e:
        print(f"❌ Error in data loading: {e}")


# Main ETL function
def etl_process(input_file, output_file):
    """Run extract -> transform -> load from ``input_file`` to ``output_file``.

    Each stage runs only if the previous one produced a result (the
    extract and transform helpers return ``None`` on failure). The start
    and completion banners are printed regardless of the outcome.
    """
    print("🚀 Starting ETL Process...")

    raw = extract_data(input_file)
    # Skip the transform entirely when extraction produced nothing.
    transformed = transform_data(raw) if raw is not None else None
    if transformed is not None:
        load_data(transformed, output_file)

    print("✅ ETL Process Completed!")


if __name__ == "__main__":
    # The input CSV must already be present under `data/`; the result is
    # written to the same `data/` directory.
    etl_process(
        input_file="data/input_data.csv",
        output_file="data/output_data.csv",
    )