Spaces:

03chrisk
/

air_quality_test

Build error

App Files Files Community

03chrisk committed on Sep 19, 2024

Commit

6a9e30e

1 Parent(s): 9fcc62f

creating a pipeline

Browse files

Files changed (5) hide show

air-quality-forecast/data_pipeline.py +90 -16
air-quality-forecast/utils.py +30 -0
data/processed/v1_merged_weather_data.csv +0 -0
data/processed/v2_merged_selected_features_with_missing.csv +0 -0
notebooks/n2_exploratory_data_analysis.ipynb +0 -0

air-quality-forecast/data_pipeline.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import pandas as pd
 import os
 class PreprocessingPipeline:
     def __init__(self):
@@ -10,51 +12,123 @@ class PreprocessingPipeline:
         :param raw_data_path: Path to the raw data directory
         :param processed_data_path: Path to the processed data directory
         """
         project_root = os.path.dirname(os.path.dirname(__file__))
-        raw_data_path = os.path.join(project_root, 'data', 'raw')
-        print(raw_data_path)
-        for file in os.listdir(raw_data_path):
-            print(file)
     def load_raw_data(self):
         """
         Load the raw data from the specified path.
         :return: Raw data as a Pandas DataFrame
         """
-        # TO DO: Implement loading of raw data
-        pass
-    def merge_raw_data(self, raw_data):
         """
         Merge the raw data with additional data from the specified path.
         :param raw_data: Raw data as a Pandas DataFrame
         :return: Merged data as a Pandas DataFrame
         """
-        # TO DO: Implement merging of raw data
-        pass
-    def preprocess_data(self, raw_data):
         """
         Preprocess the raw data according to the steps outlined in the notebooks.
         :param raw_data: Raw data as a Pandas DataFrame
         :return: Preprocessed data as a Pandas DataFrame
         """
-        # TO DO: Implement preprocessing steps
-        pass
-    def save_to_csv(self, name, data):
         """
         Save the preprocessed data to the specified path.
         :param preprocessed_data: Preprocessed data as a Pandas DataFrame
         """
-        # TO DO: Implement saving of preprocessed data
-        pass
     def run_pipeline(self):
         """

 import pandas as pd
 import os
+from utils import FeatureSelector
 class PreprocessingPipeline:
     def __init__(self):
         :param raw_data_path: Path to the raw data directory
         :param processed_data_path: Path to the processed data directory
         """
+        # Global project root path
         project_root = os.path.dirname(os.path.dirname(__file__))
+        # Path to the raw data directory
+        self.raw_data_path = os.path.join(project_root, 'data', 'raw')
+        # Path to the processed data directory
+        self.processed_data_path = os.path.join(project_root, 'data', 'processed')
+        # Initializing the raw datasets
+        self.raw_griftpark_data, self.raw_utrecht_data = self.load_raw_data()
+        # Initializing the merged dataset
+        self.merged_data = self.merge_raw_data()
     def load_raw_data(self):
         """
         Load the raw data from the specified path.
         :return: Raw data as a Pandas DataFrame
         """
+        # Load the first data file
+        raw_griftpark_data = pd.read_csv(os.path.join(self.raw_data_path, 'v1_raw_griftpark,-utrecht-air-quality.csv'))
+        # Load the second data file
+        raw_utrecht_data = pd.read_csv(os.path.join(self.raw_data_path, 'v1_utrecht 2014-01-29 to 2024-09-11.csv'))
+        return raw_griftpark_data, raw_utrecht_data
+    def merge_raw_data(self):
         """
         Merge the raw data with additional data from the specified path.
         :param raw_data: Raw data as a Pandas DataFrame
         :return: Merged data as a Pandas DataFrame
         """
+        raw_additional_data = self.raw_utrecht_data
+        griftpark_data = self.raw_griftpark_data
+        # Parse the 'datetime' column (format 'YYYY-MM-DD') and re-format it as 'dd/mm/yyyy' strings; it is the join key against the Griftpark 'date' column below
+        raw_additional_data['datetime'] = pd.to_datetime(raw_additional_data['datetime'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')
+        # Merge the additional data with the raw data
+        merged_df = pd.merge(griftpark_data, raw_additional_data, left_on='date', right_on='datetime')
+        # Save the merged data
+        self.save_to_csv('v1_merged_weather_data.csv', merged_df, self.processed_data_path)
+        return merged_df
+    def select_features(self, data):
+        """
+        Select the relevant features from the raw data.
+        :param data: Raw data as a Pandas DataFrame
+        :return: Data with selected features as a Pandas DataFrame
+        """
+        #Remove textual/uninformative features
+        cols_to_drop = FeatureSelector.uninformative_columns()
+        data.drop(cols_to_drop, axis=1, inplace=True)
+        #Rename wrongly named columns
+        data = FeatureSelector.rename_initial_columns(data)
+        #Convert columns to numeric
+        data = FeatureSelector.change_to_numeric(data)
+        #Calculate correlations between features and O3/NO2
+        selected_columns = FeatureSelector.select_cols_by_correlation(data)
+        #Add domain knowledge columns
+        domain_knowledge_columns = ['precip','windspeed', 'winddir']
+        selected_columns = selected_columns + domain_knowledge_columns
+        return data[selected_columns]
+    def apply_time_shift(self, data, t = 3):
+        """
+        Applies the time shift to the dataset and adds the shifted columns.
+        """
+        all_cols = data.columns
+        for t in range(1,t+1):
+            for col in all_cols:
+                data[[f'{col} - day {t}']] = data[[col]].shift(-t)
+        for t in range(1,t):
+            for col in ['o3', 'no2']:
+                data[[f'{col} + day {t}']] = data[[col]].shift(t)
+        data[data.columns] = data[data.columns].apply(pd.to_numeric)
+        return data
+    def preprocess_data(self, data):
         """
         Preprocess the raw data according to the steps outlined in the notebooks.
         :param raw_data: Raw data as a Pandas DataFrame
         :return: Preprocessed data as a Pandas DataFrame
         """
+        data = self.select_features(data)
+        data = self.apply_time_shift(data)
+        data.drop(['pm25','pm10','temp','humidity','visibility','solarradiation','precip','windspeed','winddir'], axis=1, inplace=True)
+        data.drop(index=['29/01/2014','30/01/2014','31/01/2014', '10/09/2024', '11/09/2024'], inplace=True)
+        return data
+    def save_to_csv(self, name, data, path):
         """
         Save the preprocessed data to the specified path.
         :param preprocessed_data: Preprocessed data as a Pandas DataFrame
         """
+        data.to_csv(os.path.join(path, name))
     def run_pipeline(self):
         """

air-quality-forecast/utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+File with utilities
+"""
+class FeatureSelector:
+    def uninformative_columns() -> list:
+        """ Those columns provide no information that the model can use"""
+        return ["Unnamed: 0", 'name', 'datetime', 'sunrise', 'sunset', 'preciptype', 'conditions', 'description', 'icon', 'stations']
+    def rename_initial_columns(data):
+        """ Rename the columns of the datasets to remove whitespaces."""
+        data = data.rename(columns={" pm25": "pm25", " pm10": "pm10", " o3": "o3", " no2": "no2", " so2": "so2"})
+        return data
+    def change_to_numeric(data):
+        """ Change each entry to a numerical value."""
+        data.loc[:, data.columns != 'date'] = data.loc[:, data.columns != 'date'].apply(pd.to_numeric, errors='coerce')
+        return data
+    def select_cols_by_correlation(data) -> list:
+        """ Select columns based on correlation criteria."""
+        #Step 1: Calculate correlations between features and O3/NO2
+        corr_no2 = abs(data.loc[:, data.columns != 'date'].corr()['no2'])
+        corr_o3 = abs(data.loc[:, data.columns != 'date'].corr()['o3'])
+        #Step 2: Remove the columns not correlated with any of the labels
+        columns_above_threshold = (corr_no2 > 0.3) | (corr_o3 > 0.3)
+        selected_columns = columns_above_threshold[columns_above_threshold].index
+        #Step 3: Remove the columns with high correlations with each other (chosen by manual inspection of the correlation matrix)
+        to_remove = ['feelslikemax', 'feelslikemin', 'feelslike', 'tempmin', 'tempmax', 'dew', 'solarenergy', 'uvindex']
+        selected_columns = [item for item in selected_columns if item not in to_remove]
+        return selected_columns

data/processed/v1_merged_weather_data.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/processed/v2_merged_selected_features_with_missing.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/n2_exploratory_data_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff