| import pandas as pd | |
| from sqlalchemy import create_engine | |
| import os | |
def run_etl(
    file_path="data/T_ONTIME_REPORTING_20260228_131510/flights.csv",
    db_path="data/flights_database.db",
    sample_size=10000,
    random_state=None,
):
    """Load the flights CSV, clean and label it, and store a sample in SQLite.

    Pipeline steps:

    1. Read ``file_path`` into a DataFrame.
    2. Drop rows where ``ARR_DELAY`` or ``DEP_DELAY`` is missing and, when a
       ``CANCELLED`` column is present, keep only rows where it equals 0.
    3. Add an integer ``is_delayed`` column: 1 when ``ARR_DELAY > 15``,
       otherwise 0.
    4. Write a random sample of at most ``sample_size`` rows to the
       ``cleaned_flights`` table, replacing any existing table of that name.

    Progress messages are printed to stdout.

    Args:
        file_path: Path of the CSV file to read.
        db_path: Filesystem path of the SQLite database file. Its parent
            directory is created if it does not exist.
        sample_size: Upper bound on the number of rows written to the
            database; fewer rows are written when the cleaned data is smaller.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            sample can be reproduced. ``None`` keeps the sampling random.

    Returns:
        None. If ``file_path`` does not exist, an error message is printed
        and the function returns without touching the database.

    Raises:
        KeyError: If the CSV lacks the ``ARR_DELAY`` or ``DEP_DELAY`` column.
    """
    print("ETL Process now start...")

    # Keep the try body minimal: only the read can raise FileNotFoundError.
    try:
        df = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        print(f"Error: file not found: {file_path}")
        return
    print(f"Data Loaded successfuly,: {df.shape[0]} rows find it.")

    # Fail with a message that names the file and the absent columns instead
    # of the bare KeyError that dropna() would raise.
    required_columns = ['ARR_DELAY', 'DEP_DELAY']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Required column(s) missing from {file_path}: {missing_columns}"
        )

    df = df.dropna(subset=required_columns)
    if 'CANCELLED' in df.columns:
        df = df[df['CANCELLED'] == 0]

    # assign() builds a new frame rather than writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    df = df.assign(is_delayed=(df['ARR_DELAY'] > 15).astype(int))
    print("Cleaning aur Labeling completed,")

    # exist_ok=True avoids the check-then-create race of os.path.exists().
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    sample_df = df.sample(
        n=min(sample_size, len(df)),
        random_state=random_state,
    )

    # Create the engine only once there is data to write, and always release
    # its pooled connections afterwards.
    db_engine = create_engine(f"sqlite:///{db_path}")
    try:
        sample_df.to_sql(
            'cleaned_flights',
            con=db_engine,
            if_exists='replace',
            index=False,
        )
    finally:
        db_engine.dispose()

    print("Data saved to SQL Database,")
    print(f"Database Location: {db_path}")
# Script entry point: run the ETL pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_etl()