flightt / etl.py
saad1BM's picture
Upload 10 files
cc12750 verified
import pandas as pd
from sqlalchemy import create_engine
import os
def _clean_and_label(df):
    """Drop unusable rows from *df* and add the binary ``is_delayed`` column.

    Rows with a missing ``ARR_DELAY`` or ``DEP_DELAY`` are removed. When a
    ``CANCELLED`` column is present, only rows where it equals 0 are kept.
    ``is_delayed`` is 1 when ``ARR_DELAY`` is greater than 15, otherwise 0.

    Returns a new DataFrame; the input frame is not modified.
    """
    cleaned = df.dropna(subset=['ARR_DELAY', 'DEP_DELAY'])
    if 'CANCELLED' in cleaned.columns:
        cleaned = cleaned[cleaned['CANCELLED'] == 0]
    # assign() builds a new frame, so the label is never written into a
    # filtered slice (the pattern that triggers SettingWithCopyWarning).
    return cleaned.assign(is_delayed=(cleaned['ARR_DELAY'] > 15).astype(int))


def run_etl():
    """Load the flights CSV, clean and label it, and store a sample in SQLite.

    Steps:
      1. Read ``data/T_ONTIME_REPORTING_20260228_131510/flights.csv``.
      2. Drop rows without delay values, drop cancelled flights (when the
         ``CANCELLED`` column exists) and add the ``is_delayed`` label.
      3. Write a random sample of at most 10000 rows to the
         ``cleaned_flights`` table of ``data/flights_database.db``,
         replacing any existing table of that name.

    Progress and errors are reported with ``print``. The function returns
    ``None`` in every case; when the input file is missing, or the CSV lacks
    the ``ARR_DELAY`` / ``DEP_DELAY`` columns, it reports the problem and
    returns without touching the database.
    """
    file_path = "data/T_ONTIME_REPORTING_20260228_131510/flights.csv"
    print("ETL Process now start...")

    # Keep the try body to the single call that can raise FileNotFoundError.
    try:
        df = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        print(f"Error: file not found: {file_path}")
        return
    print(f"Data Loaded successfuly,: {df.shape[0]} rows find it.")

    # Fail with a readable message instead of a KeyError from dropna().
    missing = [col for col in ('ARR_DELAY', 'DEP_DELAY') if col not in df.columns]
    if missing:
        print(f"Error: required columns missing: {missing}")
        return

    df = _clean_and_label(df)
    print("Cleaning aur Labeling completed,")

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs('data', exist_ok=True)

    # The sample is intentionally unseeded, as in the original script.
    sample_size = min(10000, len(df))
    sample_df = df.sample(n=sample_size)

    # Create the engine only once there is something to write, and always
    # release its connection pool afterwards.
    db_engine = create_engine('sqlite:///data/flights_database.db')
    try:
        sample_df.to_sql('cleaned_flights', con=db_engine, if_exists='replace', index=False)
    finally:
        db_engine.dispose()

    print("Data saved to SQL Database,")
    print("Database Location: data/flights_database.db")
# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    run_etl()