# AQI_Predictor_Qamar / append_and_clean_historical_data.py
# github-actions[bot] — Automated backend deployment for 2026-04-05 (commit 23bb02f)
import pandas as pd
import sys
# --- Configuration ---
# Rolling multi-year history CSV that this script reads and overwrites in place.
MAIN_HISTORICAL_FILE = "data/karachi_daily_data_5_years.csv"
# Latest daily records to be merged into the history.
# NOTE(review): paths are relative — presumably run from the repo root; verify in CI.
NEW_DAILY_DATA_FILE = "data/last_7_days_daily_data.csv"
TIMEZONE = 'Asia/Karachi' # Define timezone as a constant for consistency
def _standardize_timezone(timestamps, tz):
    """Return the given datetime Series as tz-aware in *tz*.

    Naive timestamps are localized (stamped with *tz*); already-aware
    timestamps are converted. Both input frames must pass through this
    before merging, because pandas raises when naive and aware datetimes
    are mixed in concat/sort operations.
    """
    if timestamps.dt.tz is None:
        # Naive input: assume it already represents local time in *tz*.
        return timestamps.dt.tz_localize(tz)
    # Aware input: convert to the canonical timezone.
    return timestamps.dt.tz_convert(tz)


def append_and_clean_historical_data(main_file, new_data_file, tz='Asia/Karachi'):
    """
    Efficiently append new daily data to the main historical dataset.

    Idempotent: loads both files, standardizes their 'timestamp' columns
    to one timezone, concatenates, resolves overlapping timestamps by
    keeping the newest record, and writes the clean, sorted history back
    to ``main_file``.

    Parameters
    ----------
    main_file : str
        Path to the main historical CSV; read and overwritten in place.
    new_data_file : str
        Path to the CSV holding the latest daily records to merge in.
    tz : str, optional
        Target timezone for all timestamps. The default matches the
        module-level ``TIMEZONE`` constant ('Asia/Karachi').

    Side effects: prints progress; exits the process with status 1 when
    ``main_file`` is missing; returns silently when ``new_data_file`` is.
    """
    print("--- Starting historical data update process ---")

    # --- Step 1: Load the main historical dataset ---
    try:
        df_main = pd.read_csv(main_file, parse_dates=['timestamp'])
        df_main['timestamp'] = _standardize_timezone(df_main['timestamp'], tz)
        print(f"Loaded and standardized {len(df_main)} records from the main historical file.")
    except FileNotFoundError:
        # BUG FIX: the original message referenced new_data_file here,
        # mislabeling which file is missing in the fatal-error path.
        print(f"!!! ERROR: Main historical file '{main_file}' not found. Aborting.")
        sys.exit(1)

    # --- Step 2: Load the new daily data ---
    try:
        df_new = pd.read_csv(new_data_file, parse_dates=['timestamp'])
        df_new['timestamp'] = _standardize_timezone(df_new['timestamp'], tz)
        print(f"Loaded and standardized {len(df_new)} new daily records to be merged.")
    except FileNotFoundError:
        # Non-fatal: without new data there is simply nothing to merge.
        print(f"!!! ERROR: New daily data file '{new_data_file}' not found. Aborting.")
        return

    # --- Step 3: Combine and de-duplicate ---
    print("Combining datasets and removing duplicates...")
    # Both frames are tz-aware now, so concat/sort cannot fail on mixed tz.
    df_combined = pd.concat([df_main, df_new], ignore_index=True)
    # Stable sort + keep='last' guarantees records from df_new win on overlap.
    df_final = df_combined.sort_values('timestamp').drop_duplicates(subset=['timestamp'], keep='last')
    print(f"-> Combined records: {len(df_combined)}")
    print(f"-> Records after de-duplication: {len(df_final)}")

    # --- Step 4: Save the updated historical dataset ---
    df_final.to_csv(main_file, index=False)
    print("\n✅ --- SUCCESS --- ✅")
    print(f"Main historical dataset '{main_file}' has been updated.")
    print(f"It now contains {len(df_final)} clean, unique daily records.")
# --- Entry point: merge the configured daily file into the history ---
if __name__ == "__main__":
    append_and_clean_historical_data(MAIN_HISTORICAL_FILE, NEW_DAILY_DATA_FILE)