import sys

import pandas as pd

# --- Configuration ---
MAIN_HISTORICAL_FILE = "data/karachi_daily_data_5_years.csv"
NEW_DAILY_DATA_FILE = "data/last_7_days_daily_data.csv"
TIMEZONE = 'Asia/Karachi'  # Single source of truth for the timezone of all timestamps


def _standardize_timestamps(df, tz=TIMEZONE):
    """Return *df* with its 'timestamp' column made timezone-aware in *tz*.

    Naive timestamps are localized to *tz*; timestamps that already carry a
    timezone are converted to *tz*. The column is replaced in place.
    """
    timestamps = df['timestamp']
    if timestamps.dt.tz is None:
        df['timestamp'] = timestamps.dt.tz_localize(tz)
    else:
        df['timestamp'] = timestamps.dt.tz_convert(tz)
    return df


def _load_standardized(path):
    """Read the CSV at *path* and standardize its 'timestamp' column.

    Raises:
        FileNotFoundError: if *path* does not exist (propagated from pandas).
    """
    return _standardize_timestamps(pd.read_csv(path, parse_dates=['timestamp']))


def append_and_clean_historical_data(main_file, new_data_file):
    """Merge new daily records into the main historical CSV.

    Both files are loaded with their 'timestamp' column standardized to
    TIMEZONE, concatenated (main rows first, new rows last), sorted by
    timestamp and de-duplicated on timestamp. When the same timestamp occurs
    more than once, the last occurrence wins, so a row from the new file
    overrides the matching row in the main file. The result is written back
    to *main_file*, which makes repeated runs with the same inputs idempotent.

    Args:
        main_file: Path of the historical CSV; it is read and then overwritten.
        new_data_file: Path of the CSV holding the new daily records.

    Returns:
        None. If *new_data_file* is missing, an error is printed and the
        function returns without touching *main_file*.

    Raises:
        SystemExit: with code 1 if *main_file* does not exist.
    """
    print("--- Starting historical data update process ---")

    # --- Step 1: Load the main historical dataset ---
    try:
        df_main = _load_standardized(main_file)
    except FileNotFoundError:
        # Report the file that is actually missing (the main file, not the new one).
        print(f"!!! ERROR: Main historical file '{main_file}' not found. Aborting.")
        sys.exit(1)
    print(f"Loaded and standardized {len(df_main)} records from the main historical file.")

    # --- Step 2: Load the new daily data ---
    try:
        df_new = _load_standardized(new_data_file)
    except FileNotFoundError:
        print(f"!!! ERROR: New daily data file '{new_data_file}' not found. Aborting.")
        return
    print(f"Loaded and standardized {len(df_new)} new daily records to be merged.")

    # --- Step 3: Combine and de-duplicate ---
    print("Combining datasets and removing duplicates...")
    # Both frames now share one tz-aware dtype, so concatenation is safe.
    df_combined = pd.concat([df_main, df_new], ignore_index=True)
    # A stable sort keeps rows with equal timestamps in concatenation order
    # (main first, new last); keep='last' therefore retains the new record.
    # The default quicksort gives no such guarantee.
    df_final = (
        df_combined
        .sort_values('timestamp', kind='stable')
        .drop_duplicates(subset=['timestamp'], keep='last')
    )

    print(f"-> Combined records: {len(df_combined)}")
    print(f"-> Records after de-duplication: {len(df_final)}")

    # --- Step 4: Save the updated historical dataset ---
    df_final.to_csv(main_file, index=False)

    print("\nāœ… --- SUCCESS --- āœ…")
    print(f"Main historical dataset '{main_file}' has been updated.")
    print(f"It now contains {len(df_final)} clean, unique daily records.")


# --- Run the script ---
if __name__ == "__main__":
    append_and_clean_historical_data(MAIN_HISTORICAL_FILE, NEW_DAILY_DATA_FILE)