# Spaces: Running
# File size: 3,154 Bytes
# 23bb02f  (looks like a commit id from the page header)
import pandas as pd
import sys
# --- Configuration ---
# CSV that holds the accumulated history; it is read and then overwritten.
MAIN_HISTORICAL_FILE: str = "data/karachi_daily_data_5_years.csv"
# CSV with the most recent daily rows that get merged into the history.
NEW_DAILY_DATA_FILE: str = "data/last_7_days_daily_data.csv"
# Single timezone applied to every 'timestamp' column so both inputs agree.
TIMEZONE: str = "Asia/Karachi"
def _load_with_standardized_timestamps(csv_path):
    """Read *csv_path* and return its rows with 'timestamp' expressed in TIMEZONE.

    A timezone-naive 'timestamp' column is localized to TIMEZONE; a
    timezone-aware one is converted to it. FileNotFoundError from
    ``pd.read_csv`` is deliberately left for the caller to handle.
    """
    frame = pd.read_csv(csv_path, parse_dates=['timestamp'])
    stamps = frame['timestamp']
    if stamps.dt.tz is None:
        frame['timestamp'] = stamps.dt.tz_localize(TIMEZONE)
    else:
        frame['timestamp'] = stamps.dt.tz_convert(TIMEZONE)
    return frame


def append_and_clean_historical_data(main_file, new_data_file):
    """Merge new daily rows into the main historical CSV and rewrite it.

    Both files are loaded with their 'timestamp' column standardized to
    TIMEZONE, concatenated (main rows first, new rows second), sorted by
    'timestamp', and de-duplicated on 'timestamp' keeping the last row of
    each group. Because the sort is stable, the row coming from
    *new_data_file* wins whenever both files contain the same timestamp.
    Re-running with the same new-data file therefore does not add
    duplicate rows.

    Args:
        main_file: Path of the historical CSV; it is read and then
            overwritten with the merged result.
        new_data_file: Path of the CSV holding the new daily rows.

    Returns:
        None. If *new_data_file* does not exist, an error is printed and
        the function returns without modifying *main_file*.

    Raises:
        SystemExit: With status 1 if *main_file* does not exist.
    """
    print("--- Starting historical data update process ---")

    # --- Step 1: Load the main historical dataset ---
    try:
        df_main = _load_with_standardized_timestamps(main_file)
    except FileNotFoundError:
        # Report the file that is actually missing (the main one).
        print(f"!!! ERROR: Main historical file '{main_file}' not found. Aborting.")
        sys.exit(1)
    print(f"Loaded and standardized {len(df_main)} records from the main historical file.")

    # --- Step 2: Load the new daily data ---
    try:
        df_new = _load_with_standardized_timestamps(new_data_file)
    except FileNotFoundError:
        # Nothing to merge: leave the historical file untouched.
        print(f"!!! ERROR: New daily data file '{new_data_file}' not found. Aborting.")
        return
    print(f"Loaded and standardized {len(df_new)} new daily records to be merged.")

    # --- Step 3: Combine and de-duplicate ---
    print("Combining datasets and removing duplicates...")
    # Both frames now carry tz-aware timestamps in the same zone.
    df_combined = pd.concat([df_main, df_new], ignore_index=True)
    # A stable sort keeps main-before-new order among equal timestamps, so
    # keep='last' reliably retains the row from the new-data file. The
    # default quicksort gives no such guarantee for ties.
    df_final = (
        df_combined
        .sort_values('timestamp', kind='mergesort')
        .drop_duplicates(subset=['timestamp'], keep='last')
    )
    print(f"-> Combined records: {len(df_combined)}")
    print(f"-> Records after de-duplication: {len(df_final)}")

    # --- Step 4: Save the updated historical dataset ---
    df_final.to_csv(main_file, index=False)
    print("\n✅ --- SUCCESS --- ✅")
    print(f"Main historical dataset '{main_file}' has been updated.")
    print(f"It now contains {len(df_final)} clean, unique daily records.")
# --- Run the script ---
# Update the historical dataset only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    append_and_clean_historical_data(MAIN_HISTORICAL_FILE, NEW_DAILY_DATA_FILE)