File size: 3,154 Bytes
23bb02f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import sys

# --- Configuration ---
# Path of the main historical dataset; the script entry point passes it as
# `main_file`, which is read and then overwritten with the merged result.
MAIN_HISTORICAL_FILE = "data/karachi_daily_data_5_years.csv"
# Path of the newly collected daily rows; passed as `new_data_file` and only read.
NEW_DAILY_DATA_FILE = "data/last_7_days_daily_data.csv"
# Timezone applied to every 'timestamp' column: naive values are localized to
# it and aware values are converted to it, so both datasets compare equal.
TIMEZONE = 'Asia/Karachi' # Define timezone as a constant for consistency

def _load_standardized_csv(csv_path):
    """Read *csv_path* and return it with 'timestamp' as tz-aware TIMEZONE values."""
    frame = pd.read_csv(csv_path, parse_dates=['timestamp'])
    stamps = frame['timestamp']
    if stamps.dt.tz is None:
        # Naive timestamps are interpreted as local TIMEZONE wall-clock time.
        frame['timestamp'] = stamps.dt.tz_localize(TIMEZONE)
    else:
        # Aware timestamps keep their instant and are only re-expressed in TIMEZONE.
        frame['timestamp'] = stamps.dt.tz_convert(TIMEZONE)
    return frame


def append_and_clean_historical_data(main_file, new_data_file):
    """
    Merge new daily data into the main historical dataset and save it in place.

    Both CSV files must contain a 'timestamp' column. Timestamps are
    standardized to TIMEZONE (naive values are localized, aware values are
    converted), the two datasets are concatenated, and rows sharing a
    timestamp are collapsed to a single row. On a clash the row from
    `new_data_file` wins, so re-running with the same inputs leaves the
    result unchanged.

    Args:
        main_file: Path of the historical CSV. It is read and then
            overwritten with the merged, timestamp-sorted result.
        new_data_file: Path of the CSV holding the new rows. It is only read.

    Returns:
        None. If `new_data_file` does not exist, an error is printed and the
        function returns without modifying `main_file`.

    Raises:
        SystemExit: With exit code 1 if `main_file` does not exist.
    """
    print("--- Starting historical data update process ---")

    # --- Step 1: Load the main historical dataset ---
    try:
        df_main = _load_standardized_csv(main_file)
    except FileNotFoundError:
        # Report the file that is actually missing (the main dataset).
        print(f"!!! ERROR: Main historical file '{main_file}' not found. Aborting.")
        sys.exit(1)
    print(f"Loaded and standardized {len(df_main)} records from the main historical file.")

    # --- Step 2: Load the new daily data ---
    try:
        df_new = _load_standardized_csv(new_data_file)
    except FileNotFoundError:
        print(f"!!! ERROR: New daily data file '{new_data_file}' not found. Aborting.")
        return
    print(f"Loaded and standardized {len(df_new)} new daily records to be merged.")

    # --- Step 3: Combine and de-duplicate ---
    print("Combining datasets and removing duplicates...")

    # New rows are appended after the historical ones, so for equal
    # timestamps the new row comes later in the combined frame.
    df_combined = pd.concat([df_main, df_new], ignore_index=True)

    # A stable sort is required: the default quicksort may reorder rows with
    # equal timestamps, in which case keep='last' could retain the stale row.
    df_final = (
        df_combined
        .sort_values('timestamp', kind='mergesort')
        .drop_duplicates(subset=['timestamp'], keep='last')
    )

    print(f"-> Combined records: {len(df_combined)}")
    print(f"-> Records after de-duplication: {len(df_final)}")

    # --- Step 4: Save the updated historical dataset ---
    df_final.to_csv(main_file, index=False)

    print("\n✅ --- SUCCESS --- ✅")
    print(f"Main historical dataset '{main_file}' has been updated.")
    print(f"It now contains {len(df_final)} clean, unique daily records.")


# --- Script entry point: merge the configured new-data file into the history ---
if __name__ == "__main__":
    append_and_clean_historical_data(
        main_file=MAIN_HISTORICAL_FILE,
        new_data_file=NEW_DAILY_DATA_FILE,
    )