# AQI_Predictor_Qamar / append_and_clean_historical_data.py
# github-actions[bot] — Automated backend deployment for 2026-04-05 (commit 23bb02f)
import pandas as pd
import sys
# --- Configuration ---
# Rolling multi-year history CSV that this script reads and overwrites in place.
MAIN_HISTORICAL_FILE = "data/karachi_daily_data_5_years.csv"
# Latest daily records to be merged into the history.
# NOTE(review): paths are relative — presumably run from the repo root; verify in CI.
NEW_DAILY_DATA_FILE = "data/last_7_days_daily_data.csv"
TIMEZONE = 'Asia/Karachi' # Define timezone as a constant for consistency
def _standardize_timezone(timestamps, tz):
    """Return the given datetime Series as tz-aware in *tz*.

    Naive timestamps are localized (stamped with *tz*); already-aware
    timestamps are converted. Both input frames must pass through this
    before merging, because pandas raises when naive and aware datetimes
    are mixed in concat/sort operations.
    """
    if timestamps.dt.tz is None:
        # Naive input: assume it already represents local time in *tz*.
        return timestamps.dt.tz_localize(tz)
    # Aware input: convert to the canonical timezone.
    return timestamps.dt.tz_convert(tz)


def append_and_clean_historical_data(main_file, new_data_file, tz='Asia/Karachi'):
    """
    Efficiently append new daily data to the main historical dataset.

    Idempotent: loads both files, standardizes their 'timestamp' columns
    to one timezone, concatenates, resolves overlapping timestamps by
    keeping the newest record, and writes the clean, sorted history back
    to ``main_file``.

    Parameters
    ----------
    main_file : str
        Path to the main historical CSV; read and overwritten in place.
    new_data_file : str
        Path to the CSV holding the latest daily records to merge in.
    tz : str, optional
        Target timezone for all timestamps. The default matches the
        module-level ``TIMEZONE`` constant ('Asia/Karachi').

    Side effects: prints progress; exits the process with status 1 when
    ``main_file`` is missing; returns silently when ``new_data_file`` is.
    """
    print("--- Starting historical data update process ---")

    # --- Step 1: Load the main historical dataset ---
    try:
        df_main = pd.read_csv(main_file, parse_dates=['timestamp'])
        df_main['timestamp'] = _standardize_timezone(df_main['timestamp'], tz)
        print(f"Loaded and standardized {len(df_main)} records from the main historical file.")
    except FileNotFoundError:
        # BUG FIX: the original message referenced new_data_file here,
        # mislabeling which file is missing in the fatal-error path.
        print(f"!!! ERROR: Main historical file '{main_file}' not found. Aborting.")
        sys.exit(1)

    # --- Step 2: Load the new daily data ---
    try:
        df_new = pd.read_csv(new_data_file, parse_dates=['timestamp'])
        df_new['timestamp'] = _standardize_timezone(df_new['timestamp'], tz)
        print(f"Loaded and standardized {len(df_new)} new daily records to be merged.")
    except FileNotFoundError:
        # Non-fatal: without new data there is simply nothing to merge.
        print(f"!!! ERROR: New daily data file '{new_data_file}' not found. Aborting.")
        return

    # --- Step 3: Combine and de-duplicate ---
    print("Combining datasets and removing duplicates...")
    # Both frames are tz-aware now, so concat/sort cannot fail on mixed tz.
    df_combined = pd.concat([df_main, df_new], ignore_index=True)
    # Stable sort + keep='last' guarantees records from df_new win on overlap.
    df_final = df_combined.sort_values('timestamp').drop_duplicates(subset=['timestamp'], keep='last')
    print(f"-> Combined records: {len(df_combined)}")
    print(f"-> Records after de-duplication: {len(df_final)}")

    # --- Step 4: Save the updated historical dataset ---
    df_final.to_csv(main_file, index=False)
    print("\n✅ --- SUCCESS --- ✅")
    print(f"Main historical dataset '{main_file}' has been updated.")
    print(f"It now contains {len(df_final)} clean, unique daily records.")
# --- Entry point: merge the configured daily file into the history ---
if __name__ == "__main__":
    append_and_clean_historical_data(MAIN_HISTORICAL_FILE, NEW_DAILY_DATA_FILE)