library(arrow) library(dplyr) # Load data tavg_dataset <- open_dataset("www/data/tabs/tavg_long.parquet") prec_dataset <- open_dataset("www/data/tabs/prec_long.parquet") prec_meta <- read.csv("www/data/tabs/prec_meta.csv") %>% rename( ID = GHCN_ID, LATITUDE = Latitude, LONGITUDE = Longitude, STNELEV = Elevation, NAME = Station_Name ) prec_avail <- read.csv("www/data/tabs/prec_availability.csv") prec_stations_data <- merge(prec_meta, prec_avail, by = "ID") print(paste("Prec Meta Rows:", nrow(prec_meta))) print(paste("Prec Avail Rows:", nrow(prec_avail))) print(paste("Prec Stations Data Rows:", nrow(prec_stations_data))) # Simulate Inputs year_range <- c(2000, 2010) month_number <- 1 # January # Test Filter Parquet print("Filtering Parquet...") filtered_data <- prec_dataset %>% filter( VALUE >= -90, YEAR >= year_range[1], YEAR <= year_range[2], MONTH == month_number ) %>% group_by(ID) %>% summarize(mean_value = mean(VALUE, na.rm = TRUE)) %>% collect() print(paste("Filtered Data Rows:", nrow(filtered_data))) if (nrow(filtered_data) > 0) { print(head(filtered_data)) } # Test Filter Stations print("Filtering Stations...") stations_result <- prec_stations_data %>% filter( first_year <= year_range[1], last_year >= year_range[2], ID %in% filtered_data$ID ) %>% left_join(filtered_data, by = "ID") print(paste("Stations Result Rows:", nrow(stations_result))) if (nrow(stations_result) > 0) { print(head(stations_result)) } else { print("No stations found after filtering.") print("Check if filtered_data IDs match prec_stations_data IDs.") # Check intersection common_ids <- intersect(filtered_data$ID, prec_stations_data$ID) print(paste("Common IDs:", length(common_ids))) if (length(common_ids) > 0) { print("There are common IDs, so the issue might be first_year/last_year filter.") print(head(prec_stations_data[prec_stations_data$ID %in% common_ids, ])) } }