File size: 2,072 Bytes
f34a7ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
library(arrow)
library(dplyr)

# Load data ----
# Open the long-format Parquet datasets. Only prec_dataset is queried in the
# visible part of this script; tavg_dataset is opened but not referenced again.
tavg_dataset <- open_dataset("www/data/tabs/tavg_long.parquet")
prec_dataset <- open_dataset("www/data/tabs/prec_long.parquet")

# Precipitation station metadata, with the CSV's column names mapped onto the
# ID / LATITUDE / LONGITUDE / STNELEV / NAME names used further down.
prec_meta <- rename(
    read.csv("www/data/tabs/prec_meta.csv"),
    ID = GHCN_ID,
    LATITUDE = Latitude,
    LONGITUDE = Longitude,
    STNELEV = Elevation,
    NAME = Station_Name
)

# Availability table; the station filter later in this script reads
# first_year / last_year from the merged result.
prec_avail <- read.csv("www/data/tabs/prec_availability.csv")

# merge() keeps only the IDs present in both tables.
prec_stations_data <- merge(prec_meta, prec_avail, by = "ID")

# Report the row count of each table so rows lost in the merge are visible.
loaded_tables <- list(
    "Prec Meta Rows:" = prec_meta,
    "Prec Avail Rows:" = prec_avail,
    "Prec Stations Data Rows:" = prec_stations_data
)
for (row_label in names(loaded_tables)) {
    print(paste(row_label, nrow(loaded_tables[[row_label]])))
}

# Simulate Inputs ----
# Hard-coded stand-ins for the inputs under test: a year window (both ends
# included by the comparisons below) and a month number.
year_range <- c(2000, 2010)
month_number <- 1 # January

# Test Filter Parquet ----
print("Filtering Parquet...")

# Restrict the precipitation dataset to the chosen month and year window.
# VALUE >= -90 presumably discards missing-value sentinels -- TODO confirm
# against the data specification.
prec_in_window <- prec_dataset %>%
    filter(
        MONTH == month_number,
        YEAR >= year_range[1],
        YEAR <= year_range[2],
        VALUE >= -90
    )

# Mean VALUE per ID, then collect() the aggregated result into R.
filtered_data <- prec_in_window %>%
    group_by(ID) %>%
    summarize(mean_value = mean(VALUE, na.rm = TRUE)) %>%
    collect()

print(paste("Filtered Data Rows:", nrow(filtered_data)))
if (nrow(filtered_data) != 0) {
    # Preview the first rows only when there is something to show.
    print(head(filtered_data))
}

# Test Filter Stations ----
print("Filtering Stations...")

# Stations whose availability spans the whole year window and that have an
# aggregated value in filtered_data.
eligible_stations <- prec_stations_data %>%
    filter(
        ID %in% filtered_data$ID,
        first_year <= year_range[1],
        last_year >= year_range[2]
    )

# Attach each station's mean_value.
stations_result <- left_join(eligible_stations, filtered_data, by = "ID")

print(paste("Stations Result Rows:", nrow(stations_result)))
if (nrow(stations_result) == 0) {
    print("No stations found after filtering.")
    print("Check if filtered_data IDs match prec_stations_data IDs.")

    # How many IDs appear in both the aggregated data and the station table?
    common_ids <- intersect(filtered_data$ID, prec_stations_data$ID)
    print(paste("Common IDs:", length(common_ids)))

    if (length(common_ids) > 0) {
        # IDs do overlap, so the year-coverage conditions must have removed
        # every station; show a few of the overlapping stations for inspection.
        print("There are common IDs, so the issue might be first_year/last_year filter.")
        in_common <- prec_stations_data$ID %in% common_ids
        print(head(prec_stations_data[in_common, ]))
    }
} else {
    print(head(stations_result))
}