library(rvest)
library(dplyr)
library(stringr)
library(readr)
library(httr)

# Private helpers ----

# Characters stripped from every cell before conversion: "]", ")", a
# backslash and the no-break space (U+00A0).
.jma_flag_chars <- "[\\]\\)\\\\\\u00A0]"

# Cell values that are converted to NA.
.jma_missing_tokens <- c("///", "--", "", "×")

# Compass-point label (Japanese or English) -> bearing in degrees.
# North is encoded as 360; calm ("静穏" / "Calm") has no bearing.
.jma_wind_dir_degrees <- c(
  "北" = 360, "N" = 360,
  "北北東" = 22.5, "NNE" = 22.5,
  "北東" = 45, "NE" = 45,
  "東北東" = 67.5, "ENE" = 67.5,
  "東" = 90, "E" = 90,
  "東南東" = 112.5, "ESE" = 112.5,
  "南東" = 135, "SE" = 135,
  "南南東" = 157.5, "SSE" = 157.5,
  "南" = 180, "S" = 180,
  "南南西" = 202.5, "SSW" = 202.5,
  "南西" = 225, "SW" = 225,
  "西南西" = 247.5, "WSW" = 247.5,
  "西" = 270, "W" = 270,
  "西北西" = 292.5, "WNW" = 292.5,
  "北西" = 315, "NW" = 315,
  "北北西" = 337.5, "NNW" = 337.5,
  "静穏" = NA, "Calm" = NA
)

# Build the request URL for one page. Returns NULL (after a warning) when a
# resolution other than Daily/Monthly is requested without a day.
.jma_build_url <- function(block_no, year, month, day, prec_no, type,
                           resolution) {
  res_code <- switch(resolution,
    "Hourly" = "hourly",
    "10 Minutes" = ,
    "10-Minute" = "10min",
    "Monthly" = "monthly",
    "daily"
  )

  day_part <- ""
  if (!resolution %in% c("Daily", "Monthly")) {
    if (is.null(day)) {
      warning("Hourly/10-Minute resolution requires a day parameter")
      return(NULL)
    }
    day_part <- sprintf("%d", day)
  }

  url_format <- paste0(
    "https://www.data.jma.go.jp/obd/stats/etrn/view/",
    "%s_%s.php?prec_no=%s&block_no=%s&year=%d&month=%d&day=%s&view="
  )
  sprintf(url_format, res_code, type, prec_no, block_no, year, month, day_part)
}

# Download and parse one page with a 10 second timeout. Returns the parsed
# document, or NULL (after a warning) on a non-200 status or a request error.
.jma_fetch_page <- function(url) {
  tryCatch(
    {
      resp <- httr::GET(url, httr::timeout(10))
      status <- httr::status_code(resp)
      if (status != 200) {
        warning(
          sprintf("Failed to download data: HTTP %s", status),
          call. = FALSE
        )
        NULL
      } else {
        read_html(resp)
      }
    },
    error = function(e) {
      warning(
        "Failed to download data: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

# Return the first table on the page with more than 3 columns and more than
# 10 (Monthly) or 20 (other resolutions) rows, or NULL when none qualifies.
.jma_find_weather_table <- function(page, resolution) {
  tables <- html_table(page, fill = TRUE, header = FALSE)
  min_rows <- if (resolution == "Monthly") 10 else 20
  for (tbl in tables) {
    if (nrow(tbl) > min_rows && ncol(tbl) > 3) {
      return(tbl)
    }
  }
  NULL
}

# Decide which table columns to keep and what to call them.
#
# `header_row` / `header_row_2` are the first two table rows as character
# vectors (main headers and sub-headers). Returns a named integer vector:
# names are the output column names, values the column positions. A position
# claimed by several names keeps the first name only. Length 0 when nothing
# matched.
.jma_select_columns <- function(header_row, header_row_2, resolution) {
  # n-th header matching `pattern`, or NULL when there are fewer than n.
  nth_match <- function(pattern, n = 1L, headers = header_row) {
    hits <- grep(pattern, headers)
    if (length(hits) >= n) hits[n] else NULL
  }
  # Last main header matching `pattern`, or NULL.
  last_match <- function(pattern) {
    hits <- grep(pattern, header_row)
    if (length(hits) > 0) hits[length(hits)] else NULL
  }
  # First column whose main header matches `main` AND sub-header matches `sub`.
  find_col <- function(main, sub) {
    hits <- grep(main, header_row)
    hits <- hits[grepl(sub, header_row_2[hits])]
    if (length(hits) > 0) hits[1] else NULL
  }
  # find_col(), falling back to the n-th main-header match (positional guess).
  find_or_nth <- function(main, sub, n = 1L) {
    idx <- find_col(main, sub)
    if (is.null(idx)) nth_match(main, n) else idx
  }

  # 1. Time key column
  time_col <- switch(resolution,
    "Monthly" = list(Month = nth_match("月")),
    "Daily" = list(Day = nth_match("^日")),
    "Hourly" = list(Hour = nth_match("^時")),
    list(Time = nth_match("^時"))
  )

  # 2. Core measurements and snow, which differ per resolution
  if (resolution == "Monthly") {
    measures <- list(
      Pressure = find_or_nth("気圧", "現地|平均", 1L),
      Pressure_Sea_Level = find_or_nth("気圧", "海面", 2L),
      Precipitation = find_or_nth("降水量", "合計", 1L),
      Temp_Mean = find_or_nth("気温", "平均", 1L),
      Temp_Max = find_col("気温", "最高"),
      Temp_Min = find_col("気温", "最低"),
      Humidity = find_or_nth("湿度", "平均", 1L),
      Humidity_Min = find_col("湿度", "最小"),
      Wind_Speed = find_or_nth("風速", "平均風速|平均", 1L),
      Wind_Speed_Max = find_col("風速", "最大風速")
    )
    snow <- list(
      Snowfall = find_col("雪", "降雪"),
      Snow_Depth = find_col("雪", "最深積雪"),
      # The day-count columns are looked up in the sub-header row only.
      Snow_Days = nth_match("雪日数", headers = header_row_2),
      Fog_Days = nth_match("霧日数", headers = header_row_2),
      Thunder_Days = nth_match("雷日数", headers = header_row_2)
    )
  } else if (resolution == "Daily") {
    # Positional fallbacks assume the column order
    # pressure: station, sea level; precipitation: total, max 1h, max 10min;
    # temperature: mean, max, min; humidity: mean, min;
    # wind: mean, max, (max direction), gust.
    measures <- list(
      Pressure = find_or_nth("気圧", "現地", 1L),
      Pressure_Sea_Level = find_or_nth("気圧", "海面", 2L),
      Precipitation = find_or_nth("降水量", "合計", 1L),
      Precipitation_Max_1h = find_or_nth("降水量", "最大1時間", 2L),
      Precipitation_Max_10min = find_or_nth("降水量", "最大10分", 3L),
      Temp_Mean = find_or_nth("気温", "平均", 1L),
      Temp_Max = find_or_nth("気温", "最高", 2L),
      Temp_Min = find_or_nth("気温", "最低", 3L),
      Humidity = find_or_nth("湿度", "平均", 1L),
      Humidity_Min = find_or_nth("湿度", "最小", 2L),
      Wind_Speed = find_or_nth("風速", "平均風速", 1L),
      Wind_Max_Speed = find_or_nth("風速", "最大風速", 2L),
      Wind_Gust_Speed = find_or_nth("風速", "最大瞬間", 4L)
    )
    snow <- list(
      Snowfall = find_or_nth("雪|降雪", "降雪", 1L),
      Snow_Depth = find_or_nth("雪|積雪", "積雪", 1L)
    )
  } else {
    # Hourly / 10-minute
    wind_dir <- find_col("風向", "風向")
    if (is.null(wind_dir)) {
      # Ambiguous when several headers mention direction; the last is taken.
      wind_dir <- last_match("風向")
    }
    measures <- list(
      Pressure = find_or_nth("気圧", "現地", 1L),
      Pressure_Sea_Level = find_or_nth("気圧", "海面", 2L),
      Precipitation = nth_match("降水量"),
      Temperature = nth_match("気温"),
      Humidity = nth_match("湿度"),
      Wind_Speed = find_or_nth("風速", "風速", 1L),
      Wind_Direction = wind_dir
    )
    snow <- list(
      Snowfall = find_or_nth("雪|降雪", "降雪", 1L),
      Snow_Depth = find_or_nth("雪|積雪", "積雪", 1L)
    )
  }

  # In the non-Monthly branches the snow fallback searches the main header
  # for the specific term only, not the combined pattern.
  if (resolution != "Monthly") {
    if (is.null(find_col("雪|降雪", "降雪"))) {
      snow$Snowfall <- NULL
      snow <- c(list(Snowfall = nth_match("降雪")), snow)
    }
    if (is.null(find_col("雪|積雪", "積雪"))) {
      snow$Snow_Depth <- NULL
      snow <- c(snow, list(Snow_Depth = nth_match("積雪")))
    }
  }

  # 3. Sunshine duration; the name carries the unit implied by resolution
  sunshine_name <- if (resolution %in% c("10 Minutes", "10-Minute")) {
    "Sunshine_Minutes"
  } else {
    "Sunshine_Hours"
  }
  sunshine <- stats::setNames(list(nth_match("日照")), sunshine_name)

  # 4. Extra parameters, present only in some tables
  extras <- list(
    Dew_Point = nth_match("露点温度"),
    Vapor_Pressure = nth_match("蒸気圧"),
    Solar_Radiation = nth_match("全天日射"),
    Cloud_Cover = nth_match("雲量"),
    Visibility = nth_match("視程")
  )

  # unlist() drops the NULL (not found) entries and keeps the names.
  picked <- unlist(c(time_col, measures, sunshine, snow, extras))
  if (is.null(picked)) {
    return(integer(0))
  }
  picked[!duplicated(picked)]
}

# Strip flag characters, map the missing-value tokens to NA and convert to
# numeric. A vector that is already entirely NA is returned untouched.
.jma_clean_numeric <- function(x) {
  if (all(is.na(x))) {
    return(x)
  }
  x <- str_remove_all(as.character(x), .jma_flag_chars)
  x[x %in% .jma_missing_tokens] <- NA
  # Remaining non-numeric cells (e.g. sub-header text) become NA on purpose.
  suppressWarnings(as.numeric(x))
}

# Turn the raw selected columns into the final data frame for `resolution`.
# Returns NULL (after a warning) when the key column (Month / Day) needed
# for filtering was not found in the table.
.jma_tidy_table <- function(df, year, month, day, resolution) {
  if (resolution == "Monthly") {
    if (!"Month" %in% names(df)) {
      warning("Could not locate the month column in the table", call. = FALSE)
      return(NULL)
    }
    for (col in names(df)) {
      df[[col]] <- .jma_clean_numeric(df[[col]])
    }
    # Non-numeric months are sub-header rows. Only the requested month is
    # kept; the other rows of the table are discarded.
    tidy <- df %>%
      filter(!is.na(Month), Month == month) %>%
      mutate(
        Year = year,
        Date = as.Date(sprintf("%04d-%02d-01", Year, Month))
      ) %>%
      select(Year, Month, everything())
    return(tidy)
  }

  if (resolution == "Daily") {
    if (!"Day" %in% names(df)) {
      warning("Could not locate the day column in the table", call. = FALSE)
      return(NULL)
    }
    tidy <- df %>%
      mutate(
        Year = year,
        Month = month,
        across(everything(), .jma_clean_numeric)
      ) %>%
      select(Year, Month, everything()) %>%
      filter(!is.na(Day))
    return(tidy)
  }

  # Hourly / 10-minute
  df <- df %>%
    mutate(Year = year, Month = month, Day = day)

  keep_as_is <- c("Time", "Hour", "Year", "Month", "Day", "Wind_Direction")
  for (col in setdiff(names(df), keep_as_is)) {
    df[[col]] <- .jma_clean_numeric(df[[col]])
  }

  if ("Wind_Direction" %in% names(df)) {
    # Direction stays a character label; a bearing in degrees is added.
    direction <- str_remove_all(
      as.character(df$Wind_Direction), .jma_flag_chars
    )
    direction[direction %in% .jma_missing_tokens] <- NA
    df$Wind_Direction <- direction
    df$Wind_Direction_Deg <- unname(.jma_wind_dir_degrees[direction])
  }

  if ("Hour" %in% names(df)) {
    # Sub-header rows (e.g. "時") are not numeric and drop out here.
    df$Hour <- suppressWarnings(as.numeric(as.character(df$Hour)))
    df <- df %>% filter(!is.na(Hour))
  }

  if ("Time" %in% names(df)) {
    df <- df %>% filter(Time != "時分")
  }

  df %>% select(Year, Month, Day, everything())
}

# Public API ----

#' Scrape JMA Data (Daily, Hourly, 10-Minute, Monthly)
#'
#' Downloads one page from www.data.jma.go.jp for a station and extracts the
#' weather table. Columns are located by matching keywords in the first two
#' table rows (main headers and sub-headers), renamed to English, and
#' converted to numeric. `Year`, `Month` (and `Day` for sub-daily data) are
#' added. For "Monthly" only the row of the requested `month` is returned,
#' together with a `Date` column set to the first day of that month. For
#' sub-daily data `Wind_Direction` is kept as text and `Wind_Direction_Deg`
#' is added.
#'
#' @param block_no Station Block Number (ID).
#' @param year Year (numeric).
#' @param month Month (numeric).
#' @param day Day (numeric, required for Hourly/10-Minute resolutions).
#' @param prec_no Prefecture Number (ID).
#' @param type Station type ("s1" or "a1").
#' @param resolution Resolution ("Daily", "Hourly", "10 Minutes" or
#'   "10-Minute", "Monthly").
#'
#' @return A data frame containing the data, or NULL if the request failed,
#'   no suitable table was found, or no known column could be identified.
get_jma_data <- function(block_no, year, month, day = NULL, prec_no,
                         type = "s1", resolution = "Daily") {
  url <- .jma_build_url(
    block_no, year, month, day, prec_no, type, resolution
  )
  if (is.null(url)) {
    return(NULL)
  }

  message(sprintf("Downloading %s data from: %s", resolution, url))

  page <- .jma_fetch_page(url)
  if (is.null(page)) {
    return(NULL)
  }

  weather_table <- .jma_find_weather_table(page, resolution)
  if (is.null(weather_table)) {
    return(NULL)
  }

  # Row 1 holds the main headers, row 2 the sub-headers. Row 2 stays in the
  # data and is removed later because its key cell is not numeric.
  header_row <- as.character(weather_table[1, ])
  header_row_2 <- as.character(weather_table[2, ])

  columns <- .jma_select_columns(header_row, header_row_2, resolution)
  if (length(columns) == 0) {
    return(NULL)
  }

  df <- weather_table[-1, unname(columns), drop = FALSE]
  colnames(df) <- names(columns)

  .jma_tidy_table(df, year, month, day, resolution)
}

#' Scrape JMA data for a Date Range
#'
#' Calls [get_jma_data()] once per month (Daily/Monthly) or once per day
#' (Hourly/10-Minute) between `start_date` and `end_date`, pausing 0.1 s
#' between requests, and row-binds whatever was retrieved. Daily rows
#' outside the range are dropped. Monthly rows are kept for every month the
#' range touches, including a partially covered first or last month.
#'
#' @param block_no Station ID
#' @param start_date Date object (anything `as.Date()` accepts)
#' @param end_date Date object (anything `as.Date()` accepts)
#' @param prec_no Prefecture ID
#' @param type Station Type (s1 or a1)
#' @param resolution Resolution string ("Daily", "Hourly", "10 Minutes",
#'   "Monthly")
#'
#' @return A data frame with all retrieved rows, or NULL when nothing could
#'   be retrieved.
get_jma_range_data <- function(block_no, start_date, end_date, prec_no,
                               type = "s1", resolution = "Daily") {
  start_date <- as.Date(start_date)
  end_date <- as.Date(end_date)
  results <- list()

  if (resolution %in% c("Daily", "Monthly")) {
    first_month <- as.Date(format(start_date, "%Y-%m-01"))
    last_month <- as.Date(format(end_date, "%Y-%m-01"))
    month_starts <- seq(first_month, last_month, by = "month")

    # Index-based loop: iterating a Date vector directly strips its class.
    for (i in seq_along(month_starts)) {
      yr <- as.numeric(format(month_starts[i], "%Y"))
      mo <- as.numeric(format(month_starts[i], "%m"))

      df <- get_jma_data(
        block_no, yr, mo,
        day = NULL, prec_no = prec_no, type = type, resolution = resolution
      )

      if (!is.null(df)) {
        if ("Date" %in% names(df)) {
          # Monthly rows are dated the 1st, so compare against the first day
          # of the start month; otherwise a mid-month start_date would
          # discard the month that was just downloaded.
          df <- df %>%
            filter(Date >= first_month & Date <= end_date) %>%
            select(-Date)
        } else if ("Day" %in% names(df)) {
          df <- df %>%
            mutate(
              Date = as.Date(sprintf("%04d-%02d-%02d", Year, Month, Day))
            ) %>%
            filter(Date >= start_date & Date <= end_date) %>%
            select(-Date)
        }
        results[[paste(yr, mo, sep = "_")]] <- df
      }

      Sys.sleep(0.1)
    }
  } else {
    all_days <- seq(start_date, end_date, by = "day")

    for (i in seq_along(all_days)) {
      yr <- as.numeric(format(all_days[i], "%Y"))
      mo <- as.numeric(format(all_days[i], "%m"))
      dy <- as.numeric(format(all_days[i], "%d"))

      df <- get_jma_data(
        block_no, yr, mo,
        day = dy, prec_no = prec_no, type = type, resolution = resolution
      )

      if (!is.null(df)) {
        results[[paste(yr, mo, dy, sep = "_")]] <- df
      }

      Sys.sleep(0.1)
    }
  }

  if (length(results) == 0) {
    return(NULL)
  }

  bind_rows(results)
}