# File: jma/funs/finalize_metadata.R
# Repository-viewer residue kept for reference: "alexdum's picture",
# "first commit", "57537fb"
library(dplyr)
library(readr)
library(jsonlite)
library(stringr)
# 1. Download the AMeDAS station table (JSON) from the JMA site.
#    It is the source of the English station names used further down.
amedas_raw <- fromJSON(
  txt = "https://www.jma.go.jp/bosai/amedas/const/amedastable.json"
)
# Convert a degrees + minutes pair to decimal degrees (vectorised).
#
# deg:     whole degrees; a negative value makes the minutes count further
#          away from zero (deg - minutes / 60).
# minutes: minutes, added as minutes / 60.
# Returns NA_real_ wherever either input is NA.
to_decimal_degrees <- function(deg, minutes) {
  missing_part <- is.na(deg) | is.na(minutes)
  fraction <- minutes / 60
  signed_value <- ifelse(deg < 0, deg - fraction, deg + fraction)
  ifelse(missing_part, NA_real_, signed_value)
}
# Turn one coordinate entry from the AMeDAS table into decimal degrees.
#
# coord: a vector whose first element is read as degrees and whose second
#        element is read as minutes.
# Returns NA_real_ when coord is NULL or has fewer than two elements.
parse_amedas_coord <- function(coord) {
  has_both_parts <- !is.null(coord) && length(coord) >= 2
  if (!has_both_parts) {
    return(NA_real_)
  }
  degrees <- as.numeric(coord[1])
  arc_minutes <- as.numeric(coord[2])
  to_decimal_degrees(degrees, arc_minutes)
}
# Normalise a station name so it can be used as a join key.
#
# Removes every parenthesised annotation (shortest match) and then squishes
# whitespace. Both ASCII parentheses and the full-width forms U+FF08 / U+FF09
# are recognised: the previous character classes listed the ASCII parenthesis
# twice, so annotations written with full-width parentheses were left in place.
#
# name: character vector of station names.
# Returns a character vector of the same length.
normalize_name <- function(name) {
  name %>%
    str_replace_all("[\\(\uFF08].*?[\\)\uFF09]", "") %>%
    str_squish()
}
# Round a coordinate to 4 decimal places so that coordinates from different
# sources can be compared as join keys.
coord_round <- function(x) {
  round(x, digits = 4)
}
# Flatten the AMeDAS table (a list keyed by station ID) into one row per
# station: ID, Japanese name, English name and decimal-degree coordinates.
#
# Built with base lapply() + dplyr::bind_rows() rather than purrr::imap_dfr():
# purrr is not attached by this script, so imap_dfr() was not in scope.
#
# Name_JP falls back to the station ID when kjName is missing or empty;
# Name_EN becomes NA when enName is missing or empty.
amedas_entries <- bind_rows(lapply(names(amedas_raw), function(station_id) {
  entry <- amedas_raw[[station_id]]
  kanji_name <- entry[["kjName"]]
  english_name <- entry[["enName"]]

  name_jp <- if (!is.null(kanji_name) && nzchar(kanji_name)) {
    kanji_name
  } else {
    station_id
  }
  name_en <- if (!is.null(english_name) && nzchar(english_name)) {
    english_name
  } else {
    NA_character_
  }

  tibble(
    ID = as.character(station_id),
    Name_JP = name_jp,
    Name_EN = name_en,
    Lat = parse_amedas_coord(entry[["lat"]]),
    Lon = parse_amedas_coord(entry[["lon"]])
  )
}))
# 2. Read the full station list previously collected from the map (the
#    original note gives 1,445 stations as the expected count).
#    The current file is used as-is; duplicates are removed in step 4,
#    keyed on the Japanese name.
#    ID is pinned to character: if every ID in the file is digit-only, type
#    guessing would yield a numeric column, and the join on ID against the
#    character IDs of the AMeDAS table in step 5 would fail.
stations <- read_csv(
  "data/jma_stations_full.csv",
  col_types = cols(ID = col_character()),
  show_col_types = FALSE
)
# 3. Fill in missing prec_no values.
#    Rows whose prec_no is NA borrow the prec_no of another row with the same
#    Name_JP. The lookup keeps exactly one prec_no per name (the first one
#    encountered) so that the join cannot multiply station rows when a name
#    appears with several prec_no values.
mappings <- stations %>%
  filter(!is.na(prec_no)) %>%
  select(Name_JP, prec_no) %>%
  distinct(Name_JP, .keep_all = TRUE)

stations_fixed <- stations %>%
  left_join(mappings, by = "Name_JP") %>%
  # prec_no.x is the station's own value, prec_no.y the one looked up by name.
  mutate(prec_no = coalesce(prec_no.x, prec_no.y)) %>%
  select(-prec_no.x, -prec_no.y)
# 4. Keep one row per Japanese station name (the original note expects this
#    to leave exactly 1,445 stations).
#    Rows are ordered by descending Type within each name first, so the row
#    that survives is the one whose Type sorts last; the intent stated in the
#    original note is to prefer S1-type metadata when both kinds exist.
stations_final <- stations_fixed %>%
  arrange(Name_JP, desc(Type)) %>%
  filter(!duplicated(Name_JP))
# 5. Ensure Name_EN is populated.
#    First build a lookup from the AMeDAS entries keyed on the normalised
#    Japanese name plus coordinates rounded by coord_round(); one row is kept
#    per key so a join on it returns at most one match.
amedas_match <- amedas_entries %>%
  mutate(
    Name_JP_norm = normalize_name(Name_JP),
    across(c(Lat, Lon), coord_round, .names = "{.col}_round")
  ) %>%
  distinct(Name_JP_norm, Lat_round, Lon_round, .keep_all = TRUE)
# Attach English names from two sources and keep the first non-missing value,
# in this order: the station's existing Name_EN, the AMeDAS entry with the
# same ID, then the AMeDAS entry with the same normalised name and rounded
# coordinates. The helper columns are dropped again at the end.
stations_final <- stations_final %>%
  left_join(
    select(amedas_entries, ID, Name_EN_json = Name_EN),
    by = "ID"
  ) %>%
  mutate(
    Name_JP_norm = normalize_name(Name_JP),
    across(c(Lat, Lon), coord_round, .names = "{.col}_round")
  ) %>%
  left_join(
    select(
      amedas_match,
      Name_JP_norm, Lat_round, Lon_round,
      Name_EN_match = Name_EN
    ),
    by = c("Name_JP_norm", "Lat_round", "Lon_round")
  ) %>%
  mutate(Name_EN = coalesce(Name_EN, Name_EN_json, Name_EN_match)) %>%
  select(-c(Name_EN_json, Name_EN_match, Name_JP_norm, Lat_round, Lon_round))
# 6. Save the result. Note that this is the same path the station list was
#    read from in step 2, so the input file is overwritten.
stations_final %>%
  write_csv("data/jma_stations_full.csv")
message("Final station count: ", nrow(stations_final))