# meteogate / explore_mapping.R
# Explore mapping between Meteo-France official station metadata and Synop stations.
# Provenance: alexdum, "Sync current state to Hugging Face", commit 0ac7b9d
library(jsonlite)
library(dplyr)
library(readr)
library(curl)
# 1. Get Official Station List Resources ----
# Query the data.gouv.fr catalog API for the Meteo-France station metadata
# dataset, then locate a CSV resource holding the station list.
dataset_slug <- "informations-sur-les-stations-meteo-france-metadonnees"
api_url <- paste0("https://www.data.gouv.fr/api/1/datasets/", dataset_slug, "/")
message("Fetching metadata dataset info...")
meta <- fromJSON(api_url)
# `resources` is a data frame: one row per downloadable resource.
resources <- meta$resources
message("All Resource Titles:")
print(resources$title)
# Regex to find the metadata file: titles mentioning "liste ... stations",
# "postes", or "metadonnees".
meta_idx <- grep("liste.*stations|postes|metadonnees", resources$title, ignore.case = TRUE)
# Uploaders report the format as "csv" or "CSV"; compare case-insensitively
# so valid resources are not silently skipped.
csv_idx <- which(tolower(resources$format) == "csv")
# Keep only resources matching both the title pattern and the CSV format.
target_indices <- intersect(meta_idx, csv_idx)
if (length(target_indices) == 0) {
  message("No obvious metadata CSV found. Checking ANY CSV with 'stations'...")
  target_indices <- intersect(
    grep("stations", resources$title, ignore.case = TRUE),
    csv_idx
  )
}
# Holds the official station metadata once a suitable resource is found;
# stays NULL otherwise so later chunks can test for it.
official_stations <- NULL
if (length(target_indices) > 0) {
  # Take the first matching resource (one row of the API resources frame).
  target_res <- resources[target_indices[1], ]
  message("Selected Resource: ", target_res$title)
  message("URL: ", target_res$url)
  # The official metadata file is comma-separated.
  official_stations <- read_csv(target_res$url, show_col_types = FALSE)
  message("Official Stations Columns:")
  print(colnames(official_stations))
  # Count stations flagged as reporting hourly data.
  hourly_stations <- official_stations %>% filter(is_hourly == TRUE)
  message("Total Stations: ", nrow(official_stations))
  message("Hourly Stations: ", nrow(hourly_stations))
  message("Open Hourly Stations: ", nrow(hourly_stations %>% filter(is_open == TRUE)))
  # 2. Get Synop List ----
  # Read every column as character so station IDs keep leading zeros.
  synop_url <- "https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/postesSynop.csv"
  synop_stations <- read_delim(synop_url,
    delim = ";", show_col_types = FALSE,
    col_types = cols(ID = col_character(), .default = col_character())
  )
  # 3. Join Attempt ----
  message("Attempting Fuzzy Join on Location...")
  # Guard every official-station column the pipeline below relies on,
  # not just lat/lon, so a schema change fails up front with a clear message.
  needed_cols <- c("lat", "lon", "id", "name", "department_id", "is_hourly")
  if (!all(needed_cols %in% names(official_stations))) {
    message(
      "Expected columns not found: ",
      paste(setdiff(needed_cols, names(official_stations)), collapse = ", ")
    )
  } else {
    # Round coordinates to 3 decimals (~100 m) so small precision
    # differences between the two sources still match.
    synop_join <- synop_stations %>%
      mutate(
        lat_round = round(as.numeric(Latitude), 3),
        lon_round = round(as.numeric(Longitude), 3)
      )
    official_join <- official_stations %>%
      mutate(
        lat_round = round(as.numeric(lat), 3),
        lon_round = round(as.numeric(lon), 3)
      ) %>%
      filter(is_hourly == TRUE) # Only keep stations with hourly data
    # Duplicate rounded coordinates can fan out rows here; the distinct()
    # below keeps one mapping row per Synop ID.
    joined <- inner_join(synop_join, official_join, by = c("lat_round", "lon_round"))
    message("Matches found: ", nrow(joined))
    if (nrow(joined) == 0) {
      message("No coordinate matches found; mapping file not written.")
    } else {
      # Mapping columns: Synop ID, climate-data ID, both station names,
      # and the department identifier.
      mapping <- joined %>%
        select(
          id_synop = ID,
          id_clim = id,
          name_synop = Nom,
          name_clim = name,
          department = department_id
        ) %>%
        distinct(id_synop, .keep_all = TRUE)
      print(head(mapping))
      # Persist the mapping for downstream scripts.
      write_delim(mapping, "station_mapping_auto.csv", delim = ";")
      message("Mapping saved to station_mapping_auto.csv")
    }
  }
} else {
  message("No suitable metadata resource found.")
}