OwenStOnge committed on
Commit
9881fae
·
verified ·
1 Parent(s): cef55c1

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +75 -66
app.R CHANGED
@@ -2044,95 +2044,104 @@ observeEvent(input$upload_hf_btn, {
2044
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
  repo_id <- "CoastalBaseball/2026MasterDataset"
 
2047
 
2048
- upload_to_hf <- function(new_data, filename, label) {
2049
- scrape_status_msg(paste0("Downloading existing ", label, "..."))
2050
 
2051
- existing <- tryCatch({
 
 
2052
  resp <- httr::GET(
2053
- paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", filename),
2054
- httr::add_headers(Authorization = paste("Bearer", hf_token))
 
2055
  )
2056
  if (httr::status_code(resp) == 200) {
2057
- tmp_dl <- tempfile(fileext = ".parquet")
2058
- writeBin(httr::content(resp, as = "raw"), tmp_dl)
2059
- d <- arrow::read_parquet(tmp_dl)
2060
- file.remove(tmp_dl)
2061
- d
2062
- } else { NULL }
2063
- }, error = function(e) { NULL })
2064
-
2065
- existing_rows <- if (!is.null(existing)) nrow(existing) else 0
2066
- scraped_rows <- nrow(new_data)
2067
-
2068
- if (existing_rows > 0) {
2069
- scrape_status_msg(paste0("Merging ", label, "..."))
2070
- combined <- bind_rows(existing, new_data)
2071
- rm(existing); gc()
2072
-
2073
- if ("PitchUID" %in% names(combined)) {
2074
- combined <- combined %>% distinct(PitchUID, .keep_all = TRUE)
2075
  } else {
2076
- combined <- combined %>% distinct()
 
2077
  }
 
 
 
 
 
 
2078
  } else {
2079
- combined <- new_data
2080
- rm(existing); gc()
2081
  }
2082
 
2083
- new_rows <- nrow(combined) - existing_rows
 
 
2084
 
2085
- scrape_status_msg(paste0("Uploading ", label, " (", nrow(combined), " rows)..."))
2086
 
2087
- tmp <- tempfile(fileext = ".parquet")
2088
- arrow::write_parquet(combined, tmp)
2089
- rm(combined); gc()
2090
 
2091
- result <- tryCatch({
2092
- hf <- reticulate::import("huggingface_hub")
2093
- api <- hf$HfApi()
2094
- api$upload_file(
2095
- path_or_fileobj = tmp,
2096
- path_in_repo = filename,
2097
- repo_id = repo_id,
2098
- repo_type = "dataset",
2099
- token = hf_token
2100
- )
2101
- paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", nrow(new_data), " + ", existing_rows, " existing = deduped)")
2102
- }, error = function(e) {
2103
- paste0(label, " upload error: ", e$message)
2104
- })
2105
 
2106
- file.remove(tmp)
2107
- gc()
2108
- return(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2109
  }
2110
 
2111
- # Upload main dataset first
2112
- main_file <- paste0(input$scrape_source, "_2026_master.parquet")
2113
- msg1 <- upload_to_hf(scraped_data(), main_file, "Master Dataset")
2114
- gc()
2115
-
2116
  if (input$scrape_source == "pbp") {
2117
- # Filter THEN upload one at a time to save memory
2118
- coastal_pitchers <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
2119
 
2120
- msg2 <- if (nrow(coastal_pitchers) > 0) {
2121
- upload_to_hf(coastal_pitchers, "CoastalPitchers2026.parquet", "Coastal Pitchers")
 
2122
  } else { "Coastal Pitchers: No matching rows" }
 
2123
 
2124
- rm(coastal_pitchers); gc()
2125
-
2126
- coastal_hitters <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
2127
-
2128
- msg3 <- if (nrow(coastal_hitters) > 0) {
2129
- upload_to_hf(coastal_hitters, "CoastalHitters2026.parquet", "Coastal Hitters")
2130
  } else { "Coastal Hitters: No matching rows" }
2131
-
2132
- rm(coastal_hitters); gc()
2133
 
2134
  scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
2135
- } else {
 
 
 
 
 
 
2136
  scrape_status_msg(msg1)
2137
  }
2138
  })
 
2044
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
  repo_id <- "CoastalBaseball/2026MasterDataset"
2047
+ timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
2048
 
2049
+ upload_to_hf <- function(new_data, folder, index_file, label) {
2050
+ scrape_status_msg(paste0("Checking existing UIDs for ", label, "..."))
2051
 
2052
+ # Download tiny UID index
2053
+ existing_uids <- tryCatch({
2054
+ tmp_idx <- tempfile(fileext = ".csv.gz")
2055
  resp <- httr::GET(
2056
+ paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", index_file),
2057
+ httr::add_headers(Authorization = paste("Bearer", hf_token)),
2058
+ httr::write_disk(tmp_idx, overwrite = TRUE)
2059
  )
2060
  if (httr::status_code(resp) == 200) {
2061
+ d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
2062
+ file.remove(tmp_idx)
2063
+ d$PitchUID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2064
  } else {
2065
+ file.remove(tmp_idx)
2066
+ character(0)
2067
  }
2068
+ }, error = function(e) { character(0) })
2069
+
2070
+ # Filter to only new rows
2071
+ scraped_rows <- nrow(new_data)
2072
+ if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
2073
+ new_only <- new_data %>% filter(!PitchUID %in% existing_uids)
2074
  } else {
2075
+ new_only <- new_data
 
2076
  }
2077
 
2078
+ if (nrow(new_only) == 0) {
2079
+ return(paste0(label, ": 0 new rows (all ", scraped_rows, " already exist)"))
2080
+ }
2081
 
2082
+ scrape_status_msg(paste0("Uploading ", nrow(new_only), " new rows for ", label, "..."))
2083
 
2084
+ hf <- reticulate::import("huggingface_hub")
2085
+ api <- hf$HfApi()
 
2086
 
2087
+ # Upload new data as timestamped parquet into folder
2088
+ tmp_data <- tempfile(fileext = ".parquet")
2089
+ arrow::write_parquet(new_only, tmp_data)
 
 
 
 
 
 
 
 
 
 
 
2090
 
2091
+ api$upload_file(
2092
+ path_or_fileobj = tmp_data,
2093
+ path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
2094
+ repo_id = repo_id,
2095
+ repo_type = "dataset",
2096
+ token = hf_token
2097
+ )
2098
+ file.remove(tmp_data)
2099
+
2100
+ # Update UID index (append new UIDs)
2101
+ all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
2102
+ tmp_idx <- tempfile(fileext = ".csv.gz")
2103
+ gz <- gzfile(tmp_idx, "w")
2104
+ write.csv(all_uids, gz, row.names = FALSE)
2105
+ close(gz)
2106
+
2107
+ api$upload_file(
2108
+ path_or_fileobj = tmp_idx,
2109
+ path_in_repo = index_file,
2110
+ repo_id = repo_id,
2111
+ repo_type = "dataset",
2112
+ token = hf_token
2113
+ )
2114
+ file.remove(tmp_idx)
2115
+
2116
+ rm(new_only, all_uids); gc()
2117
+ paste0(label, ": ", scraped_rows, " scraped, ", nrow(new_only), " new rows added (", length(existing_uids) + nrow(new_only), " total)")
2118
  }
2119
 
2120
+ # Determine folder based on scrape source
 
 
 
 
2121
  if (input$scrape_source == "pbp") {
2122
+ msg1 <- upload_to_hf(scraped_data(), "pbp", "pbp_uid_index.csv.gz", "Master Dataset")
2123
+ gc()
2124
 
2125
+ cp <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
2126
+ msg2 <- if (nrow(cp) > 0) {
2127
+ upload_to_hf(cp, "coastal_pitchers", "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers")
2128
  } else { "Coastal Pitchers: No matching rows" }
2129
+ rm(cp); gc()
2130
 
2131
+ ch <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
2132
+ msg3 <- if (nrow(ch) > 0) {
2133
+ upload_to_hf(ch, "coastal_hitters", "coastal_hitters_uid_index.csv.gz", "Coastal Hitters")
 
 
 
2134
  } else { "Coastal Hitters: No matching rows" }
2135
+ rm(ch); gc()
 
2136
 
2137
  scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
2138
+
2139
+ } else if (input$scrape_source == "pos") {
2140
+ msg1 <- upload_to_hf(scraped_data(), "pos", "pos_uid_index.csv.gz", "Positional Dataset")
2141
+ scrape_status_msg(msg1)
2142
+
2143
+ } else if (input$scrape_source == "ncaa") {
2144
+ msg1 <- upload_to_hf(scraped_data(), "ncaa_pbp", "ncaa_pbp_uid_index.csv.gz", "NCAA PBP Dataset")
2145
  scrape_status_msg(msg1)
2146
  }
2147
  })