OwenStOnge committed on
Commit
f3ab272
·
verified ·
1 Parent(s): 76bbd9f

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +73 -43
app.R CHANGED
@@ -2044,77 +2044,107 @@ observeEvent(input$upload_hf_btn, {
2044
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
  repo_id <- "CoastalBaseball/2026MasterDataset"
 
2047
 
2048
- tryCatch({
2049
- # Step 1: Get existing UIDs
2050
- scrape_status_msg("Step 1: Downloading UID index...")
2051
- tmp_idx <- tempfile(fileext = ".csv.gz")
2052
- resp <- httr::GET(
2053
- paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/pbp_uid_index.csv.gz"),
2054
- httr::add_headers(Authorization = paste("Bearer", hf_token)),
2055
- httr::write_disk(tmp_idx, overwrite = TRUE)
2056
- )
2057
- existing_uids <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
2058
- file.remove(tmp_idx)
2059
- scrape_status_msg(paste0("Step 1 done: ", length(existing_uids), " existing UIDs"))
2060
 
2061
- # Step 2: Filter to new rows
2062
- scrape_status_msg("Step 2: Filtering new rows...")
2063
- new_only <- scraped_data() %>% filter(!PitchUID %in% existing_uids)
2064
- scrape_status_msg(paste0("Step 2 done: ", nrow(new_only), " new rows"))
 
 
 
 
 
 
 
 
 
 
 
 
2065
 
2066
- if (nrow(new_only) == 0) {
2067
- scrape_status_msg("No new rows to upload.")
2068
- return()
 
 
 
2069
  }
2070
 
2071
- # Step 3: Write parquet
2072
- scrape_status_msg("Step 3: Writing parquet...")
2073
- tmp_data <- tempfile(fileext = ".parquet")
2074
- arrow::write_parquet(new_only, tmp_data)
2075
- scrape_status_msg(paste0("Step 3 done: ", round(file.size(tmp_data) / 1024^2, 1), " MB file"))
 
 
 
2076
 
2077
- # Step 4: Upload parquet
2078
- scrape_status_msg("Step 4: Uploading parquet...")
2079
  hf <- reticulate::import("huggingface_hub")
2080
  api <- hf$HfApi()
2081
- timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
 
 
2082
 
2083
  api$upload_file(
2084
  path_or_fileobj = tmp_data,
2085
- path_in_repo = paste0("pbp/", timestamp, ".parquet"),
2086
  repo_id = repo_id,
2087
  repo_type = "dataset",
2088
  token = hf_token
2089
  )
2090
  file.remove(tmp_data)
2091
- scrape_status_msg("Step 4 done: Parquet uploaded")
2092
 
2093
- # Step 5: Update index
2094
- scrape_status_msg("Step 5: Updating UID index...")
2095
- all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
2096
- rm(new_only, existing_uids); gc()
2097
 
2098
- tmp_idx2 <- tempfile(fileext = ".csv.gz")
2099
- gz <- gzfile(tmp_idx2, "w")
 
2100
  write.csv(all_uids, gz, row.names = FALSE)
2101
  close(gz)
2102
 
2103
  api$upload_file(
2104
- path_or_fileobj = tmp_idx2,
2105
- path_in_repo = "pbp_uid_index.csv.gz",
2106
  repo_id = repo_id,
2107
  repo_type = "dataset",
2108
  token = hf_token
2109
  )
2110
- file.remove(tmp_idx2)
2111
- rm(all_uids); gc()
2112
 
2113
- scrape_status_msg("Step 5 done: ALL COMPLETE")
 
 
 
 
 
 
2114
 
2115
- }, error = function(e) {
2116
- scrape_status_msg(paste0("FAILED at: ", e$message))
2117
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2118
  })
2119
 
2120
  }
 
2044
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
  repo_id <- "CoastalBaseball/2026MasterDataset"
2047
+ timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
2048
 
2049
# Upload the rows of `new_data` that are not yet in the Hugging Face dataset.
#
# Downloads the UID index `index_file` from the dataset repo, keeps only rows
# whose PitchUID is not listed there, uploads them as
# `<folder>/<timestamp>.parquet`, then re-uploads the index with the new UIDs
# appended.
#
# Relies on `repo_id`, `hf_token`, `timestamp` and `scrape_status_msg()` from
# the enclosing scope.
#
# @param new_data   Data frame of scraped rows.
# @param folder     Repo folder that receives the parquet file.
# @param index_file Repo path of the gzipped CSV listing known PitchUIDs.
# @param label      Dataset name used in status messages.
# @return A single status string describing what was (or was not) uploaded.
upload_to_hf <- function(new_data, folder, index_file, label) {
  scrape_status_msg(paste0("Checking existing UIDs for ", label, "..."))

  # Register temp files for cleanup up front so they are removed on every exit
  # path, including a failed download or upload.
  tmp_idx <- tempfile(fileext = ".csv.gz")
  tmp_data <- tempfile(fileext = ".parquet")
  on.exit(unlink(c(tmp_idx, tmp_data)), add = TRUE)

  existing_uids <- tryCatch({
    resp <- httr::GET(
      paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", index_file),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    status <- httr::status_code(resp)
    if (status == 200) {
      read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
    } else if (status == 404) {
      # No index in the repo yet: first upload for this dataset.
      character(0)
    } else {
      stop("HTTP ", status, call. = FALSE)
    }
  }, error = function(e) e)

  # Any failure other than "index does not exist" must abort. Carrying on with
  # an empty UID list would re-upload every row and then overwrite the real
  # index with only this batch's UIDs.
  if (inherits(existing_uids, "condition")) {
    return(paste0(
      label, ": FAILED to read UID index (", conditionMessage(existing_uids),
      "); nothing uploaded"
    ))
  }

  scraped_rows <- nrow(new_data)

  # De-duplicate only when there is something to compare against; data without
  # a PitchUID column is uploaded as-is.
  if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
    new_only <- new_data %>% filter(!PitchUID %in% existing_uids)
  } else {
    new_only <- new_data
  }

  new_rows <- nrow(new_only)
  total_after <- length(existing_uids) + new_rows

  if (new_rows == 0) {
    return(paste0(label, ": ", scraped_rows, " rows scraped, 0 new rows added (", length(existing_uids), " total)"))
  }

  scrape_status_msg(paste0("Uploading ", new_rows, " new rows for ", label, "..."))

  hf <- reticulate::import("huggingface_hub")
  api <- hf$HfApi()

  arrow::write_parquet(new_only, tmp_data)

  api$upload_file(
    path_or_fileobj = tmp_data,
    path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
    repo_id = repo_id,
    repo_type = "dataset",
    token = hf_token
  )

  scrape_status_msg(paste0("Updating ", label, " index..."))

  # NOTE(review): the parquet and the index are uploaded as two separate
  # commits; if the index upload fails, the next run will upload these rows
  # again.
  all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
  gz <- gzfile(tmp_idx, "w")
  tryCatch(
    write.csv(all_uids, gz, row.names = FALSE),
    finally = close(gz)
  )

  api$upload_file(
    path_or_fileobj = tmp_idx,
    path_in_repo = index_file,
    repo_id = repo_id,
    repo_type = "dataset",
    token = hf_token
  )

  rm(new_only, all_uids); gc()
  paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", total_after, " total)")
}
2121
+
2122
# Route the scraped data to the dataset folder(s) matching the selected source
# and show the resulting summary. Any error raised during an upload is caught
# and shown as a status message instead of escaping the observer.
upload_report <- tryCatch({
  # Read the reactive once; every branch works from the same snapshot.
  scraped <- scraped_data()
  source_kind <- input$scrape_source

  if (source_kind == "pbp") {
    msg1 <- upload_to_hf(scraped, "pbp", "pbp_uid_index.csv.gz", "Master Dataset")
    gc()

    # Coastal-only subsets are stored in their own folders with their own
    # UID indexes.
    cp <- scraped %>% filter(PitcherTeam == "COA_CHA")
    msg2 <- if (nrow(cp) > 0) {
      upload_to_hf(cp, "coastal_pitchers", "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers")
    } else {
      "Coastal Pitchers: No matching rows"
    }
    rm(cp); gc()

    ch <- scraped %>% filter(BatterTeam == "COA_CHA")
    msg3 <- if (nrow(ch) > 0) {
      upload_to_hf(ch, "coastal_hitters", "coastal_hitters_uid_index.csv.gz", "Coastal Hitters")
    } else {
      "Coastal Hitters: No matching rows"
    }
    rm(ch); gc()

    paste(msg1, msg2, msg3, sep = "\n")
  } else if (source_kind == "pos") {
    upload_to_hf(scraped, "pos", "pos_uid_index.csv.gz", "Positional Dataset")
  } else if (source_kind == "ncaa") {
    upload_to_hf(scraped, "ncaa_pbp", "ncaa_pbp_uid_index.csv.gz", "NCAA PBP Dataset")
  } else {
    # Unknown source: nothing to upload and no status change.
    NULL
  }
}, error = function(e) {
  paste0("FAILED at: ", conditionMessage(e))
})

if (!is.null(upload_report)) {
  scrape_status_msg(upload_report)
}
2148
  })
2149
 
2150
  }