OwenStOnge committed on
Commit
76bbd9f
·
verified ·
1 Parent(s): a1b0038

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +59 -16
app.R CHANGED
@@ -2042,35 +2042,78 @@ observe({
2042
  observeEvent(input$upload_hf_btn, {
2043
  req(scraped_data())
2044
 
2045
- scrape_status_msg("Starting upload...")
2046
- scrape_status_msg(paste0("Scraped data: ", nrow(scraped_data()), " rows, ",
2047
- round(object.size(scraped_data()) / 1024^2, 1), " MB in memory"))
2048
-
2049
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2050
- scrape_status_msg(paste0("Token found: ", nchar(hf_token) > 0))
2051
 
2052
- # Test just the index download first
2053
  tryCatch({
2054
- scrape_status_msg("Downloading UID index...")
 
2055
  tmp_idx <- tempfile(fileext = ".csv.gz")
2056
  resp <- httr::GET(
2057
- paste0("https://huggingface.co/datasets/CoastalBaseball/2026MasterDataset/resolve/main/pbp_uid_index.csv.gz"),
2058
  httr::add_headers(Authorization = paste("Bearer", hf_token)),
2059
  httr::write_disk(tmp_idx, overwrite = TRUE)
2060
  )
2061
- scrape_status_msg(paste0("Index download status: ", httr::status_code(resp)))
2062
-
2063
- d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
2064
- scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
2065
  file.remove(tmp_idx)
2066
- rm(d); gc()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2067
 
2068
- scrape_status_msg("Index test passed. Now testing Python import...")
 
2069
  hf <- reticulate::import("huggingface_hub")
2070
- scrape_status_msg("Python import succeeded")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2071
 
2072
  }, error = function(e) {
2073
- scrape_status_msg(paste0("FAILED: ", e$message))
2074
  })
2075
  })
2076
 
 
2042
  observeEvent(input$upload_hf_btn, {
2043
  req(scraped_data())
2044
 
 
 
 
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
+ repo_id <- "CoastalBaseball/2026MasterDataset"
2047
 
 
2048
  tryCatch({
2049
+ # Step 1: Get existing UIDs
2050
+ scrape_status_msg("Step 1: Downloading UID index...")
2051
  tmp_idx <- tempfile(fileext = ".csv.gz")
2052
  resp <- httr::GET(
2053
+ paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/pbp_uid_index.csv.gz"),
2054
  httr::add_headers(Authorization = paste("Bearer", hf_token)),
2055
  httr::write_disk(tmp_idx, overwrite = TRUE)
2056
  )
2057
+ existing_uids <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
 
 
 
2058
  file.remove(tmp_idx)
2059
+ scrape_status_msg(paste0("Step 1 done: ", length(existing_uids), " existing UIDs"))
2060
+
2061
+ # Step 2: Filter to new rows
2062
+ scrape_status_msg("Step 2: Filtering new rows...")
2063
+ new_only <- scraped_data() %>% filter(!PitchUID %in% existing_uids)
2064
+ scrape_status_msg(paste0("Step 2 done: ", nrow(new_only), " new rows"))
2065
+
2066
+ if (nrow(new_only) == 0) {
2067
+ scrape_status_msg("No new rows to upload.")
2068
+ return()
2069
+ }
2070
+
2071
+ # Step 3: Write parquet
2072
+ scrape_status_msg("Step 3: Writing parquet...")
2073
+ tmp_data <- tempfile(fileext = ".parquet")
2074
+ arrow::write_parquet(new_only, tmp_data)
2075
+ scrape_status_msg(paste0("Step 3 done: ", round(file.size(tmp_data) / 1024^2, 1), " MB file"))
2076
 
2077
+ # Step 4: Upload parquet
2078
+ scrape_status_msg("Step 4: Uploading parquet...")
2079
  hf <- reticulate::import("huggingface_hub")
2080
+ api <- hf$HfApi()
2081
+ timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
2082
+
2083
+ api$upload_file(
2084
+ path_or_fileobj = tmp_data,
2085
+ path_in_repo = paste0("pbp/", timestamp, ".parquet"),
2086
+ repo_id = repo_id,
2087
+ repo_type = "dataset",
2088
+ token = hf_token
2089
+ )
2090
+ file.remove(tmp_data)
2091
+ scrape_status_msg("Step 4 done: Parquet uploaded")
2092
+
2093
+ # Step 5: Update index
2094
+ scrape_status_msg("Step 5: Updating UID index...")
2095
+ all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
2096
+ rm(new_only, existing_uids); gc()
2097
+
2098
+ tmp_idx2 <- tempfile(fileext = ".csv.gz")
2099
+ gz <- gzfile(tmp_idx2, "w")
2100
+ write.csv(all_uids, gz, row.names = FALSE)
2101
+ close(gz)
2102
+
2103
+ api$upload_file(
2104
+ path_or_fileobj = tmp_idx2,
2105
+ path_in_repo = "pbp_uid_index.csv.gz",
2106
+ repo_id = repo_id,
2107
+ repo_type = "dataset",
2108
+ token = hf_token
2109
+ )
2110
+ file.remove(tmp_idx2)
2111
+ rm(all_uids); gc()
2112
+
2113
+ scrape_status_msg("Step 5 done: ALL COMPLETE")
2114
 
2115
  }, error = function(e) {
2116
+ scrape_status_msg(paste0("FAILED at: ", e$message))
2117
  })
2118
  })
2119