Spaces:
Running
Running
Update app.R
Browse files
app.R
CHANGED
|
@@ -2042,35 +2042,78 @@ observe({
|
|
| 2042 |
# Diagnostic upload handler: verifies that the HF UID index can be downloaded
# before any real upload logic runs. Reports progress via scrape_status_msg().
#
# Side effects: network GET against the Hugging Face dataset repo, a temp file
# on disk (removed on success), and reactive status messages. No return value.
observeEvent(input$upload_hf_btn, {
  req(scraped_data())

  scrape_status_msg("Starting upload...")
  scrape_status_msg(paste0(
    "Scraped data: ", nrow(scraped_data()), " rows, ",
    round(object.size(scraped_data()) / 1024^2, 1), " MB in memory"
  ))

  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
  # Fail fast with a clear message instead of an opaque 401 downstream.
  if (!nzchar(hf_token)) {
    scrape_status_msg("FAILED: HF_WRITE_TOKEN environment variable is not set")
    return()
  }

  # Test just the index download first
  tryCatch({
    tmp_idx <- tempfile(fileext = ".csv.gz")
    resp <- httr::GET(
      # NOTE(review): the URL literal was truncated in the captured diff;
      # reconstructed from the repo id / index path used by the newer version
      # of this handler — confirm against the deployed app.
      paste0(
        "https://huggingface.co/datasets/",
        "CoastalBaseball/2026MasterDataset",
        "/resolve/main/pbp_uid_index.csv.gz"
      ),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    # Without this, a 401/404 error page would be parsed as the index.
    httr::stop_for_status(resp)

    d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
    scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
    file.remove(tmp_idx)

    # Smoke-test that the Python huggingface_hub module can be imported.
    hf <- reticulate::import("huggingface_hub")
  }, error = function(e) {
    scrape_status_msg(paste0("FAILED: ", e$message))
  }, finally = {
    # Best-effort cleanup: the temp file leaks if an error fires before
    # file.remove() on the success path.
    if (exists("tmp_idx", inherits = FALSE) && file.exists(tmp_idx)) {
      unlink(tmp_idx)
    }
  })
})
|
| 2076 |
|
|
|
|
| 2042 |
# Incremental upload of scraped play-by-play data to the Hugging Face dataset
# repo, in five steps:
#   1. download the existing PitchUID index,
#   2. filter scraped_data() to rows whose PitchUID is not yet in the index,
#   3. write the new rows to a local parquet file,
#   4. upload the parquet via the Python huggingface_hub client (reticulate),
#   5. rebuild and upload the updated UID index.
# Progress and failures are reported through scrape_status_msg(). No return
# value; side effects are network I/O, temp files, and reactive messages.
observeEvent(input$upload_hf_btn, {
  req(scraped_data())

  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
  # Fail fast with a clear message instead of an opaque 401 later on.
  if (!nzchar(hf_token)) {
    scrape_status_msg("FAILED: HF_WRITE_TOKEN environment variable is not set")
    return()
  }
  repo_id <- "CoastalBaseball/2026MasterDataset"

  tryCatch({
    # Step 1: Get existing UIDs
    scrape_status_msg("Step 1: Downloading UID index...")
    tmp_idx <- tempfile(fileext = ".csv.gz")
    resp <- httr::GET(
      paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/pbp_uid_index.csv.gz"),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    # Without this, a 401/404 error page would be gzip-parsed as the index.
    httr::stop_for_status(resp)
    existing_uids <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
    file.remove(tmp_idx)
    scrape_status_msg(paste0("Step 1 done: ", length(existing_uids), " existing UIDs"))

    # Step 2: Filter to new rows
    scrape_status_msg("Step 2: Filtering new rows...")
    new_only <- scraped_data() %>% filter(!PitchUID %in% existing_uids)
    scrape_status_msg(paste0("Step 2 done: ", nrow(new_only), " new rows"))

    if (nrow(new_only) == 0) {
      scrape_status_msg("No new rows to upload.")
      return()
    }

    # Step 3: Write parquet
    scrape_status_msg("Step 3: Writing parquet...")
    tmp_data <- tempfile(fileext = ".parquet")
    arrow::write_parquet(new_only, tmp_data)
    scrape_status_msg(paste0("Step 3 done: ", round(file.size(tmp_data) / 1024^2, 1), " MB file"))

    # Step 4: Upload parquet
    scrape_status_msg("Step 4: Uploading parquet...")
    hf <- reticulate::import("huggingface_hub")
    api <- hf$HfApi()
    # Timestamped filename keeps each upload as a distinct parquet shard.
    timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")

    api$upload_file(
      path_or_fileobj = tmp_data,
      path_in_repo = paste0("pbp/", timestamp, ".parquet"),
      repo_id = repo_id,
      repo_type = "dataset",
      token = hf_token
    )
    file.remove(tmp_data)
    scrape_status_msg("Step 4 done: Parquet uploaded")

    # Step 5: Update index
    scrape_status_msg("Step 5: Updating UID index...")
    all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
    # Free the large intermediates before the second upload (Shiny app may be
    # memory-constrained on the hosting Space).
    rm(new_only, existing_uids); gc()

    tmp_idx2 <- tempfile(fileext = ".csv.gz")
    gz <- gzfile(tmp_idx2, "w")
    write.csv(all_uids, gz, row.names = FALSE)
    close(gz)

    api$upload_file(
      path_or_fileobj = tmp_idx2,
      path_in_repo = "pbp_uid_index.csv.gz",
      repo_id = repo_id,
      repo_type = "dataset",
      token = hf_token
    )
    file.remove(tmp_idx2)
    rm(all_uids); gc()

    scrape_status_msg("Step 5 done: ALL COMPLETE")
  }, error = function(e) {
    scrape_status_msg(paste0("FAILED at: ", e$message))
  }, finally = {
    # Best-effort cleanup: any temp file created before an error would
    # otherwise leak (file.remove() only runs on the success path).
    for (f in c("tmp_idx", "tmp_data", "tmp_idx2")) {
      if (exists(f, inherits = FALSE) && file.exists(get(f))) {
        unlink(get(f))
      }
    }
  })
})
|
| 2119 |
|