Spaces:

CoastalBaseball
/

DataProcess

Sleeping

App Files Community

OwenStOnge committed on Mar 4

Commit

a1b0038

verified ·

1 Parent(s): 9881fae

Update app.R

Browse files

Files changed (1) hide show

app.R +24 -96

app.R CHANGED Viewed

@@ -2042,108 +2042,36 @@ observe({
 observeEvent(input$upload_hf_btn, {
   req(scraped_data())
   hf_token <- Sys.getenv("HF_WRITE_TOKEN")
-  repo_id <- "CoastalBaseball/2026MasterDataset"
-  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
-  upload_to_hf <- function(new_data, folder, index_file, label) {
-    scrape_status_msg(paste0("Checking existing UIDs for ", label, "..."))
-    # Download tiny UID index
-    existing_uids <- tryCatch({
-      tmp_idx <- tempfile(fileext = ".csv.gz")
-      resp <- httr::GET(
-        paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", index_file),
-        httr::add_headers(Authorization = paste("Bearer", hf_token)),
-        httr::write_disk(tmp_idx, overwrite = TRUE)
-      )
-      if (httr::status_code(resp) == 200) {
-        d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
-        file.remove(tmp_idx)
-        d$PitchUID
-      } else {
-        file.remove(tmp_idx)
-        character(0)
-      }
-    }, error = function(e) { character(0) })
-    # Filter to only new rows
-    scraped_rows <- nrow(new_data)
-    if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
-      new_only <- new_data %>% filter(!PitchUID %in% existing_uids)
-    } else {
-      new_only <- new_data
-    }
-    if (nrow(new_only) == 0) {
-      return(paste0(label, ": 0 new rows (all ", scraped_rows, " already exist)"))
-    }
-    scrape_status_msg(paste0("Uploading ", nrow(new_only), " new rows for ", label, "..."))
-    hf <- reticulate::import("huggingface_hub")
-    api <- hf$HfApi()
-    # Upload new data as timestamped parquet into folder
-    tmp_data <- tempfile(fileext = ".parquet")
-    arrow::write_parquet(new_only, tmp_data)
-    api$upload_file(
-      path_or_fileobj = tmp_data,
-      path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
-      repo_id = repo_id,
-      repo_type = "dataset",
-      token = hf_token
-    )
-    file.remove(tmp_data)
-    # Update UID index (append new UIDs)
-    all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
     tmp_idx <- tempfile(fileext = ".csv.gz")
-    gz <- gzfile(tmp_idx, "w")
-    write.csv(all_uids, gz, row.names = FALSE)
-    close(gz)
-    api$upload_file(
-      path_or_fileobj = tmp_idx,
-      path_in_repo = index_file,
-      repo_id = repo_id,
-      repo_type = "dataset",
-      token = hf_token
     )
-    file.remove(tmp_idx)
-    rm(new_only, all_uids); gc()
-    paste0(label, ": ", scraped_rows, " scraped, ", nrow(new_only), " new rows added (", length(existing_uids) + nrow(new_only), " total)")
-  }
-  # Determine folder based on scrape source
-  if (input$scrape_source == "pbp") {
-    msg1 <- upload_to_hf(scraped_data(), "pbp", "pbp_uid_index.csv.gz", "Master Dataset")
-    gc()
-    cp <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
-    msg2 <- if (nrow(cp) > 0) {
-      upload_to_hf(cp, "coastal_pitchers", "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers")
-    } else { "Coastal Pitchers: No matching rows" }
-    rm(cp); gc()
-    ch <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
-    msg3 <- if (nrow(ch) > 0) {
-      upload_to_hf(ch, "coastal_hitters", "coastal_hitters_uid_index.csv.gz", "Coastal Hitters")
-    } else { "Coastal Hitters: No matching rows" }
-    rm(ch); gc()
-    scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
-  } else if (input$scrape_source == "pos") {
-    msg1 <- upload_to_hf(scraped_data(), "pos", "pos_uid_index.csv.gz", "Positional Dataset")
-    scrape_status_msg(msg1)
-  } else if (input$scrape_source == "ncaa") {
-    msg1 <- upload_to_hf(scraped_data(), "ncaa_pbp", "ncaa_pbp_uid_index.csv.gz", "NCAA PBP Dataset")
-    scrape_status_msg(msg1)
-  }
 })
 }

 observeEvent(input$upload_hf_btn, {
   req(scraped_data())
+  scrape_status_msg("Starting upload...")
+  scrape_status_msg(paste0("Scraped data: ", nrow(scraped_data()), " rows, ",
+                           round(object.size(scraped_data()) / 1024^2, 1), " MB in memory"))
   hf_token <- Sys.getenv("HF_WRITE_TOKEN")
+  scrape_status_msg(paste0("Token found: ", nchar(hf_token) > 0))
+  # Test just the index download first
+  tryCatch({
+    scrape_status_msg("Downloading UID index...")
     tmp_idx <- tempfile(fileext = ".csv.gz")
+    resp <- httr::GET(
+      paste0("https://huggingface.co/datasets/CoastalBaseball/2026MasterDataset/resolve/main/pbp_uid_index.csv.gz"),
+      httr::add_headers(Authorization = paste("Bearer", hf_token)),
+      httr::write_disk(tmp_idx, overwrite = TRUE)
     )
+    scrape_status_msg(paste0("Index download status: ", httr::status_code(resp)))
+    d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
+    scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
+    file.remove(tmp_idx)
+    rm(d); gc()
+    scrape_status_msg("Index test passed. Now testing Python import...")
+    hf <- reticulate::import("huggingface_hub")
+    scrape_status_msg("Python import succeeded")
+  }, error = function(e) {
+    scrape_status_msg(paste0("FAILED: ", e$message))
+  })
 })
 }