OwenStOnge committed on
Commit
a1b0038
·
verified ·
1 Parent(s): 9881fae

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +24 -96
app.R CHANGED
@@ -2042,108 +2042,36 @@ observe({
2042
  observeEvent(input$upload_hf_btn, {
2043
  req(scraped_data())
2044
 
 
 
 
 
2045
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2046
- repo_id <- "CoastalBaseball/2026MasterDataset"
2047
- timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
2048
-
2049
- upload_to_hf <- function(new_data, folder, index_file, label) {
2050
- scrape_status_msg(paste0("Checking existing UIDs for ", label, "..."))
2051
-
2052
- # Download tiny UID index
2053
- existing_uids <- tryCatch({
2054
- tmp_idx <- tempfile(fileext = ".csv.gz")
2055
- resp <- httr::GET(
2056
- paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", index_file),
2057
- httr::add_headers(Authorization = paste("Bearer", hf_token)),
2058
- httr::write_disk(tmp_idx, overwrite = TRUE)
2059
- )
2060
- if (httr::status_code(resp) == 200) {
2061
- d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
2062
- file.remove(tmp_idx)
2063
- d$PitchUID
2064
- } else {
2065
- file.remove(tmp_idx)
2066
- character(0)
2067
- }
2068
- }, error = function(e) { character(0) })
2069
-
2070
- # Filter to only new rows
2071
- scraped_rows <- nrow(new_data)
2072
- if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
2073
- new_only <- new_data %>% filter(!PitchUID %in% existing_uids)
2074
- } else {
2075
- new_only <- new_data
2076
- }
2077
-
2078
- if (nrow(new_only) == 0) {
2079
- return(paste0(label, ": 0 new rows (all ", scraped_rows, " already exist)"))
2080
- }
2081
-
2082
- scrape_status_msg(paste0("Uploading ", nrow(new_only), " new rows for ", label, "..."))
2083
-
2084
- hf <- reticulate::import("huggingface_hub")
2085
- api <- hf$HfApi()
2086
-
2087
- # Upload new data as timestamped parquet into folder
2088
- tmp_data <- tempfile(fileext = ".parquet")
2089
- arrow::write_parquet(new_only, tmp_data)
2090
-
2091
- api$upload_file(
2092
- path_or_fileobj = tmp_data,
2093
- path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
2094
- repo_id = repo_id,
2095
- repo_type = "dataset",
2096
- token = hf_token
2097
- )
2098
- file.remove(tmp_data)
2099
-
2100
- # Update UID index (append new UIDs)
2101
- all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
2102
  tmp_idx <- tempfile(fileext = ".csv.gz")
2103
- gz <- gzfile(tmp_idx, "w")
2104
- write.csv(all_uids, gz, row.names = FALSE)
2105
- close(gz)
2106
-
2107
- api$upload_file(
2108
- path_or_fileobj = tmp_idx,
2109
- path_in_repo = index_file,
2110
- repo_id = repo_id,
2111
- repo_type = "dataset",
2112
- token = hf_token
2113
  )
2114
- file.remove(tmp_idx)
2115
-
2116
- rm(new_only, all_uids); gc()
2117
- paste0(label, ": ", scraped_rows, " scraped, ", nrow(new_only), " new rows added (", length(existing_uids) + nrow(new_only), " total)")
2118
- }
2119
-
2120
- # Determine folder based on scrape source
2121
- if (input$scrape_source == "pbp") {
2122
- msg1 <- upload_to_hf(scraped_data(), "pbp", "pbp_uid_index.csv.gz", "Master Dataset")
2123
- gc()
2124
-
2125
- cp <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
2126
- msg2 <- if (nrow(cp) > 0) {
2127
- upload_to_hf(cp, "coastal_pitchers", "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers")
2128
- } else { "Coastal Pitchers: No matching rows" }
2129
- rm(cp); gc()
2130
 
2131
- ch <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
2132
- msg3 <- if (nrow(ch) > 0) {
2133
- upload_to_hf(ch, "coastal_hitters", "coastal_hitters_uid_index.csv.gz", "Coastal Hitters")
2134
- } else { "Coastal Hitters: No matching rows" }
2135
- rm(ch); gc()
2136
-
2137
- scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
2138
 
2139
- } else if (input$scrape_source == "pos") {
2140
- msg1 <- upload_to_hf(scraped_data(), "pos", "pos_uid_index.csv.gz", "Positional Dataset")
2141
- scrape_status_msg(msg1)
2142
 
2143
- } else if (input$scrape_source == "ncaa") {
2144
- msg1 <- upload_to_hf(scraped_data(), "ncaa_pbp", "ncaa_pbp_uid_index.csv.gz", "NCAA PBP Dataset")
2145
- scrape_status_msg(msg1)
2146
- }
2147
  })
2148
 
2149
  }
 
2042
  observeEvent(input$upload_hf_btn, {
2043
  req(scraped_data())
2044
 
2045
+ scrape_status_msg("Starting upload...")
2046
+ scrape_status_msg(paste0("Scraped data: ", nrow(scraped_data()), " rows, ",
2047
+ round(object.size(scraped_data()) / 1024^2, 1), " MB in memory"))
2048
+
2049
  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
2050
+ scrape_status_msg(paste0("Token found: ", nchar(hf_token) > 0))
2051
+
2052
+ # Test just the index download first
2053
+ tryCatch({
2054
+ scrape_status_msg("Downloading UID index...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2055
  tmp_idx <- tempfile(fileext = ".csv.gz")
2056
+ resp <- httr::GET(
2057
+ paste0("https://huggingface.co/datasets/CoastalBaseball/2026MasterDataset/resolve/main/pbp_uid_index.csv.gz"),
2058
+ httr::add_headers(Authorization = paste("Bearer", hf_token)),
2059
+ httr::write_disk(tmp_idx, overwrite = TRUE)
 
 
 
 
 
 
2060
  )
2061
+ scrape_status_msg(paste0("Index download status: ", httr::status_code(resp)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2062
 
2063
+ d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
2064
+ scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
2065
+ file.remove(tmp_idx)
2066
+ rm(d); gc()
 
 
 
2067
 
2068
+ scrape_status_msg("Index test passed. Now testing Python import...")
2069
+ hf <- reticulate::import("huggingface_hub")
2070
+ scrape_status_msg("Python import succeeded")
2071
 
2072
+ }, error = function(e) {
2073
+ scrape_status_msg(paste0("FAILED: ", e$message))
2074
+ })
 
2075
  })
2076
 
2077
  }