Spaces:
Running
Running
Update app.R
Browse files
app.R
CHANGED
|
@@ -2042,35 +2042,78 @@ observe({
|
|
| 2042 |
# Diagnostic upload handler: verifies that the HF UID index can be downloaded
# before any real upload logic runs. Reports progress via scrape_status_msg().
#
# Side effects: network GET against the Hugging Face dataset repo, a temp file
# on disk (removed on success), and reactive status messages. No return value.
observeEvent(input$upload_hf_btn, {
  req(scraped_data())

  scrape_status_msg("Starting upload...")
  scrape_status_msg(paste0(
    "Scraped data: ", nrow(scraped_data()), " rows, ",
    round(object.size(scraped_data()) / 1024^2, 1), " MB in memory"
  ))

  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
  # Fail fast with a clear message instead of an opaque 401 downstream.
  if (!nzchar(hf_token)) {
    scrape_status_msg("FAILED: HF_WRITE_TOKEN environment variable is not set")
    return()
  }

  # Test just the index download first
  tryCatch({
    tmp_idx <- tempfile(fileext = ".csv.gz")
    resp <- httr::GET(
      # NOTE(review): the URL literal was truncated in the captured diff;
      # reconstructed from the repo id / index path used by the newer version
      # of this handler — confirm against the deployed app.
      paste0(
        "https://huggingface.co/datasets/",
        "CoastalBaseball/2026MasterDataset",
        "/resolve/main/pbp_uid_index.csv.gz"
      ),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    # Without this, a 401/404 error page would be parsed as the index.
    httr::stop_for_status(resp)

    d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
    scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
    file.remove(tmp_idx)

    # Smoke-test that the Python huggingface_hub module can be imported.
    hf <- reticulate::import("huggingface_hub")
  }, error = function(e) {
    scrape_status_msg(paste0("FAILED: ", e$message))
  }, finally = {
    # Best-effort cleanup: the temp file leaks if an error fires before
    # file.remove() on the success path.
    if (exists("tmp_idx", inherits = FALSE) && file.exists(tmp_idx)) {
      unlink(tmp_idx)
    }
  })
})
|
| 2076 |
|
|
|
|
| 2042 |
# Incremental upload of scraped play-by-play data to the Hugging Face dataset
# repo, in five steps:
#   1. download the existing PitchUID index,
#   2. filter scraped_data() to rows whose PitchUID is not yet in the index,
#   3. write the new rows to a local parquet file,
#   4. upload the parquet via the Python huggingface_hub client (reticulate),
#   5. rebuild and upload the updated UID index.
# Progress and failures are reported through scrape_status_msg(). No return
# value; side effects are network I/O, temp files, and reactive messages.
observeEvent(input$upload_hf_btn, {
  req(scraped_data())

  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
  # Fail fast with a clear message instead of an opaque 401 later on.
  if (!nzchar(hf_token)) {
    scrape_status_msg("FAILED: HF_WRITE_TOKEN environment variable is not set")
    return()
  }
  repo_id <- "CoastalBaseball/2026MasterDataset"

  tryCatch({
    # Step 1: Get existing UIDs
    scrape_status_msg("Step 1: Downloading UID index...")
    tmp_idx <- tempfile(fileext = ".csv.gz")
    resp <- httr::GET(
      paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/pbp_uid_index.csv.gz"),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    # Without this, a 401/404 error page would be gzip-parsed as the index.
    httr::stop_for_status(resp)
    existing_uids <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
    file.remove(tmp_idx)
    scrape_status_msg(paste0("Step 1 done: ", length(existing_uids), " existing UIDs"))

    # Step 2: Filter to new rows
    scrape_status_msg("Step 2: Filtering new rows...")
    new_only <- scraped_data() %>% filter(!PitchUID %in% existing_uids)
    scrape_status_msg(paste0("Step 2 done: ", nrow(new_only), " new rows"))

    if (nrow(new_only) == 0) {
      scrape_status_msg("No new rows to upload.")
      return()
    }

    # Step 3: Write parquet
    scrape_status_msg("Step 3: Writing parquet...")
    tmp_data <- tempfile(fileext = ".parquet")
    arrow::write_parquet(new_only, tmp_data)
    scrape_status_msg(paste0("Step 3 done: ", round(file.size(tmp_data) / 1024^2, 1), " MB file"))

    # Step 4: Upload parquet
    scrape_status_msg("Step 4: Uploading parquet...")
    hf <- reticulate::import("huggingface_hub")
    api <- hf$HfApi()
    # Timestamped filename keeps each upload as a distinct parquet shard.
    timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")

    api$upload_file(
      path_or_fileobj = tmp_data,
      path_in_repo = paste0("pbp/", timestamp, ".parquet"),
      repo_id = repo_id,
      repo_type = "dataset",
      token = hf_token
    )
    file.remove(tmp_data)
    scrape_status_msg("Step 4 done: Parquet uploaded")

    # Step 5: Update index
    scrape_status_msg("Step 5: Updating UID index...")
    all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
    # Free the large intermediates before the second upload (Shiny app may be
    # memory-constrained on the hosting Space).
    rm(new_only, existing_uids); gc()

    tmp_idx2 <- tempfile(fileext = ".csv.gz")
    gz <- gzfile(tmp_idx2, "w")
    write.csv(all_uids, gz, row.names = FALSE)
    close(gz)

    api$upload_file(
      path_or_fileobj = tmp_idx2,
      path_in_repo = "pbp_uid_index.csv.gz",
      repo_id = repo_id,
      repo_type = "dataset",
      token = hf_token
    )
    file.remove(tmp_idx2)
    rm(all_uids); gc()

    scrape_status_msg("Step 5 done: ALL COMPLETE")
  }, error = function(e) {
    scrape_status_msg(paste0("FAILED at: ", e$message))
  }, finally = {
    # Best-effort cleanup: any temp file created before an error would
    # otherwise leak (file.remove() only runs on the success path).
    for (f in c("tmp_idx", "tmp_data", "tmp_idx2")) {
      if (exists(f, inherits = FALSE) && file.exists(get(f))) {
        unlink(get(f))
      }
    }
  })
})
|
| 2119 |
|