Spaces:
Running
Running
Update app.R
Browse files
app.R
CHANGED
|
@@ -2044,77 +2044,107 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2044 |
|
| 2045 |
hf_token <- Sys.getenv("HF_WRITE_TOKEN")
|
| 2046 |
repo_id <- "CoastalBaseball/2026MasterDataset"
|
|
|
|
| 2047 |
|
| 2048 |
-
|
| 2049 |
-
|
| 2050 |
-
scrape_status_msg("Step 1: Downloading UID index...")
|
| 2051 |
-
tmp_idx <- tempfile(fileext = ".csv.gz")
|
| 2052 |
-
resp <- httr::GET(
|
| 2053 |
-
paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/pbp_uid_index.csv.gz"),
|
| 2054 |
-
httr::add_headers(Authorization = paste("Bearer", hf_token)),
|
| 2055 |
-
httr::write_disk(tmp_idx, overwrite = TRUE)
|
| 2056 |
-
)
|
| 2057 |
-
existing_uids <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)$PitchUID
|
| 2058 |
-
file.remove(tmp_idx)
|
| 2059 |
-
scrape_status_msg(paste0("Step 1 done: ", length(existing_uids), " existing UIDs"))
|
| 2060 |
|
| 2061 |
-
|
| 2062 |
-
|
| 2063 |
-
|
| 2064 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2065 |
|
| 2066 |
-
|
| 2067 |
-
|
| 2068 |
-
|
|
|
|
|
|
|
|
|
|
| 2069 |
}
|
| 2070 |
|
| 2071 |
-
|
| 2072 |
-
|
| 2073 |
-
|
| 2074 |
-
|
| 2075 |
-
|
|
|
|
|
|
|
|
|
|
| 2076 |
|
| 2077 |
-
# Step 4: Upload parquet
|
| 2078 |
-
scrape_status_msg("Step 4: Uploading parquet...")
|
| 2079 |
hf <- reticulate::import("huggingface_hub")
|
| 2080 |
api <- hf$HfApi()
|
| 2081 |
-
|
|
|
|
|
|
|
| 2082 |
|
| 2083 |
api$upload_file(
|
| 2084 |
path_or_fileobj = tmp_data,
|
| 2085 |
-
path_in_repo = paste0("
|
| 2086 |
repo_id = repo_id,
|
| 2087 |
repo_type = "dataset",
|
| 2088 |
token = hf_token
|
| 2089 |
)
|
| 2090 |
file.remove(tmp_data)
|
| 2091 |
-
scrape_status_msg("Step 4 done: Parquet uploaded")
|
| 2092 |
|
| 2093 |
-
|
| 2094 |
-
scrape_status_msg("Step 5: Updating UID index...")
|
| 2095 |
-
all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
|
| 2096 |
-
rm(new_only, existing_uids); gc()
|
| 2097 |
|
| 2098 |
-
|
| 2099 |
-
|
|
|
|
| 2100 |
write.csv(all_uids, gz, row.names = FALSE)
|
| 2101 |
close(gz)
|
| 2102 |
|
| 2103 |
api$upload_file(
|
| 2104 |
-
path_or_fileobj =
|
| 2105 |
-
path_in_repo =
|
| 2106 |
repo_id = repo_id,
|
| 2107 |
repo_type = "dataset",
|
| 2108 |
token = hf_token
|
| 2109 |
)
|
| 2110 |
-
file.remove(
|
| 2111 |
-
rm(all_uids); gc()
|
| 2112 |
|
| 2113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2114 |
|
| 2115 |
-
|
| 2116 |
-
|
| 2117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2118 |
})
|
| 2119 |
|
| 2120 |
}
|
|
|
|
| 2044 |
|
| 2045 |
hf_token <- Sys.getenv("HF_WRITE_TOKEN")
|
| 2046 |
repo_id <- "CoastalBaseball/2026MasterDataset"
|
| 2047 |
+
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
|
| 2048 |
|
| 2049 |
+
# Upload the not-yet-seen rows of `new_data` to the Hugging Face dataset repo
# and refresh the matching PitchUID index file.
#
# Arguments:
#   new_data   Data frame of scraped rows. Rows are de-duplicated against the
#              remote index only when it has a `PitchUID` column.
#   folder     Repo folder that receives the "<timestamp>.parquet" file.
#   index_file Repo path of the gzipped CSV holding the known PitchUIDs.
#   label      Human-readable dataset name used in status messages.
#
# Returns a single status string describing what happened. Relies on
# `repo_id`, `hf_token`, `timestamp` and `scrape_status_msg()` from the
# enclosing scope.
upload_to_hf <- function(new_data, folder, index_file, label) {
  scrape_status_msg(paste0("Checking existing UIDs for ", label, "..."))

  # Every temp file is registered here so it is removed on all exit paths,
  # including errors raised by the uploads below.
  tmp_files <- character(0)
  on.exit(unlink(tmp_files), add = TRUE)

  tmp_idx <- tempfile(fileext = ".csv.gz")
  tmp_files <- c(tmp_files, tmp_idx)

  # Only a 404 means "no index yet". Any other failure must NOT be treated as
  # an empty index: doing so would re-upload every row as new and overwrite
  # the remote index with just the new UIDs.
  index_state <- tryCatch({
    resp <- httr::GET(
      paste0(
        "https://huggingface.co/datasets/", repo_id,
        "/resolve/main/", index_file
      ),
      httr::add_headers(Authorization = paste("Bearer", hf_token)),
      httr::write_disk(tmp_idx, overwrite = TRUE)
    )
    status <- httr::status_code(resp)
    if (status == 200) {
      d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
      list(ok = TRUE, uids = d$PitchUID)
    } else if (status == 404) {
      list(ok = TRUE, uids = character(0))
    } else {
      list(ok = FALSE, reason = paste0("HTTP ", status))
    }
  }, error = function(e) {
    list(ok = FALSE, reason = conditionMessage(e))
  })
  unlink(tmp_idx)

  if (!isTRUE(index_state$ok)) {
    return(paste0(
      label, ": upload skipped, could not read existing UID index (",
      index_state$reason, ")"
    ))
  }
  existing_uids <- index_state$uids

  scraped_rows <- nrow(new_data)

  if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
    new_only <- dplyr::filter(new_data, !PitchUID %in% existing_uids)
  } else {
    new_only <- new_data
  }

  new_rows <- nrow(new_only)
  total_after <- length(existing_uids) + new_rows

  if (new_rows == 0) {
    return(paste0(label, ": ", scraped_rows, " rows scraped, 0 new rows added (", length(existing_uids), " total)"))
  }

  scrape_status_msg(paste0("Uploading ", new_rows, " new rows for ", label, "..."))

  hf <- reticulate::import("huggingface_hub")
  api <- hf$HfApi()

  tmp_data <- tempfile(fileext = ".parquet")
  tmp_files <- c(tmp_files, tmp_data)
  arrow::write_parquet(new_only, tmp_data)

  api$upload_file(
    path_or_fileobj = tmp_data,
    path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
    repo_id = repo_id,
    repo_type = "dataset",
    token = hf_token
  )
  # Free the (potentially large) parquet file before building the index.
  unlink(tmp_data)

  scrape_status_msg(paste0("Updating ", label, " index..."))

  all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
  tmp_new_idx <- tempfile(fileext = ".csv.gz")
  tmp_files <- c(tmp_files, tmp_new_idx)
  gz <- gzfile(tmp_new_idx, "w")
  # Close the connection even if the write fails.
  tryCatch(
    write.csv(all_uids, gz, row.names = FALSE),
    finally = close(gz)
  )

  api$upload_file(
    path_or_fileobj = tmp_new_idx,
    path_in_repo = index_file,
    repo_id = repo_id,
    repo_type = "dataset",
    token = hf_token
  )
  unlink(tmp_new_idx)

  rm(new_only, all_uids)
  gc()
  paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", total_after, " total)")
}
|
| 2121 |
+
|
| 2122 |
+
# Send the scraped data to the Hugging Face folder(s) for the selected source
# and show the resulting summary message(s).
source_kind <- input$scrape_source

if (source_kind == "pbp") {
  # Full play-by-play goes to the master dataset first.
  master_msg <- upload_to_hf(
    scraped_data(), "pbp", "pbp_uid_index.csv.gz", "Master Dataset"
  )
  gc()

  # Subset of rows where the pitching team is COA_CHA.
  pitcher_rows <- filter(scraped_data(), PitcherTeam == "COA_CHA")
  if (nrow(pitcher_rows) > 0) {
    pitcher_msg <- upload_to_hf(
      pitcher_rows, "coastal_pitchers",
      "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers"
    )
  } else {
    pitcher_msg <- "Coastal Pitchers: No matching rows"
  }
  rm(pitcher_rows)
  gc()

  # Subset of rows where the batting team is COA_CHA.
  hitter_rows <- filter(scraped_data(), BatterTeam == "COA_CHA")
  if (nrow(hitter_rows) > 0) {
    hitter_msg <- upload_to_hf(
      hitter_rows, "coastal_hitters",
      "coastal_hitters_uid_index.csv.gz", "Coastal Hitters"
    )
  } else {
    hitter_msg <- "Coastal Hitters: No matching rows"
  }
  rm(hitter_rows)
  gc()

  scrape_status_msg(paste(master_msg, pitcher_msg, hitter_msg, sep = "\n"))
} else if (source_kind == "pos") {
  scrape_status_msg(
    upload_to_hf(scraped_data(), "pos", "pos_uid_index.csv.gz", "Positional Dataset")
  )
} else if (source_kind == "ncaa") {
  scrape_status_msg(
    upload_to_hf(scraped_data(), "ncaa_pbp", "ncaa_pbp_uid_index.csv.gz", "NCAA PBP Dataset")
  )
}
|
| 2148 |
})
|
| 2149 |
|
| 2150 |
}
|