Spaces:
Running
Running
Update app.R
Browse files
app.R
CHANGED
|
@@ -2056,7 +2056,9 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2056 |
if (httr::status_code(resp) == 200) {
|
| 2057 |
tmp_dl <- tempfile(fileext = ".parquet")
|
| 2058 |
writeBin(httr::content(resp, as = "raw"), tmp_dl)
|
| 2059 |
-
arrow::read_parquet(tmp_dl)
|
|
|
|
|
|
|
| 2060 |
} else { NULL }
|
| 2061 |
}, error = function(e) { NULL })
|
| 2062 |
|
|
@@ -2066,6 +2068,8 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2066 |
if (existing_rows > 0) {
|
| 2067 |
scrape_status_msg(paste0("Merging ", label, "..."))
|
| 2068 |
combined <- bind_rows(existing, new_data)
|
|
|
|
|
|
|
| 2069 |
if ("PitchUID" %in% names(combined)) {
|
| 2070 |
combined <- combined %>% distinct(PitchUID, .keep_all = TRUE)
|
| 2071 |
} else {
|
|
@@ -2073,6 +2077,7 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2073 |
}
|
| 2074 |
} else {
|
| 2075 |
combined <- new_data
|
|
|
|
| 2076 |
}
|
| 2077 |
|
| 2078 |
new_rows <- nrow(combined) - existing_rows
|
|
@@ -2081,9 +2086,10 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2081 |
|
| 2082 |
tmp <- tempfile(fileext = ".parquet")
|
| 2083 |
arrow::write_parquet(combined, tmp)
|
|
|
|
| 2084 |
|
| 2085 |
result <- tryCatch({
|
| 2086 |
-
hf <- import("huggingface_hub")
|
| 2087 |
api <- hf$HfApi()
|
| 2088 |
api$upload_file(
|
| 2089 |
path_or_fileobj = tmp,
|
|
@@ -2092,7 +2098,7 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2092 |
repo_type = "dataset",
|
| 2093 |
token = hf_token
|
| 2094 |
)
|
| 2095 |
-
paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", nrow(
|
| 2096 |
}, error = function(e) {
|
| 2097 |
paste0(label, " upload error: ", e$message)
|
| 2098 |
})
|
|
@@ -2102,21 +2108,29 @@ observeEvent(input$upload_hf_btn, {
|
|
| 2102 |
return(result)
|
| 2103 |
}
|
| 2104 |
|
|
|
|
| 2105 |
main_file <- paste0(input$scrape_source, "_2026_master.parquet")
|
| 2106 |
msg1 <- upload_to_hf(scraped_data(), main_file, "Master Dataset")
|
|
|
|
| 2107 |
|
| 2108 |
if (input$scrape_source == "pbp") {
|
|
|
|
| 2109 |
coastal_pitchers <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
|
| 2110 |
-
coastal_hitters <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
|
| 2111 |
|
| 2112 |
msg2 <- if (nrow(coastal_pitchers) > 0) {
|
| 2113 |
upload_to_hf(coastal_pitchers, "CoastalPitchers2026.parquet", "Coastal Pitchers")
|
| 2114 |
} else { "Coastal Pitchers: No matching rows" }
|
| 2115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2116 |
msg3 <- if (nrow(coastal_hitters) > 0) {
|
| 2117 |
upload_to_hf(coastal_hitters, "CoastalHitters2026.parquet", "Coastal Hitters")
|
| 2118 |
} else { "Coastal Hitters: No matching rows" }
|
| 2119 |
|
|
|
|
|
|
|
| 2120 |
scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
|
| 2121 |
} else {
|
| 2122 |
scrape_status_msg(msg1)
|
|
|
|
| 2056 |
if (httr::status_code(resp) == 200) {
|
| 2057 |
tmp_dl <- tempfile(fileext = ".parquet")
|
| 2058 |
writeBin(httr::content(resp, as = "raw"), tmp_dl)
|
| 2059 |
+
d <- arrow::read_parquet(tmp_dl)
|
| 2060 |
+
file.remove(tmp_dl)
|
| 2061 |
+
d
|
| 2062 |
} else { NULL }
|
| 2063 |
}, error = function(e) { NULL })
|
| 2064 |
|
|
|
|
| 2068 |
if (existing_rows > 0) {
|
| 2069 |
scrape_status_msg(paste0("Merging ", label, "..."))
|
| 2070 |
combined <- bind_rows(existing, new_data)
|
| 2071 |
+
rm(existing); gc()
|
| 2072 |
+
|
| 2073 |
if ("PitchUID" %in% names(combined)) {
|
| 2074 |
combined <- combined %>% distinct(PitchUID, .keep_all = TRUE)
|
| 2075 |
} else {
|
|
|
|
| 2077 |
}
|
| 2078 |
} else {
|
| 2079 |
combined <- new_data
|
| 2080 |
+
rm(existing); gc()
|
| 2081 |
}
|
| 2082 |
|
| 2083 |
new_rows <- nrow(combined) - existing_rows
|
|
|
|
| 2086 |
|
| 2087 |
tmp <- tempfile(fileext = ".parquet")
|
| 2088 |
arrow::write_parquet(combined, tmp)
|
| 2089 |
+
rm(combined); gc()
|
| 2090 |
|
| 2091 |
result <- tryCatch({
|
| 2092 |
+
hf <- reticulate::import("huggingface_hub")
|
| 2093 |
api <- hf$HfApi()
|
| 2094 |
api$upload_file(
|
| 2095 |
path_or_fileobj = tmp,
|
|
|
|
| 2098 |
repo_type = "dataset",
|
| 2099 |
token = hf_token
|
| 2100 |
)
|
| 2101 |
+
paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", nrow(new_data), " + ", existing_rows, " existing = deduped)")
|
| 2102 |
}, error = function(e) {
|
| 2103 |
paste0(label, " upload error: ", e$message)
|
| 2104 |
})
|
|
|
|
| 2108 |
return(result)
|
| 2109 |
}
|
| 2110 |
|
| 2111 |
+
# Upload main dataset first
|
| 2112 |
main_file <- paste0(input$scrape_source, "_2026_master.parquet")
|
| 2113 |
msg1 <- upload_to_hf(scraped_data(), main_file, "Master Dataset")
|
| 2114 |
+
gc()
|
| 2115 |
|
| 2116 |
if (input$scrape_source == "pbp") {
|
| 2117 |
+
# Filter THEN upload one at a time to save memory
|
| 2118 |
coastal_pitchers <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
|
|
|
|
| 2119 |
|
| 2120 |
msg2 <- if (nrow(coastal_pitchers) > 0) {
|
| 2121 |
upload_to_hf(coastal_pitchers, "CoastalPitchers2026.parquet", "Coastal Pitchers")
|
| 2122 |
} else { "Coastal Pitchers: No matching rows" }
|
| 2123 |
|
| 2124 |
+
rm(coastal_pitchers); gc()
|
| 2125 |
+
|
| 2126 |
+
coastal_hitters <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
|
| 2127 |
+
|
| 2128 |
msg3 <- if (nrow(coastal_hitters) > 0) {
|
| 2129 |
upload_to_hf(coastal_hitters, "CoastalHitters2026.parquet", "Coastal Hitters")
|
| 2130 |
} else { "Coastal Hitters: No matching rows" }
|
| 2131 |
|
| 2132 |
+
rm(coastal_hitters); gc()
|
| 2133 |
+
|
| 2134 |
scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
|
| 2135 |
} else {
|
| 2136 |
scrape_status_msg(msg1)
|