Spaces:
Running
Running
Update app.R
Browse files
app.R
CHANGED
|
@@ -2042,108 +2042,36 @@ observe({
|
|
| 2042 |
observeEvent(input$upload_hf_btn, {
|
| 2043 |
req(scraped_data())
|
| 2044 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2045 |
hf_token <- Sys.getenv("HF_WRITE_TOKEN")
|
| 2046 |
-
|
| 2047 |
-
|
| 2048 |
-
|
| 2049 |
-
|
| 2050 |
-
scrape_status_msg(
|
| 2051 |
-
|
| 2052 |
-
# Download tiny UID index
|
| 2053 |
-
existing_uids <- tryCatch({
|
| 2054 |
-
tmp_idx <- tempfile(fileext = ".csv.gz")
|
| 2055 |
-
resp <- httr::GET(
|
| 2056 |
-
paste0("https://huggingface.co/datasets/", repo_id, "/resolve/main/", index_file),
|
| 2057 |
-
httr::add_headers(Authorization = paste("Bearer", hf_token)),
|
| 2058 |
-
httr::write_disk(tmp_idx, overwrite = TRUE)
|
| 2059 |
-
)
|
| 2060 |
-
if (httr::status_code(resp) == 200) {
|
| 2061 |
-
d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
|
| 2062 |
-
file.remove(tmp_idx)
|
| 2063 |
-
d$PitchUID
|
| 2064 |
-
} else {
|
| 2065 |
-
file.remove(tmp_idx)
|
| 2066 |
-
character(0)
|
| 2067 |
-
}
|
| 2068 |
-
}, error = function(e) { character(0) })
|
| 2069 |
-
|
| 2070 |
-
# Filter to only new rows
|
| 2071 |
-
scraped_rows <- nrow(new_data)
|
| 2072 |
-
if (length(existing_uids) > 0 && "PitchUID" %in% names(new_data)) {
|
| 2073 |
-
new_only <- new_data %>% filter(!PitchUID %in% existing_uids)
|
| 2074 |
-
} else {
|
| 2075 |
-
new_only <- new_data
|
| 2076 |
-
}
|
| 2077 |
-
|
| 2078 |
-
if (nrow(new_only) == 0) {
|
| 2079 |
-
return(paste0(label, ": 0 new rows (all ", scraped_rows, " already exist)"))
|
| 2080 |
-
}
|
| 2081 |
-
|
| 2082 |
-
scrape_status_msg(paste0("Uploading ", nrow(new_only), " new rows for ", label, "..."))
|
| 2083 |
-
|
| 2084 |
-
hf <- reticulate::import("huggingface_hub")
|
| 2085 |
-
api <- hf$HfApi()
|
| 2086 |
-
|
| 2087 |
-
# Upload new data as timestamped parquet into folder
|
| 2088 |
-
tmp_data <- tempfile(fileext = ".parquet")
|
| 2089 |
-
arrow::write_parquet(new_only, tmp_data)
|
| 2090 |
-
|
| 2091 |
-
api$upload_file(
|
| 2092 |
-
path_or_fileobj = tmp_data,
|
| 2093 |
-
path_in_repo = paste0(folder, "/", timestamp, ".parquet"),
|
| 2094 |
-
repo_id = repo_id,
|
| 2095 |
-
repo_type = "dataset",
|
| 2096 |
-
token = hf_token
|
| 2097 |
-
)
|
| 2098 |
-
file.remove(tmp_data)
|
| 2099 |
-
|
| 2100 |
-
# Update UID index (append new UIDs)
|
| 2101 |
-
all_uids <- data.frame(PitchUID = c(existing_uids, new_only$PitchUID))
|
| 2102 |
tmp_idx <- tempfile(fileext = ".csv.gz")
|
| 2103 |
-
|
| 2104 |
-
|
| 2105 |
-
|
| 2106 |
-
|
| 2107 |
-
api$upload_file(
|
| 2108 |
-
path_or_fileobj = tmp_idx,
|
| 2109 |
-
path_in_repo = index_file,
|
| 2110 |
-
repo_id = repo_id,
|
| 2111 |
-
repo_type = "dataset",
|
| 2112 |
-
token = hf_token
|
| 2113 |
)
|
| 2114 |
-
|
| 2115 |
-
|
| 2116 |
-
rm(new_only, all_uids); gc()
|
| 2117 |
-
paste0(label, ": ", scraped_rows, " scraped, ", nrow(new_only), " new rows added (", length(existing_uids) + nrow(new_only), " total)")
|
| 2118 |
-
}
|
| 2119 |
-
|
| 2120 |
-
# Determine folder based on scrape source
|
| 2121 |
-
if (input$scrape_source == "pbp") {
|
| 2122 |
-
msg1 <- upload_to_hf(scraped_data(), "pbp", "pbp_uid_index.csv.gz", "Master Dataset")
|
| 2123 |
-
gc()
|
| 2124 |
-
|
| 2125 |
-
cp <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
|
| 2126 |
-
msg2 <- if (nrow(cp) > 0) {
|
| 2127 |
-
upload_to_hf(cp, "coastal_pitchers", "coastal_pitchers_uid_index.csv.gz", "Coastal Pitchers")
|
| 2128 |
-
} else { "Coastal Pitchers: No matching rows" }
|
| 2129 |
-
rm(cp); gc()
|
| 2130 |
|
| 2131 |
-
|
| 2132 |
-
|
| 2133 |
-
|
| 2134 |
-
|
| 2135 |
-
rm(ch); gc()
|
| 2136 |
-
|
| 2137 |
-
scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
|
| 2138 |
|
| 2139 |
-
|
| 2140 |
-
|
| 2141 |
-
scrape_status_msg(
|
| 2142 |
|
| 2143 |
-
}
|
| 2144 |
-
|
| 2145 |
-
|
| 2146 |
-
}
|
| 2147 |
})
|
| 2148 |
|
| 2149 |
}
|
|
|
|
| 2042 |
observeEvent(input$upload_hf_btn, {
  # Diagnostic handler for the upload button. It reports, step by step:
  #   1. how many rows the scraped data has and how much memory it occupies,
  #   2. whether the HF_WRITE_TOKEN environment variable is non-empty,
  #   3. whether the PBP UID index can be downloaded and parsed,
  #   4. whether the huggingface_hub Python module can be imported.
  # Nothing is uploaded here. Progress is published via scrape_status_msg(),
  # which is defined elsewhere in the file (presumably a reactive status
  # setter, so each call may replace the previous message -- TODO confirm).
  req(scraped_data())
  scraped <- scraped_data()

  scrape_status_msg("Starting upload...")
  scrape_status_msg(paste0(
    "Scraped data: ", nrow(scraped), " rows, ",
    round(as.numeric(object.size(scraped)) / 1024^2, 1), " MB in memory"
  ))

  hf_token <- Sys.getenv("HF_WRITE_TOKEN")
  scrape_status_msg(paste0("Token found: ", nchar(hf_token) > 0))

  index_url <- paste0(
    "https://huggingface.co/datasets/CoastalBaseball/2026MasterDataset",
    "/resolve/main/pbp_uid_index.csv.gz"
  )

  # Created outside tryCatch() so the `finally` clause can always clean it up,
  # including when the download or the CSV parse fails.
  tmp_idx <- tempfile(fileext = ".csv.gz")

  # Test just the index download first
  tryCatch(
    {
      scrape_status_msg("Downloading UID index...")
      resp <- httr::GET(
        index_url,
        httr::add_headers(Authorization = paste("Bearer", hf_token)),
        httr::write_disk(tmp_idx, overwrite = TRUE)
      )
      status <- httr::status_code(resp)
      scrape_status_msg(paste0("Index download status: ", status))

      # On a non-200 response the file on disk holds the error body, not the
      # index; parsing it would report a meaningless UID count and then claim
      # the test passed. Fail explicitly instead.
      if (status != 200L) {
        stop("UID index download returned HTTP ", status, call. = FALSE)
      }

      d <- read.csv(gzfile(tmp_idx), stringsAsFactors = FALSE)
      scrape_status_msg(paste0("Existing UIDs: ", nrow(d)))
      rm(d)
      gc()

      scrape_status_msg("Index test passed. Now testing Python import...")
      # Only the success of the import matters; the module object is unused.
      reticulate::import("huggingface_hub")
      scrape_status_msg("Python import succeeded")
    },
    error = function(e) {
      scrape_status_msg(paste0("FAILED: ", conditionMessage(e)))
    },
    finally = {
      # unlink() is silent when the file was never created.
      unlink(tmp_idx)
    }
  )
})
|
| 2076 |
|
| 2077 |
}
|