OwenStOnge committed on
Commit
cef55c1
·
verified ·
1 Parent(s): adeb4a7

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +18 -4
app.R CHANGED
@@ -2056,7 +2056,9 @@ observeEvent(input$upload_hf_btn, {
2056
  if (httr::status_code(resp) == 200) {
2057
  tmp_dl <- tempfile(fileext = ".parquet")
2058
  writeBin(httr::content(resp, as = "raw"), tmp_dl)
2059
- arrow::read_parquet(tmp_dl)
 
 
2060
  } else { NULL }
2061
  }, error = function(e) { NULL })
2062
 
@@ -2066,6 +2068,8 @@ observeEvent(input$upload_hf_btn, {
2066
  if (existing_rows > 0) {
2067
  scrape_status_msg(paste0("Merging ", label, "..."))
2068
  combined <- bind_rows(existing, new_data)
 
 
2069
  if ("PitchUID" %in% names(combined)) {
2070
  combined <- combined %>% distinct(PitchUID, .keep_all = TRUE)
2071
  } else {
@@ -2073,6 +2077,7 @@ observeEvent(input$upload_hf_btn, {
2073
  }
2074
  } else {
2075
  combined <- new_data
 
2076
  }
2077
 
2078
  new_rows <- nrow(combined) - existing_rows
@@ -2081,9 +2086,10 @@ observeEvent(input$upload_hf_btn, {
2081
 
2082
  tmp <- tempfile(fileext = ".parquet")
2083
  arrow::write_parquet(combined, tmp)
 
2084
 
2085
  result <- tryCatch({
2086
- hf <- import("huggingface_hub")
2087
  api <- hf$HfApi()
2088
  api$upload_file(
2089
  path_or_fileobj = tmp,
@@ -2092,7 +2098,7 @@ observeEvent(input$upload_hf_btn, {
2092
  repo_type = "dataset",
2093
  token = hf_token
2094
  )
2095
- paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", nrow(combined), " total)")
2096
  }, error = function(e) {
2097
  paste0(label, " upload error: ", e$message)
2098
  })
@@ -2102,21 +2108,29 @@ observeEvent(input$upload_hf_btn, {
2102
  return(result)
2103
  }
2104
 
 
2105
  main_file <- paste0(input$scrape_source, "_2026_master.parquet")
2106
  msg1 <- upload_to_hf(scraped_data(), main_file, "Master Dataset")
 
2107
 
2108
  if (input$scrape_source == "pbp") {
 
2109
  coastal_pitchers <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
2110
- coastal_hitters <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
2111
 
2112
  msg2 <- if (nrow(coastal_pitchers) > 0) {
2113
  upload_to_hf(coastal_pitchers, "CoastalPitchers2026.parquet", "Coastal Pitchers")
2114
  } else { "Coastal Pitchers: No matching rows" }
2115
 
 
 
 
 
2116
  msg3 <- if (nrow(coastal_hitters) > 0) {
2117
  upload_to_hf(coastal_hitters, "CoastalHitters2026.parquet", "Coastal Hitters")
2118
  } else { "Coastal Hitters: No matching rows" }
2119
 
 
 
2120
  scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
2121
  } else {
2122
  scrape_status_msg(msg1)
 
2056
  if (httr::status_code(resp) == 200) {
2057
  tmp_dl <- tempfile(fileext = ".parquet")
2058
  writeBin(httr::content(resp, as = "raw"), tmp_dl)
2059
+ d <- arrow::read_parquet(tmp_dl)
2060
+ file.remove(tmp_dl)
2061
+ d
2062
  } else { NULL }
2063
  }, error = function(e) { NULL })
2064
 
 
2068
  if (existing_rows > 0) {
2069
  scrape_status_msg(paste0("Merging ", label, "..."))
2070
  combined <- bind_rows(existing, new_data)
2071
+ rm(existing); gc()
2072
+
2073
  if ("PitchUID" %in% names(combined)) {
2074
  combined <- combined %>% distinct(PitchUID, .keep_all = TRUE)
2075
  } else {
 
2077
  }
2078
  } else {
2079
  combined <- new_data
2080
+ rm(existing); gc()
2081
  }
2082
 
2083
  new_rows <- nrow(combined) - existing_rows
 
2086
 
2087
  tmp <- tempfile(fileext = ".parquet")
2088
  arrow::write_parquet(combined, tmp)
2089
+ rm(combined); gc()
2090
 
2091
  result <- tryCatch({
2092
+ hf <- reticulate::import("huggingface_hub")
2093
  api <- hf$HfApi()
2094
  api$upload_file(
2095
  path_or_fileobj = tmp,
 
2098
  repo_type = "dataset",
2099
  token = hf_token
2100
  )
2101
+ paste0(label, ": ", scraped_rows, " rows scraped, ", new_rows, " new rows added (", nrow(new_data), " + ", existing_rows, " existing = deduped)")
2102
  }, error = function(e) {
2103
  paste0(label, " upload error: ", e$message)
2104
  })
 
2108
  return(result)
2109
  }
2110
 
2111
+ # Upload main dataset first
2112
  main_file <- paste0(input$scrape_source, "_2026_master.parquet")
2113
  msg1 <- upload_to_hf(scraped_data(), main_file, "Master Dataset")
2114
+ gc()
2115
 
2116
  if (input$scrape_source == "pbp") {
2117
+ # Filter THEN upload one at a time to save memory
2118
  coastal_pitchers <- scraped_data() %>% filter(PitcherTeam == "COA_CHA")
 
2119
 
2120
  msg2 <- if (nrow(coastal_pitchers) > 0) {
2121
  upload_to_hf(coastal_pitchers, "CoastalPitchers2026.parquet", "Coastal Pitchers")
2122
  } else { "Coastal Pitchers: No matching rows" }
2123
 
2124
+ rm(coastal_pitchers); gc()
2125
+
2126
+ coastal_hitters <- scraped_data() %>% filter(BatterTeam == "COA_CHA")
2127
+
2128
  msg3 <- if (nrow(coastal_hitters) > 0) {
2129
  upload_to_hf(coastal_hitters, "CoastalHitters2026.parquet", "Coastal Hitters")
2130
  } else { "Coastal Hitters: No matching rows" }
2131
 
2132
+ rm(coastal_hitters); gc()
2133
+
2134
  scrape_status_msg(paste(msg1, msg2, msg3, sep = "\n"))
2135
  } else {
2136
  scrape_status_msg(msg1)