Update app.R
Browse files
app.R
CHANGED
|
@@ -1,665 +1,67 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
# ==============================================================================
|
| 4 |
-
#
|
| 5 |
-
# using pre-treatment satellite imagery features (NDVI, Nightlight)
|
| 6 |
# ==============================================================================
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
for (k in seq_len(K)) {
|
| 45 |
-
idx_te <- which(folds == k)
|
| 46 |
-
idx_tr <- which(folds != k)
|
| 47 |
-
|
| 48 |
-
A_tr <- A[idx_tr]; X_tr <- as.matrix(X[idx_tr, , drop = FALSE])
|
| 49 |
-
X_te <- as.matrix(X[idx_te, , drop = FALSE])
|
| 50 |
-
|
| 51 |
-
sdv <- apply(X_tr, 2, sd)
|
| 52 |
-
keep <- is.finite(sdv) & (sdv > 1e-12)
|
| 53 |
-
if (!any(keep)) {
|
| 54 |
-
phat[idx_te] <- mean(A_tr)
|
| 55 |
-
next
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
X_tr_k <- X_tr[, keep, drop = FALSE]
|
| 59 |
-
X_te_k <- X_te[, keep, drop = FALSE]
|
| 60 |
-
|
| 61 |
-
X_df <- as.data.frame(X_tr_k)
|
| 62 |
-
X_mat <- model.matrix(~ . -1, data = X_df)
|
| 63 |
-
y_num <- as.numeric(A_tr)
|
| 64 |
-
|
| 65 |
-
p <- ncol(X_mat)
|
| 66 |
-
mtry_use <- if (is.null(mtry)) max(1L, floor(sqrt(p))) else max(1L, min(as.integer(mtry), p))
|
| 67 |
-
colsample_frac <- min(1, as.numeric(mtry_use) / max(1, p))
|
| 68 |
-
|
| 69 |
-
params <- list(
|
| 70 |
-
objective = "binary:logistic",
|
| 71 |
-
eval_metric = "logloss",
|
| 72 |
-
eta = 0.1,
|
| 73 |
-
max_depth = 6,
|
| 74 |
-
subsample = 0.8,
|
| 75 |
-
colsample_bytree = colsample_frac,
|
| 76 |
-
nthread = parallel::detectCores()
|
| 77 |
-
)
|
| 78 |
-
|
| 79 |
-
dtrain <- xgboost::xgb.DMatrix(data = X_mat, label = y_num)
|
| 80 |
-
|
| 81 |
-
fit <- xgboost::xgb.train(
|
| 82 |
-
params = params,
|
| 83 |
-
data = dtrain,
|
| 84 |
-
nrounds = as.integer(ntree),
|
| 85 |
-
verbose = 0
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
phat[idx_te] <- predict(fit, X_te_k)
|
| 89 |
-
}
|
| 90 |
-
list(phat = .clamp01(phat), folds = folds)
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
# Draw one assignment vector with a fixed number of treated units.
#
# n: total number of units.
# m: number of units to mark as treated.
#
# Returns an integer vector of length `n` containing exactly `m` ones, placed
# at positions sampled without replacement by sample.int(), and zeros
# everywhere else.
.draw_assign_fixed_m <- function(n, m) {
  replace(rep.int(0L, n), sample.int(n, m), 1L)
}
|
| 98 |
-
|
| 99 |
-
# Resampling test of whether a binary indicator `A` is predictable from the
# feature matrix `X`.
#
# The observed statistic is .lik_improvement() applied to cross-fitted
# predictions from .cf_xgboost_phat(). The null distribution is built by
# redrawing `A` `B` times with the same number of ones
# (.draw_assign_fixed_m()) and recomputing the statistic on the same folds.
#
# Arguments:
#   A              Indicator vector (coerced to integer), one entry per row
#                  of `X`.
#   X              Matrix or data frame of features.
#   K              Number of cross-fitting folds.
#   B              Number of null resamples.
#   seed           Seed passed to set.seed() before the folds are drawn.
#   label          Free-text label stored in the result.
#   xgboost_ntree  Forwarded to .cf_xgboost_phat() as `ntree`.
#   xgboost_mtry   Forwarded to .cf_xgboost_phat() as `mtry`.
#
# Returns a list with T_obs, T_null, p_value, a_bar, n, treated, K, B, label
# and learner. The p-value is (1 + #{T_null >= T_obs}) / (B + 1).
#
# Side effects: calls set.seed(); switches the future plan to multisession for
# the duration of the call and restores the caller's previous plan on exit,
# including when an error is raised.
remote_audit_crt <- function(A, X,
                             K = 5,
                             B = 1000,
                             seed = 123,
                             label = "",
                             xgboost_ntree = 300L,
                             xgboost_mtry = NULL) {
  stopifnot(length(A) == nrow(X))

  # Keep only rows where the indicator and every feature are finite.
  X <- as.matrix(X)
  complete <- is.finite(A) & apply(X, 1, function(r) all(is.finite(r)))
  A <- as.integer(A[complete])
  X <- X[complete, , drop = FALSE]

  n <- length(A)
  if (n == 0L) {
    # Without this guard every downstream quantity is NaN/NA and the caller
    # receives an NA p-value with no explanation.
    stop("No complete cases: every row has a non-finite value in A or X.",
         call. = FALSE)
  }
  a_bar <- .clamp01(mean(A))
  m <- sum(A)

  set.seed(seed)
  folds <- sample(rep(seq_len(K), length.out = n))

  obs <- .cf_xgboost_phat(A, X, K = K, folds = folds,
                          ntree = xgboost_ntree, mtry = xgboost_mtry)
  T_obs <- .lik_improvement(A, obs$phat, a_bar)

  # plan() returns the previous plan; restoring it in on.exit() guarantees the
  # workers are released even if a resample fails.
  old_plan <- plan(multisession, workers = availableCores())
  on.exit(plan(old_plan), add = TRUE)

  # future_vapply() pins the result to a numeric vector (one value per
  # resample) instead of letting the return type depend on the input.
  T_null <- future_vapply(
    seq_len(B),
    FUN = function(b) {
      A_b <- .draw_assign_fixed_m(n, m)
      ph <- .cf_xgboost_phat(A_b, X, K = K, folds = folds,
                             ntree = xgboost_ntree, mtry = xgboost_mtry)$phat
      .lik_improvement(A_b, ph, a_bar)
    },
    FUN.VALUE = numeric(1),
    future.seed = TRUE
  )

  pval <- (1 + sum(T_null >= T_obs)) / (B + 1)

  list(
    T_obs = T_obs,
    T_null = T_null,
    p_value = pval,
    a_bar = a_bar,
    n = n,
    treated = m,
    K = K,
    B = B,
    label = label,
    learner = "xgboost"
  )
}
|
| 144 |
-
|
| 145 |
-
# ============================================================================
|
| 146 |
-
# UI
|
| 147 |
-
# ============================================================================
|
| 148 |
-
|
| 149 |
-
# Bootswatch "flatly" theme used by the page.
theme <- bs_theme(bootswatch = "flatly")

# Page definition: a sidebar with every input control, and a main column with
# the data preview plus a results card that is only shown while
# `output.audit_complete` is true. The pieces are built inside local() so no
# helper names leak into the global environment.
ui <- local({
  # Both year inputs share the same bounds.
  year_input <- function(id, label, value) {
    numericInput(id, label, value = value, min = 1990, max = 2026)
  }

  heading <- div(
    span("Remote Audit", style = "font-weight:700;"),
    span(" with Satellite Imagery", style = "color: #888;")
  )

  # ---- Data input ----
  source_picker <- radioButtons(
    "data_source", NULL,
    choices = c(
      "Upload CSV" = "upload",
      "Use Example (Begum et al. 2022)" = "example"
    ),
    selected = "example"
  )
  upload_panel <- conditionalPanel(
    "input.data_source == 'upload'",
    fileInput("file_csv", "Upload CSV", accept = ".csv")
  )

  # ---- Audit configuration ----
  audit_picker <- selectInput(
    "audit_type", "Audit Type",
    choices = c(
      "Randomization" = "randomization",
      "Missingness" = "missingness"
    ),
    selected = "randomization"
  )
  randomization_panel <- conditionalPanel(
    "input.audit_type == 'randomization'",
    selectInput("treat_col", "Treatment Column", choices = NULL),
    numericInput("control_val", "Control Value", value = 1, step = 1),
    numericInput("treat_val", "Treatment Value", value = 2, step = 1)
  )
  missingness_panel <- conditionalPanel(
    "input.audit_type == 'missingness'",
    selectInput("missing_col", "Variable to Check", choices = NULL)
  )
  feature_picker <- checkboxGroupInput(
    "features", "Features",
    choices = c(
      "NDVI Median" = "ndvi_median",
      "Nightlight Median" = "ntl_median"
    ),
    selected = c("ndvi_median", "ntl_median")
  )

  # ---- Run button and help link ----
  run_button <- actionButton(
    "run_audit", "Run Audit",
    class = "btn-primary btn-lg",
    icon = icon("play"),
    style = "width: 100%;"
  )
  help_link <- tags$a(
    href = "https://connorjerzak.com/linkorgs-summary/",
    target = "_blank",
    icon("circle-question"), " Technical Details"
  )

  controls <- sidebar(
    width = 360,
    h5("Data Input"),
    source_picker,
    upload_panel,
    h5("Audit Configuration"),
    audit_picker,
    randomization_panel,
    missingness_panel,
    selectInput("lat_col", "Latitude Column", choices = NULL),
    selectInput("long_col", "Longitude Column", choices = NULL),
    year_input("start_year", "Start Year", 2010),
    year_input("end_year", "End Year", 2011),
    feature_picker,
    h5("Parameters"),
    numericInput("K", "K-Folds", value = 5, min = 2, max = 10),
    numericInput("B", "Resamples", value = 1000, min = 100, max = 5000,
                 step = 100),
    numericInput("seed", "Random Seed", value = 123),
    numericInput("ntree", "Number of Trees", value = 300, min = 50,
                 max = 1000),
    run_button,
    help_link
  )

  # ---- Main column ----
  preview_card <- card(
    card_header("Data Preview"),
    card_body(DTOutput("data_preview"))
  )
  results_panel <- conditionalPanel(
    "output.audit_complete",
    card(
      card_header("Audit Results"),
      card_body(
        uiOutput("results_summary"),
        plotOutput("audit_plot", height = "400px"),
        downloadButton("download_results", "Download Results",
                       class = "btn-success")
      )
    )
  )

  page_sidebar(
    tags$head(tags$title("Remote Audit")),
    title = heading,
    theme = theme,
    sidebar = controls,
    layout_columns(
      col_widths = c(12),
      preview_card,
      results_panel
    )
  )
})
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
print(list.files())
|
| 257 |
-
if (file.exists("./Islam2019_WithGeocodesAndSatData.Rdata")) {
|
| 258 |
-
load("./Islam2019_WithGeocodesAndSatData.Rdata")
|
| 259 |
-
return(data)
|
| 260 |
-
} else {
|
| 261 |
-
showNotification("Example data file not found. Please upload your own CSV.",
|
| 262 |
-
type = "error", duration = 10)
|
| 263 |
-
return(NULL)
|
| 264 |
-
}
|
| 265 |
-
} else {
|
| 266 |
-
req(input$file_csv)
|
| 267 |
-
tryCatch({
|
| 268 |
-
read.csv(input$file_csv$datapath, stringsAsFactors = FALSE)
|
| 269 |
-
}, error = function(e) {
|
| 270 |
-
showNotification(paste("Error reading CSV:", e$message),
|
| 271 |
-
type = "error", duration = 10)
|
| 272 |
-
NULL
|
| 273 |
-
})
|
| 274 |
-
}
|
| 275 |
-
})
|
| 276 |
-
|
| 277 |
-
# Refresh the column pickers whenever a data set is available.
#
# The treatment picker defaults to "begum_treat" when that column exists; the
# latitude/longitude pickers default to the first column whose name matches
# "lat" / "lon" (case-insensitive), or to no explicit selection when nothing
# matches.
observe({
  df <- data_loaded()
  req(df)

  cols <- names(df)

  # First column name matching `pattern`, or NULL when there is none.
  # Indexing an empty grep() result with [1] gives NA rather than NULL, and
  # `%||%` only replaces NULL, so the fallback has to be explicit.
  first_match <- function(pattern) {
    hits <- grep(pattern, cols, value = TRUE, ignore.case = TRUE)
    if (length(hits) > 0) hits[[1]] else NULL
  }

  updateSelectInput(session, "treat_col", choices = cols,
                    selected = if ("begum_treat" %in% cols) "begum_treat" else cols[1])
  updateSelectInput(session, "missing_col", choices = cols,
                    selected = cols[1])
  updateSelectInput(session, "lat_col", choices = cols,
                    selected = first_match("lat"))
  updateSelectInput(session, "long_col", choices = cols,
                    selected = first_match("lon|long"))
})
|
| 292 |
-
|
| 293 |
-
# When the user switches to "upload" before any file has been chosen, empty
# the four column pickers so they do not keep showing stale choices.
observeEvent(input$data_source, {
  awaiting_upload <- input$data_source == "upload" && is.null(input$file_csv)
  if (awaiting_upload) {
    for (picker_id in c("treat_col", "missing_col", "lat_col", "long_col")) {
      updateSelectInput(session, picker_id, choices = character(0))
    }
  }
})
|
| 301 |
-
|
| 302 |
-
# Show the first 100 rows of the loaded data, 10 per page, with horizontal
# scrolling and without row names.
output$data_preview <- renderDT({
  loaded <- data_loaded()
  req(loaded)

  table_opts <- list(pageLength = 10, scrollX = TRUE, dom = "tip")
  datatable(head(loaded, 100), options = table_opts, rownames = FALSE)
})
|
| 312 |
-
|
| 313 |
-
# Run the audit when the "Run Audit" button is clicked.
#
# 1. If any selected feature column is absent from the loaded data, fetch
#    satellite summaries from Google Earth Engine for every row with finite
#    coordinates and join the new columns onto the data.
# 2. Build the binary indicator `A` and the feature matrix `X` for the chosen
#    audit type (treatment vs. control, or observed vs. missing).
# 3. Call remote_audit_crt() and store its result in audit_results().
#
# Every failure is reported through showNotification() and ends the handler
# early; no error is left to propagate out of the observer.
observeEvent(input$run_audit, {
  df <- data_loaded()
  req(df)

  if (length(input$features) == 0) {
    showNotification("Select at least one feature", type = "error")
    return()
  }

  missing_feats <- setdiff(input$features, names(df))
  if (length(missing_feats) > 0) {
    req(input$lat_col %in% names(df), input$long_col %in% names(df))
    req(input$start_year, input$end_year)

    if (input$start_year > input$end_year) {
      showNotification("Start year must be <= end year", type = "error")
      return()
    }

    # Sys.getenv() requires `unset` to be a single string, so read with the
    # default ("") and map unset/empty variables to NULL (Python None).
    read_env <- function(name) {
      value <- Sys.getenv(name)
      if (nzchar(value)) value else NULL
    }
    gee_project <- read_env("GEE_PROJECT")
    gee_email <- read_env("GEE_EMAIL")
    gee_key <- read_env("GEE_KEY")

    # Private join key, so a user column that happens to be called "rowid"
    # is neither overwritten nor dropped.
    df[[".audit_rowid"]] <- seq_len(nrow(df))
    pts_all <- df %>%
      filter(is.finite(!!sym(input$lat_col)),
             is.finite(!!sym(input$long_col))) %>%
      transmute(rowid = as.character(.audit_rowid),
                lon = !!sym(input$long_col),
                lat = !!sym(input$lat_col))

    if (nrow(pts_all) == 0) {
      showNotification("No valid geocoordinates found", type = "error")
      return()
    }

    showNotification("Fetching satellite features from GEE...", type = "message")

    # Date window: 1 Jan of the start year up to 1 Jan of the year after the
    # end year.
    start <- sprintf("%d-01-01", as.integer(input$start_year))
    end <- sprintf("%d-01-01", as.integer(input$end_year) + 1L)

    batch_size <- 200L
    batch_id <- ceiling(seq_len(nrow(pts_all)) / batch_size)

    # Any Python / Earth Engine error is turned into a notification instead of
    # escaping from the observer.
    sat_df <- tryCatch({
      py_run_string("
import ee, pandas as pd, json

def _ee_init(project, email=None, key_data=None):
    # No service-account settings: use default credentials and fall back to
    # the interactive flow if initialization fails.
    if email is None and key_data is None:
        try:
            ee.Initialize(project=project)
        except Exception:
            ee.Authenticate()
            ee.Initialize(project=project)
    else:
        # NOTE(review): key_data is parsed to a dict before being handed to
        # ServiceAccountCredentials -- confirm the installed earthengine-api
        # accepts a dict here rather than the raw JSON string.
        key_dict = json.loads(key_data)
        credentials = ee.ServiceAccountCredentials(email, key_data=key_dict)
        ee.Initialize(credentials=credentials, project=project)

def satellite_stats(points, start, end, scale=250):
    feats = [ee.Feature(ee.Geometry.Point([float(p['lon']), float(p['lat'])]),
                        {'rowid': str(p['rowid'])}) for p in points]
    fc = ee.FeatureCollection(feats)

    # NDVI: keep pixels whose SummaryQA is 0 and apply the 0.0001 factor.
    def mask_modis(img):
        qa = img.select('SummaryQA')
        mask = qa.eq(0)
        return img.updateMask(mask).select('NDVI').multiply(0.0001).copyProperties(img, img.propertyNames())

    modis = (ee.ImageCollection('MODIS/061/MOD13Q1')
             .filterDate(start, end)
             .map(mask_modis))
    ndvi_mean = modis.select('NDVI').mean().rename('ndvi_mean')
    ndvi_median = modis.reduce(ee.Reducer.median()).rename('ndvi_median')
    ndvi_max = modis.select('NDVI').max().rename('ndvi_max')

    dmsp = ee.ImageCollection('NOAA/DMSP-OLS/NIGHTTIME_LIGHTS').select('stable_lights')
    viirs = ee.ImageCollection('NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG').select('avg_rad')

    dmsp_window = dmsp.filterDate(start, end)
    viirs_window = viirs.filterDate(start, end)

    # Mean images over a fixed window, used to put DMSP on the VIIRS scale.
    overlap_start = ee.Date('2012-01-01')
    overlap_end = ee.Date('2014-12-31')
    dmsp_ov_img = dmsp.filterDate(overlap_start, overlap_end).mean()
    viirs_ov_img = viirs.filterDate(overlap_start, overlap_end).mean()

    def _buffer_feat(f):
        f = ee.Feature(f)
        return f.buffer(5000)

    fc_buffer = fc.map(_buffer_feat)
    region_geom = fc_buffer.geometry()

    dmsp_ov_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_ov_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('avg_rad'))

    dmsp_global_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_global_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('avg_rad'))

    # Prefer the regional means; fall back to the global ones.
    dmsp_use = ee.Number(ee.Algorithms.If(dmsp_ov_mean, dmsp_ov_mean, dmsp_global_mean))
    viirs_use = ee.Number(ee.Algorithms.If(viirs_ov_mean, viirs_ov_mean, viirs_global_mean))

    # Calibration ratio. It has its own name so it does not overwrite the
    # `scale` argument (pixel size) that sampleRegions() needs below.
    ntl_scale = ee.Number(ee.Algorithms.If(dmsp_use.gt(0), viirs_use.divide(dmsp_use), 1))

    def calib_img(img):
        return img.multiply(ntl_scale).toFloat()

    dmsp_equiv = dmsp_window.map(lambda img: calib_img(img.select('stable_lights').rename('ntl')))
    viirs_prep = viirs_window.map(lambda img: img.select('avg_rad').rename('ntl').toFloat())
    ntl_window = dmsp_equiv.merge(viirs_prep)

    ntl_mean = ntl_window.mean().rename('ntl_mean')
    ntl_median = ntl_window.reduce(ee.Reducer.median()).rename('ntl_median')
    ntl_max = ntl_window.max().rename('ntl_max')

    stacked = (ndvi_mean
               .addBands([ndvi_median, ndvi_max,
                          ntl_mean, ntl_median, ntl_max]))

    samples = stacked.sampleRegions(collection=fc, properties=['rowid'], scale=scale)
    info = samples.getInfo()
    rows = []
    for f in info.get('features', []):
        p = f.get('properties', {}) or {}
        rows.append({
            'rowid': p.get('rowid'),
            'ndvi_mean': p.get('ndvi_mean'),
            'ndvi_median': p.get('ndvi_median'),
            'ndvi_max': p.get('ndvi_max'),
            'ntl_mean': p.get('ntl_mean'),
            'ntl_median': p.get('ntl_median'),
            'ntl_max': p.get('ntl_max')
        })
    return pd.DataFrame(rows)
")

      py$`_ee_init`(project = gee_project, email = gee_email, key_data = gee_key)

      # Query in batches of `batch_size` points.
      chunks <- lapply(unname(split(pts_all, batch_id)), function(chunk) {
        points <- lapply(seq_len(nrow(chunk)), function(i) {
          list(
            rowid = chunk$rowid[i],
            lon = chunk$lon[i],
            lat = chunk$lat[i]
          )
        })
        py$satellite_stats(points, start, end, 250L)
      })
      chunks <- Filter(function(x) !is.null(x) && nrow(x) > 0, chunks)
      if (length(chunks) > 0) bind_rows(chunks) else NULL
    }, error = function(e) {
      showNotification(paste("Earth Engine request failed:", conditionMessage(e)),
                       type = "error", duration = 10)
      NULL
    })

    if (is.null(sat_df)) {
      showNotification("Failed to fetch satellite data", type = "error")
      return()
    }

    # Join only the columns the data does not already have; otherwise
    # left_join() would rename duplicates to *.x / *.y and the requested
    # feature names would disappear.
    sat_df[[".audit_rowid"]] <- as.integer(sat_df[["rowid"]])
    new_cols <- setdiff(names(sat_df), c(names(df), "rowid"))
    df <- left_join(df,
                    sat_df[, c(".audit_rowid", new_cols), drop = FALSE],
                    by = ".audit_rowid")
    df[[".audit_rowid"]] <- NULL

    missing_feats <- setdiff(input$features, names(df))
    if (length(missing_feats) > 0) {
      showNotification(paste("Could not fetch:", paste(missing_feats, collapse = ", ")), type = "error")
      return()
    }
  }

  withProgress(message = "Running audit...", value = 0, {

    incProgress(0.2, detail = "Preparing data...")

    if (input$audit_type == "randomization") {
      req(input$treat_col)

      if (!(input$treat_col %in% names(df))) {
        showNotification("Treatment column not found", type = "error")
        return()
      }

      # Keep only the units coded as control or treatment.
      tt <- df[[input$treat_col]]
      mask <- (tt %in% c(input$control_val, input$treat_val))

      if (sum(mask) == 0) {
        showNotification("No units match control/treatment values",
                         type = "error")
        return()
      }

      A <- ifelse(tt[mask] == input$treat_val, 1L, 0L)
      X <- as.matrix(df[mask, input$features, drop = FALSE])

    } else {
      req(input$missing_col)

      if (!(input$missing_col %in% names(df))) {
        showNotification("Missing column not found", type = "error")
        return()
      }

      # 1 = value observed, 0 = value missing.
      R <- as.integer(!is.na(df[[input$missing_col]]))

      if (all(R == 1)) {
        showNotification(
          "No missingness detected in selected variable. Audit cannot proceed.",
          type = "warning", duration = 10
        )
        return()
      }

      if (all(R == 0)) {
        showNotification(
          "All values are missing. Audit cannot proceed.",
          type = "warning", duration = 10
        )
        return()
      }

      A <- R
      X <- as.matrix(df[, input$features, drop = FALSE])
    }

    # Drop rows with any non-finite feature, then apply the same sanity
    # checks to both audit types.
    keep <- apply(X, 1, function(r) all(is.finite(r)))
    A <- A[keep]
    X <- X[keep, , drop = FALSE]

    if (length(A) < 10) {
      showNotification("Too few complete cases (need >= 10)",
                       type = "error")
      return()
    }

    if (length(unique(A)) < 2) {
      showNotification(
        "Only one group remains after dropping incomplete cases. Audit cannot proceed.",
        type = "error", duration = 10
      )
      return()
    }

    incProgress(0.4, detail = "Running conditional randomization test...")

    results <- tryCatch({
      remote_audit_crt(
        A = A,
        X = X,
        K = input$K,
        B = input$B,
        seed = input$seed,
        label = if (input$audit_type == "randomization") input$treat_col else input$missing_col,
        xgboost_ntree = input$ntree
      )
    }, error = function(e) {
      showNotification(paste("Audit failed:", e$message),
                       type = "error", duration = 10)
      NULL
    })

    # 0.2 + 0.4 + 0.4 brings the bar to exactly 1.
    incProgress(0.4, detail = "Complete!")

    if (!is.null(results)) {
      audit_results(results)
      showNotification("Audit complete!", type = "message", duration = 3)
    }
  })
})
|
| 583 |
-
|
| 584 |
-
# Render the textual summary of the latest audit: learner, sample sizes, test
# statistic, p-value and a one-sentence interpretation that switches at
# p < 0.05.
output$results_summary <- renderUI({
  res <- audit_results()
  req(res)

  # One HTML element per line, joined with newlines.
  template <- paste(
    "<h4>%s Audit Results</h4>",
    "<p><strong>Learner:</strong> %s</p>",
    "<p><strong>Sample size:</strong> %d (Treated: %d, Control: %d)</p>",
    "<p><strong>Test statistic (T):</strong> %.4f</p>",
    "<p><strong>P-value:</strong> %.4f</p>",
    "<p><strong>Interpretation:</strong> %s</p>",
    sep = "\n"
  )

  audit_name <- tools::toTitleCase(input$audit_type)
  n_control <- res$n - res$treated

  interpretation <- if (res$p_value < 0.05) {
    "⚠️ Assignment is MORE predictable from satellite features than expected under random assignment (p < 0.05). This suggests potential deviation from the stated randomization mechanism."
  } else {
    "✓ Assignment is NOT significantly more predictable from satellite features than expected under random assignment (p >= 0.05). No evidence of deviation detected."
  }

  HTML(sprintf(
    template,
    audit_name,
    toupper(res$learner),
    res$n,
    res$treated,
    n_control,
    res$T_obs,
    res$p_value,
    interpretation
  ))
})
|
| 609 |
-
|
| 610 |
-
# Histogram of the null statistics with the observed statistic drawn as a
# dashed red vertical line, plus a one-line run summary above the plot.
output$audit_plot <- renderPlot({
  res <- audit_results()
  req(res)

  plot_title <- sprintf(
    "%s Audit: %s Learner",
    tools::toTitleCase(input$audit_type),
    toupper(res$learner)
  )

  hist(
    res$T_null,
    breaks = 50,
    main = plot_title,
    xlab = "Out-of-sample log-likelihood improvement (T)",
    ylab = "Count",
    col = "lightblue",
    border = "white"
  )
  abline(v = res$T_obs, col = "red", lwd = 3, lty = 2)
  legend(
    "topright",
    legend = c("Null distribution", "Observed"),
    col = c("lightblue", "red"),
    lwd = c(10, 3),
    lty = c(1, 2)
  )

  run_summary <- sprintf(
    "n=%d, treated=%d (%.1f%%), B=%d, p=%.4f",
    res$n, res$treated, 100 * res$a_bar, res$B, res$p_value
  )
  mtext(run_summary, side = 3, line = 0.5, cex = 0.9)
})
|
| 632 |
-
|
| 633 |
-
# Download handler: writes a one-row CSV summarising the latest audit. The
# file name carries a timestamp of the moment the download is requested.
output$download_results <- downloadHandler(
  filename = function() {
    stamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
    sprintf("remote_audit_results_%s.csv", stamp)
  },
  content = function(file) {
    res <- audit_results()
    req(res)

    # Selected features are stored in a single cell, separated by ";".
    feature_list <- paste(input$features, collapse = ";")

    one_row <- data.frame(
      audit_type = input$audit_type,
      learner = res$learner,
      n = res$n,
      treated = res$treated,
      treatment_rate = res$a_bar,
      K = res$K,
      B = res$B,
      T_observed = res$T_obs,
      p_value = res$p_value,
      seed = input$seed,
      features = feature_list
    )

    write.csv(one_row, file, row.names = FALSE)
  }
)
|
| 658 |
-
|
| 659 |
-
# Flag read by the UI's conditionalPanel("output.audit_complete"): TRUE once
# an audit result has been stored.
output$audit_complete <- reactive(!is.null(audit_results()))

# The flag is never displayed itself, so it must keep updating while hidden.
outputOptions(output, "audit_complete", suspendWhenHidden = FALSE)
|
| 663 |
}
|
| 664 |
-
|
| 665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# syntax=docker/dockerfile:1
# Image for the Remote Audit Shiny app: R (r2u base) plus Python 3 with the
# Earth Engine client, serving the app on $PORT (default 7860).
FROM rocker/r2u:22.04
WORKDIR /code
ARG DEBIAN_FRONTEND=noninteractive
# ==============================================================================
# System dependencies
# ==============================================================================
RUN apt-get update -y && apt-get install -y --no-install-recommends \
    wget bzip2 git unzip ca-certificates locales tzdata \
    build-essential gfortran \
    libcurl4-openssl-dev libssl-dev libxml2-dev libgit2-dev \
    libopenblas-dev liblapack-dev \
    python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 HF_HUB_DISABLE_TELEMETRY=1
# ==============================================================================
# Python packages
# ==============================================================================
RUN pip3 install --no-cache-dir pandas earthengine-api
# ==============================================================================
# Install R packages (prioritize apt when available for speed)
# ==============================================================================
ARG APT_R_PKGS="\
    r-cran-shiny r-cran-dplyr r-cran-dt \
    r-cran-data.table r-cran-foreach r-cran-doparallel"
# Best effort only: a package that is unavailable or fails here is picked up
# by the CRAN step below, which verifies the final result.
RUN set -eux; \
    apt-get update -y; \
    for pkg in $APT_R_PKGS; do \
      if apt-cache show "$pkg" >/dev/null 2>&1; then \
        echo "Installing $pkg via apt ..."; \
        apt-get install -y --no-install-recommends "$pkg" || true; \
      fi; \
    done; \
    rm -rf /var/lib/apt/lists/*
# ==============================================================================
# R packages via CRAN (fallback for those not in apt)
# ==============================================================================
RUN Rscript - <<'RSCRIPT'
options(Ncpus = parallel::detectCores())
cran <- "https://cloud.r-project.org"
req <- c(
  "shiny", "dplyr", "DT", "data.table",
  "bslib", "shinyWidgets", "xgboost",
  "reticulate", "future", "future.apply"
)
installed <- rownames(installed.packages())
need <- setdiff(req, installed)
if (length(need) > 0) {
  # Try pak first; fall back to install.packages() if pak is unavailable or
  # fails.
  ok <- tryCatch({
    if (!requireNamespace("pak", quietly = TRUE)) {
      install.packages("pak", repos = "https://r-lib.github.io/p/pak/stable")
    }
    pak::pak(need)
    TRUE
  }, error = function(e) FALSE)
  if (!ok) install.packages(need, repos = cran)
}
# install.packages() only warns when a package fails to install, so check
# explicitly and fail the build instead of shipping an image that breaks at
# app start-up.
still_missing <- setdiff(req, rownames(installed.packages()))
if (length(still_missing) > 0) {
  stop("Required R packages failed to install: ",
       paste(still_missing, collapse = ", "), call. = FALSE)
}
RSCRIPT
# ==============================================================================
# Copy application files
# ==============================================================================
COPY . /code/
# ==============================================================================
# Shiny entrypoint
# ==============================================================================
EXPOSE 7860
CMD ["R", "--quiet", "-e", "port <- as.integer(Sys.getenv('PORT', '7860')); shiny::runApp('/code', host='0.0.0.0', port=port)"]
|