cjerzak committed on
Commit
2cc9e94
·
verified ·
1 Parent(s): 3777026

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +641 -42
app.R CHANGED
@@ -1,58 +1,657 @@
1
- library(shiny)
2
- library(bslib)
3
- library(dplyr)
4
- library(ggplot2)
 
5
 
6
- df <- readr::read_csv("penguins.csv")
7
- # Find subset of columns that are suitable for scatter plot
8
- df_num <- df |> select(where(is.numeric), -Year)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  ui <- page_sidebar(
11
- theme = bs_theme(bootswatch = "minty"),
12
- title = "Penguins explorer",
 
 
 
 
 
13
  sidebar = sidebar(
14
- varSelectInput("xvar", "X variable", df_num, selected = "Bill Length (mm)"),
15
- varSelectInput("yvar", "Y variable", df_num, selected = "Bill Depth (mm)"),
16
- checkboxGroupInput("species", "Filter by species",
17
- choices = unique(df$Species), selected = unique(df$Species)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  ),
19
- hr(), # Add a horizontal rule
20
- checkboxInput("by_species", "Show species", TRUE),
21
- checkboxInput("show_margins", "Show marginal plots", TRUE),
22
- checkboxInput("smooth", "Add smoother"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  ),
24
- plotOutput("scatter")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  )
26
 
 
 
 
 
27
  server <- function(input, output, session) {
28
- subsetted <- reactive({
29
- req(input$species)
30
- df |> filter(Species %in% input$species)
31
- })
32
-
33
- output$scatter <- renderPlot(
34
- {
35
- p <- ggplot(subsetted(), aes(!!input$xvar, !!input$yvar)) +
36
- theme_light() +
37
- list(
38
- theme(legend.position = "bottom"),
39
- if (input$by_species) aes(color = Species),
40
- geom_point(),
41
- if (input$smooth) geom_smooth()
42
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- if (input$show_margins) {
45
- margin_type <- if (input$by_species) "density" else "histogram"
46
- p <- p |> ggExtra::ggMarginal(
47
- type = margin_type, margins = "both",
48
- size = 8, groupColour = input$by_species, groupFill = input$by_species
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  )
 
 
 
 
 
 
 
 
 
 
 
50
  }
51
-
52
- p
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  },
54
- res = 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
 
 
 
 
 
56
  }
57
 
58
- shinyApp(ui, server)
 
1
+ # app.R — Remote Audit: Design-Based Tests of Randomization with Satellite Imagery
2
+ # ==============================================================================
3
+ # Performs conditional randomization tests to audit experimental integrity
4
+ # using pre-treatment satellite imagery features (NDVI, Nightlight)
5
+ # ==============================================================================
6
 
7
+ # For Hugging Face deployment, set secrets GEE_PROJECT, GEE_EMAIL, GEE_KEY (the service account key JSON as string)
8
+
9
# Session-wide options: accept uploads of up to 50 MiB and reset the `error`
# option to NULL (R's default error handling).
options(
  shiny.maxRequestSize = 50 * 1024^2,
  error = NULL
)
11
+
12
+ suppressPackageStartupMessages({
13
+ library(shiny)
14
+ library(bslib)
15
+ library(DT)
16
+ library(shinyWidgets)
17
+ library(reticulate)
18
+ library(dplyr)
19
+ library(xgboost)
20
+ library(future)
21
+ library(future.apply)
22
+ library(parallel)
23
+ })
24
+
25
+ # ============================================================================
26
+ # Helper Functions
27
+ # ============================================================================
28
+
29
# Clip values to the closed interval [eps, 1 - eps] so that log(p) and
# log(1 - p) stay finite. NA inputs are returned as NA.
.clamp01 <- function(p, eps = 1e-6) {
  lower <- eps
  upper <- 1 - eps
  pmin(pmax(p, lower), upper)
}
30
+
31
# Bernoulli log-likelihood gain of the predicted probabilities `phat` over the
# constant probability `a_bar`, evaluated on the 0/1 vector `A`.
# Both probability inputs are clamped away from 0 and 1 before taking logs.
# Returns a single number: loglik(phat) - loglik(a_bar).
.lik_improvement <- function(A, phat, a_bar) {
  p_model <- .clamp01(phat)
  p_null <- .clamp01(a_bar)

  loglik <- function(p) sum(A * log(p) + (1 - A) * log(1 - p))

  loglik(p_model) - loglik(p_null)
}
38
+
39
# Cross-fitted (out-of-fold) probability predictions from an xgboost
# classifier.
#
# Args:
#   A      0/1 outcome vector of length n.
#   X      feature matrix (or data frame coercible to one) with n rows.
#   K      number of folds used when `folds` is not supplied.
#   folds  optional vector of fold labels (length n). When supplied, the
#          labels actually present in it define the folds.
#   ntree  number of boosting rounds.
#   mtry   number of columns sampled per tree; defaults to floor(sqrt(p)).
#
# Returns:
#   list(phat = clamped out-of-fold predictions, folds = fold labels used).
#
# For each fold the model is trained on the remaining folds. Columns that are
# constant (or have undefined sd) in the training part are dropped; if none
# remain, the training mean of A is used as the prediction.
.cf_xgboost_phat <- function(A, X, K = 5, folds = NULL, ntree = 300L, mtry = NULL) {
  n <- length(A)
  X <- as.matrix(X)
  storage.mode(X) <- "double"
  if (is.null(folds)) folds <- sample(rep(seq_len(K), length.out = n))
  stopifnot(nrow(X) == n, length(folds) == n)

  phat <- rep(NA_real_, n)

  # availableCores() never returns NA (unlike parallel::detectCores()) and
  # respects the `mc.cores` option, which future is documented to set to 1
  # inside its workers -- this avoids nested thread oversubscription when this
  # function is called from a parallel resampling loop.
  n_threads <- max(1L, as.integer(future::availableCores()))

  # Iterate over the labels present in `folds` so that every unit receives a
  # prediction and no empty test fold is ever scored.
  for (k in sort(unique(folds))) {
    idx_te <- which(folds == k)
    idx_tr <- which(folds != k)

    y_tr <- as.numeric(A[idx_tr])
    X_tr <- X[idx_tr, , drop = FALSE]
    X_te <- X[idx_te, , drop = FALSE]

    sdv <- apply(X_tr, 2, sd)
    keep <- is.finite(sdv) & (sdv > 1e-12)
    if (!any(keep)) {
      phat[idx_te] <- mean(y_tr)
      next
    }

    # Train and test matrices are built the same way, so their columns (and
    # column names) always line up at prediction time.
    X_tr_k <- X_tr[, keep, drop = FALSE]
    X_te_k <- X_te[, keep, drop = FALSE]

    p <- ncol(X_tr_k)
    mtry_use <- if (is.null(mtry)) {
      max(1L, floor(sqrt(p)))
    } else {
      max(1L, min(as.integer(mtry), p))
    }
    colsample_frac <- min(1, as.numeric(mtry_use) / max(1, p))

    params <- list(
      objective = "binary:logistic",
      eval_metric = "logloss",
      eta = 0.1,
      max_depth = 6,
      subsample = 0.8,
      colsample_bytree = colsample_frac,
      nthread = n_threads
    )

    dtrain <- xgboost::xgb.DMatrix(data = X_tr_k, label = y_tr)

    fit <- xgboost::xgb.train(
      params = params,
      data = dtrain,
      nrounds = as.integer(ntree),
      verbose = 0
    )

    phat[idx_te] <- predict(fit, newdata = X_te_k)
  }

  list(phat = .clamp01(phat), folds = folds)
}
92
+
93
# Draw one complete-randomization assignment: an integer 0/1 vector of length
# `n` in which exactly `m` positions, chosen uniformly without replacement,
# are set to 1.
.draw_assign_fixed_m <- function(n, m) {
  treated_idx <- sample.int(n, m)
  replace(integer(n), treated_idx, 1L)
}
98
+
99
# Randomization test of whether a binary assignment `A` is more predictable
# from features `X` than under complete randomization with a fixed number of
# treated units.
#
# Args:
#   A              0/1 assignment vector.
#   X              feature matrix / data frame with one row per element of A.
#   K              number of cross-fitting folds.
#   B              number of null re-draws of the assignment.
#   seed           RNG seed (sets the global seed via set.seed()).
#   label          free-text label carried through to the result.
#   xgboost_ntree  boosting rounds for the learner.
#   xgboost_mtry   columns sampled per tree (NULL = learner default).
#
# Returns a list with the observed statistic (T_obs), the null draws (T_null),
# the p-value (1 + #{T_null >= T_obs}) / (B + 1), and run metadata.
#
# Rows with a non-finite assignment or any non-finite feature are dropped.
# Stops with an error if A is not 0/1 or fewer than K complete rows remain.
remote_audit_crt <- function(A, X,
                             K = 5,
                             B = 1000,
                             seed = 123,
                             label = "",
                             xgboost_ntree = 300L,
                             xgboost_mtry = NULL) {
  stopifnot(length(A) == nrow(X))
  X <- as.matrix(X)
  keep <- is.finite(A) & (rowSums(!is.finite(X)) == 0)
  A <- as.integer(A[keep])
  X <- X[keep, , drop = FALSE]

  n <- length(A)
  if (!all(A %in% c(0L, 1L))) {
    stop("`A` must contain only 0/1 values.", call. = FALSE)
  }
  if (n < K) {
    stop(sprintf("Need at least K = %d complete observations, found %d.",
                 as.integer(K), n), call. = FALSE)
  }

  a_bar <- .clamp01(mean(A))
  m <- sum(A)

  set.seed(seed)
  folds <- sample(rep(seq_len(K), length.out = n))

  obs <- .cf_xgboost_phat(A, X, K = K, folds = folds,
                          ntree = xgboost_ntree, mtry = xgboost_mtry)
  T_obs <- .lik_improvement(A, obs$phat, a_bar)

  # Switch to a multisession plan for the null draws and always put the
  # caller's previous plan back, even if a worker fails.
  old_plan <- plan(multisession, workers = availableCores())
  on.exit(plan(old_plan), add = TRUE)

  # future.seed = TRUE gives each draw its own parallel-safe RNG stream.
  # future_vapply guarantees a numeric vector of length B.
  T_null <- future_vapply(
    seq_len(B),
    FUN = function(b) {
      A_b <- .draw_assign_fixed_m(n, m)
      ph <- .cf_xgboost_phat(A_b, X, K = K, folds = folds,
                             ntree = xgboost_ntree, mtry = xgboost_mtry)$phat
      .lik_improvement(A_b, ph, a_bar)
    },
    FUN.VALUE = numeric(1),
    future.seed = TRUE
  )

  pval <- (1 + sum(T_null >= T_obs)) / (B + 1)

  list(
    T_obs = T_obs,
    T_null = T_null,
    p_value = pval,
    a_bar = a_bar,
    n = n,
    treated = m,
    K = K,
    B = B,
    label = label,
    learner = "xgboost"
  )
}
144
+
145
+ # ============================================================================
146
+ # UI
147
+ # ============================================================================
148
+
149
# Bootswatch theme shared by the whole page.
theme <- bs_theme(bootswatch = "flatly")

# Page layout: a sidebar holding every input control, and a main area with a
# data preview card plus a results card that is only shown once the server
# reports `output.audit_complete`.
ui <- page_sidebar(
  tags$head(tags$title("Remote Audit")),
  title = div(
    span("Remote Audit", style = "font-weight:700;"),
    span(" with Satellite Imagery", style = "color: #888;")
  ),
  theme = theme,

  sidebar = sidebar(
    width = 360,

    # ---- Data source ------------------------------------------------------
    h5("Data Input"),
    radioButtons(
      inputId = "data_source",
      label = NULL,
      choices = c(
        "Upload CSV" = "upload",
        "Use Example (Begum et al. 2022)" = "example"
      ),
      selected = "example"
    ),

    conditionalPanel(
      condition = "input.data_source == 'upload'",
      fileInput(inputId = "file_csv", label = "Upload CSV", accept = ".csv")
    ),

    # ---- What to audit ----------------------------------------------------
    h5("Audit Configuration"),

    selectInput(
      inputId = "audit_type",
      label = "Audit Type",
      choices = c(
        "Randomization" = "randomization",
        "Missingness" = "missingness"
      ),
      selected = "randomization"
    ),

    conditionalPanel(
      condition = "input.audit_type == 'randomization'",
      selectInput(inputId = "treat_col", label = "Treatment Column", choices = NULL),
      numericInput(inputId = "control_val", label = "Control Value", value = 1, step = 1),
      numericInput(inputId = "treat_val", label = "Treatment Value", value = 2, step = 1)
    ),

    conditionalPanel(
      condition = "input.audit_type == 'missingness'",
      selectInput(inputId = "missing_col", label = "Variable to Check", choices = NULL)
    ),

    # ---- Location columns and imagery window ------------------------------
    selectInput(inputId = "lat_col", label = "Latitude Column", choices = NULL),
    selectInput(inputId = "long_col", label = "Longitude Column", choices = NULL),

    numericInput(inputId = "start_year", label = "Start Year",
                 value = 2010, min = 1990, max = 2026),
    numericInput(inputId = "end_year", label = "End Year",
                 value = 2011, min = 1990, max = 2026),

    checkboxGroupInput(
      inputId = "features",
      label = "Features",
      choices = c(
        "NDVI Median" = "ndvi_median",
        "Nightlight Median" = "ntl_median"
      ),
      selected = c("ndvi_median", "ntl_median")
    ),

    # ---- Test parameters --------------------------------------------------
    h5("Parameters"),
    numericInput(inputId = "K", label = "K-Folds", value = 5, min = 2, max = 10),
    numericInput(inputId = "B", label = "Resamples",
                 value = 1000, min = 100, max = 5000, step = 100),
    numericInput(inputId = "seed", label = "Random Seed", value = 123),
    numericInput(inputId = "ntree", label = "Number of Trees",
                 value = 300, min = 50, max = 1000),

    actionButton(
      inputId = "run_audit",
      label = "Run Audit",
      class = "btn-primary btn-lg",
      icon = icon("play"),
      style = "width: 100%;"
    ),

    tags$a(
      href = "https://connorjerzak.com/linkorgs-summary/",
      target = "_blank",
      icon("circle-question"), " Technical Details"
    )
  ),

  layout_columns(
    col_widths = c(12),

    card(
      card_header("Data Preview"),
      card_body(
        DTOutput(outputId = "data_preview")
      )
    ),

    # Results stay hidden until an audit has finished.
    conditionalPanel(
      condition = "output.audit_complete",
      card(
        card_header("Audit Results"),
        card_body(
          uiOutput(outputId = "results_summary"),
          plotOutput(outputId = "audit_plot", height = "400px"),
          downloadButton(
            outputId = "download_results",
            label = "Download Results",
            class = "btn-success"
          )
        )
      )
    )
  )
)
245
 
246
+ # ============================================================================
247
+ # Server
248
+ # ============================================================================
249
+
250
# Python helpers run through reticulate. Defines:
#   _ee_init(project, email, key_data)  -- initialise the Earth Engine client
#   satellite_stats(points, start, end, scale) -- per-point NDVI / nightlight
#     summaries returned as a pandas DataFrame keyed by 'rowid'.
# The DMSP -> VIIRS calibration ratio is kept in `calib` so that it no longer
# overwrites the `scale` argument that is passed to sampleRegions().
.gee_python_src <- "
import ee, pandas as pd, json

def _ee_init(project, email=None, key_data=None):
    # Service-account login needs both the e-mail and the key; otherwise fall
    # back to default credentials.
    if email is None or key_data is None:
        try:
            ee.Initialize(project=project)
        except Exception:
            ee.Authenticate()
            ee.Initialize(project=project)
    else:
        key_dict = json.loads(key_data)
        credentials = ee.ServiceAccountCredentials(email, key_data=key_dict)
        ee.Initialize(credentials=credentials, project=project)

def satellite_stats(points, start, end, scale=250):
    feats = [ee.Feature(ee.Geometry.Point([float(p['lon']), float(p['lat'])]),
                        {'rowid': str(p['rowid'])}) for p in points]
    fc = ee.FeatureCollection(feats)

    def mask_modis(img):
        qa = img.select('SummaryQA')
        mask = qa.eq(0)
        return img.updateMask(mask).select('NDVI').multiply(0.0001).copyProperties(img, img.propertyNames())
    modis = (ee.ImageCollection('MODIS/061/MOD13Q1')
             .filterDate(start, end)
             .map(mask_modis))
    ndvi_mean = modis.select('NDVI').mean().rename('ndvi_mean')
    ndvi_median = modis.reduce(ee.Reducer.median()).rename('ndvi_median')
    ndvi_max = modis.select('NDVI').max().rename('ndvi_max')

    dmsp = ee.ImageCollection('NOAA/DMSP-OLS/NIGHTTIME_LIGHTS').select('stable_lights')
    viirs = ee.ImageCollection('NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG').select('avg_rad')

    dmsp_window = dmsp.filterDate(start, end)
    viirs_window = viirs.filterDate(start, end)

    overlap_start = ee.Date('2012-01-01')
    overlap_end = ee.Date('2014-12-31')
    dmsp_ov_img = dmsp.filterDate(overlap_start, overlap_end).mean()
    viirs_ov_img = viirs.filterDate(overlap_start, overlap_end).mean()

    def _buffer_feat(f):
        f = ee.Feature(f)
        return f.buffer(5000)
    fc_buffer = fc.map(_buffer_feat)
    region_geom = fc_buffer.geometry()

    dmsp_ov_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_ov_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('avg_rad'))

    dmsp_global_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_global_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('avg_rad'))

    dmsp_use = ee.Algorithms.If(dmsp_ov_mean, dmsp_ov_mean, dmsp_global_mean)
    viirs_use = ee.Algorithms.If(viirs_ov_mean, viirs_ov_mean, viirs_global_mean)

    dmsp_use = ee.Number(dmsp_use)
    viirs_use = ee.Number(viirs_use)

    # Multiplicative factor that puts DMSP values on the VIIRS scale.
    calib = ee.Algorithms.If(dmsp_use.gt(0), viirs_use.divide(dmsp_use), 1)
    calib = ee.Number(calib)

    def calib_img(img):
        return img.multiply(calib).toFloat()

    dmsp_equiv = dmsp_window.map(lambda img: calib_img(img.select('stable_lights').rename('ntl')))
    viirs_prep = viirs_window.map(lambda img: img.select('avg_rad').rename('ntl').toFloat())
    ntl_window = dmsp_equiv.merge(viirs_prep)

    ntl_mean = ntl_window.mean().rename('ntl_mean')
    ntl_median = ntl_window.reduce(ee.Reducer.median()).rename('ntl_median')
    ntl_max = ntl_window.max().rename('ntl_max')

    stacked = (ndvi_mean
               .addBands([ndvi_median, ndvi_max,
                          ntl_mean, ntl_median, ntl_max]))

    # `scale` here is the sampling resolution passed in by the caller.
    samples = stacked.sampleRegions(collection=fc, properties=['rowid'], scale=scale)
    info = samples.getInfo()
    rows = []
    for f in info.get('features', []):
        p = f.get('properties', {}) or {}
        rows.append({
            'rowid': p.get('rowid'),
            'ndvi_mean': p.get('ndvi_mean'),
            'ndvi_median': p.get('ndvi_median'),
            'ndvi_max': p.get('ndvi_max'),
            'ntl_mean': p.get('ntl_mean'),
            'ntl_median': p.get('ntl_median'),
            'ntl_max': p.get('ntl_max')
        })
    return pd.DataFrame(rows)
"

# Read an environment variable, returning NULL when it is unset or empty.
# (Sys.getenv() requires `unset` to be a length-one string or NA; NULL errors.)
.env_or_null <- function(name) {
  value <- Sys.getenv(name, unset = NA_character_)
  if (is.na(value) || !nzchar(value)) NULL else value
}

# Coerce a column converted from pandas to a plain numeric vector; list
# columns (e.g. all-missing values) become NA where an entry is not a scalar.
.as_numeric_column <- function(v) {
  if (!is.list(v)) {
    return(as.numeric(v))
  }
  vapply(
    v,
    function(el) if (length(el) == 1L) as.numeric(el) else NA_real_,
    numeric(1),
    USE.NAMES = FALSE
  )
}

# Fetch the `wanted` satellite features for every row of `df` that has finite
# coordinates and return `df` with those columns added (NA where no value was
# returned). Only the `wanted` columns are written, so existing columns are
# never duplicated or overwritten. Signals an error (via stop()) when no row
# has usable coordinates, when nothing comes back, or when Python fails.
.fetch_satellite_features <- function(df, lat_col, long_col,
                                      start_year, end_year, wanted,
                                      batch_size = 200L) {
  lat <- df[[lat_col]]
  lon <- df[[long_col]]
  row_ids <- which(is.finite(lat) & is.finite(lon))
  if (length(row_ids) == 0) {
    stop("No valid geocoordinates found", call. = FALSE)
  }

  if (!py_module_available("ee")) py_install("earthengine-api")
  py_run_string(.gee_python_src)
  py$`_ee_init`(
    project = .env_or_null("GEE_PROJECT"),
    email = .env_or_null("GEE_EMAIL"),
    key_data = .env_or_null("GEE_KEY")
  )

  # Half-open date window [start_year-01-01, (end_year + 1)-01-01).
  start <- sprintf("%d-01-01", as.integer(start_year))
  end <- sprintf("%d-01-01", as.integer(end_year) + 1L)

  batches <- split(row_ids, ceiling(seq_along(row_ids) / batch_size))
  sat_all <- vector("list", length(batches))

  for (b in seq_along(batches)) {
    # Unnamed list of records -> Python list of dicts.
    points <- lapply(batches[[b]], function(i) {
      list(rowid = as.character(i), lon = lon[[i]], lat = lat[[i]])
    })

    chunk <- py$satellite_stats(points, start, end, 250L)
    if (is.null(chunk) || nrow(chunk) == 0) next

    tidy <- data.frame(rowid = as.integer(unlist(chunk$rowid)))
    for (nm in intersect(wanted, names(chunk))) {
      tidy[[nm]] <- .as_numeric_column(chunk[[nm]])
    }
    sat_all[[b]] <- tidy
  }

  sat_df <- bind_rows(sat_all)
  if (nrow(sat_df) == 0) {
    stop("Failed to fetch satellite data", call. = FALSE)
  }

  # Align by original row position instead of joining on a temporary column.
  pos <- match(seq_len(nrow(df)), sat_df$rowid)
  for (nm in setdiff(names(sat_df), "rowid")) {
    df[[nm]] <- sat_df[[nm]][pos]
  }
  df
}

# Shiny server: loads data, keeps the column pickers in sync with it, runs the
# audit on demand, and renders / exports the stored result.
server <- function(input, output, session) {

  # Result of the most recent successful audit (NULL until one has run).
  audit_results <- reactiveVal(NULL)

  # Current data set: bundled example or uploaded CSV. NULL on failure.
  data_loaded <- reactive({
    if (input$data_source == "example") {
      example_path <- "Islam2019_WithGeocodesAndSatData.Rdata"
      if (!file.exists(example_path)) {
        showNotification("Example data file not found. Please upload your own CSV.",
                         type = "error", duration = 10)
        return(NULL)
      }
      # Load into a private environment so a missing `data` object cannot
      # silently resolve to utils::data().
      env <- new.env(parent = emptyenv())
      load(example_path, envir = env)
      if (!is.data.frame(env$data)) {
        showNotification("Example data file does not contain a data frame named 'data'.",
                         type = "error", duration = 10)
        return(NULL)
      }
      env$data
    } else {
      req(input$file_csv)
      tryCatch({
        read.csv(input$file_csv$datapath, stringsAsFactors = FALSE)
      }, error = function(e) {
        showNotification(paste("Error reading CSV:", e$message),
                         type = "error", duration = 10)
        NULL
      })
    }
  })

  # Refresh the column pickers whenever the data changes.
  observe({
    df <- data_loaded()
    req(df)

    cols <- names(df)

    # First column whose name matches `pattern`, or NULL when none does.
    first_match <- function(pattern) {
      hits <- grep(pattern, cols, value = TRUE, ignore.case = TRUE)
      if (length(hits) > 0) hits[[1]] else NULL
    }

    updateSelectInput(session, "treat_col", choices = cols,
                      selected = if ("begum_treat" %in% cols) "begum_treat" else cols[1])
    updateSelectInput(session, "missing_col", choices = cols,
                      selected = cols[1])
    updateSelectInput(session, "lat_col", choices = cols,
                      selected = first_match("lat"))
    updateSelectInput(session, "long_col", choices = cols,
                      selected = first_match("lon|long"))
  })

  output$data_preview <- renderDT({
    df <- data_loaded()
    req(df)

    datatable(
      head(df, 100),
      options = list(pageLength = 10, scrollX = TRUE, dom = 'tip'),
      rownames = FALSE
    )
  })

  observeEvent(input$run_audit, {
    df <- data_loaded()
    req(df)

    # Snapshot the settings used for this run so that later edits to the
    # inputs cannot mislabel the stored result.
    audit_type <- input$audit_type
    features <- input$features
    seed <- input$seed

    if (length(features) == 0) {
      showNotification("Select at least one feature", type = "error")
      return()
    }

    missing_feats <- setdiff(features, names(df))
    if (length(missing_feats) > 0) {
      showNotification("Fetching satellite features from GEE...", type = "message")

      req(input$lat_col %in% names(df), input$long_col %in% names(df))
      req(is.finite(input$start_year), is.finite(input$end_year))

      if (input$start_year > input$end_year) {
        showNotification("Start year must be <= end year", type = "error")
        return()
      }

      # An uncaught error inside an observer would end the session, so any
      # fetch failure (including Python errors) is reported instead.
      fetched <- tryCatch(
        .fetch_satellite_features(
          df,
          lat_col = input$lat_col,
          long_col = input$long_col,
          start_year = input$start_year,
          end_year = input$end_year,
          wanted = missing_feats
        ),
        error = function(e) {
          showNotification(conditionMessage(e), type = "error")
          NULL
        }
      )
      if (is.null(fetched)) {
        return()
      }
      df <- fetched

      missing_feats <- setdiff(features, names(df))
      if (length(missing_feats) > 0) {
        showNotification(paste("Could not fetch:", paste(missing_feats, collapse = ", ")), type = "error")
        return()
      }
    }

    withProgress(message = "Running audit...", value = 0, {

      incProgress(0.2, detail = "Preparing data...")

      if (audit_type == "randomization") {
        req(input$treat_col)

        if (!(input$treat_col %in% names(df))) {
          showNotification("Treatment column not found", type = "error")
          return()
        }

        tt <- df[[input$treat_col]]
        mask <- (tt %in% c(input$control_val, input$treat_val))

        if (sum(mask) == 0) {
          showNotification("No units match control/treatment values",
                           type = "error")
          return()
        }

        # %in% never yields NA, unlike `==`.
        A <- as.integer(tt[mask] %in% input$treat_val)
        X <- as.matrix(df[mask, features, drop = FALSE])
        label <- input$treat_col

      } else {
        req(input$missing_col)

        if (!(input$missing_col %in% names(df))) {
          showNotification("Missing column not found", type = "error")
          return()
        }

        R <- as.integer(!is.na(df[[input$missing_col]]))

        if (all(R == 1)) {
          showNotification(
            "No missingness detected in selected variable. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }

        if (all(R == 0)) {
          showNotification(
            "All values are missing. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }

        A <- R
        X <- as.matrix(df[, features, drop = FALSE])
        label <- input$missing_col
      }

      # Complete-case filtering and sanity checks shared by both audit types.
      keep <- apply(X, 1, function(r) all(is.finite(r)))
      A <- A[keep]
      X <- X[keep, , drop = FALSE]

      if (length(A) < 10) {
        showNotification("Too few complete cases (need >= 10)",
                         type = "error")
        return()
      }

      if (length(unique(A)) < 2) {
        showNotification(
          "Only one group remains after dropping incomplete cases. Audit cannot proceed.",
          type = "error"
        )
        return()
      }

      incProgress(0.4, detail = "Running conditional randomization test...")

      results <- tryCatch({
        remote_audit_crt(
          A = A,
          X = X,
          K = input$K,
          B = input$B,
          seed = seed,
          label = label,
          xgboost_ntree = input$ntree
        )
      }, error = function(e) {
        showNotification(paste("Audit failed:", e$message),
                         type = "error", duration = 10)
        NULL
      })

      incProgress(1.0, detail = "Complete!")

      if (!is.null(results)) {
        results$audit_type <- audit_type
        results$seed <- seed
        results$features <- features
        audit_results(results)
        showNotification("Audit complete!", type = "message", duration = 3)
      }
    })
  })

  output$results_summary <- renderUI({
    res <- audit_results()
    req(res)

    template <- paste(
      "<h4>%s Audit Results</h4>",
      "<p><strong>Learner:</strong> %s</p>",
      "<p><strong>Sample size:</strong> %d (Treated: %d, Control: %d)</p>",
      "<p><strong>Test statistic (T):</strong> %.4f</p>",
      "<p><strong>P-value:</strong> %.4f</p>",
      "<p><strong>Interpretation:</strong> %s</p>",
      sep = "\n"
    )

    interpretation <- if (res$p_value < 0.05) {
      "⚠️ Assignment is MORE predictable from satellite features than expected under random assignment (p < 0.05). This suggests potential deviation from the stated randomization mechanism."
    } else {
      "✓ Assignment is NOT significantly more predictable from satellite features than expected under random assignment (p >= 0.05). No evidence of deviation detected."
    }

    HTML(sprintf(
      template,
      tools::toTitleCase(res$audit_type),
      toupper(res$learner),
      as.integer(res$n),
      as.integer(res$treated),
      as.integer(res$n - res$treated),
      res$T_obs,
      res$p_value,
      interpretation
    ))
  })

  output$audit_plot <- renderPlot({
    res <- audit_results()
    req(res)

    hist(res$T_null, breaks = 50,
         main = sprintf("%s Audit: %s Learner",
                        tools::toTitleCase(res$audit_type),
                        toupper(res$learner)),
         xlab = "Out-of-sample log-likelihood improvement (T)",
         ylab = "Count",
         col = "lightblue",
         border = "white")
    abline(v = res$T_obs, col = "red", lwd = 3, lty = 2)
    legend("topright",
           legend = c("Null distribution", "Observed"),
           col = c("lightblue", "red"),
           lwd = c(10, 3),
           lty = c(1, 2))
    mtext(sprintf("n=%d, treated=%d (%.1f%%), B=%d, p=%.4f",
                  as.integer(res$n), as.integer(res$treated),
                  100 * res$a_bar, as.integer(res$B), res$p_value),
          side = 3, line = 0.5, cex = 0.9)
  })

  output$download_results <- downloadHandler(
    filename = function() {
      sprintf("remote_audit_results_%s.csv", format(Sys.time(), "%Y%m%d_%H%M%S"))
    },
    content = function(file) {
      res <- audit_results()
      req(res)

      # Every field comes from the stored result, i.e. the settings that were
      # in effect when the audit ran.
      summary_df <- data.frame(
        audit_type = res$audit_type,
        learner = res$learner,
        n = res$n,
        treated = res$treated,
        treatment_rate = res$a_bar,
        K = res$K,
        B = res$B,
        T_observed = res$T_obs,
        p_value = res$p_value,
        seed = res$seed,
        features = paste(res$features, collapse = ";")
      )

      write.csv(summary_df, file, row.names = FALSE)
    }
  )

  # Flag read by the UI's conditionalPanel; must keep updating while hidden.
  output$audit_complete <- reactive({
    !is.null(audit_results())
  })
  outputOptions(output, "audit_complete", suspendWhenHidden = FALSE)
}
656
 
657
+ shinyApp(ui, server)