cjerzak committed on
Commit
a470aa8
·
verified ·
1 Parent(s): 27d3c78

Update app.R

Browse files
Files changed (1) hide show
  1. app.R +63 -661
app.R CHANGED
@@ -1,665 +1,67 @@
1
- # setwd("~/Dropbox/ImageDeconfoundAid/BrokenExperiment/ShinyApp/"); Sys.setenv(RETICULATE_PYTHON = "/Users/cjerzak/miniconda3/bin/python")
2
- # app.R — Remote Audit: Design-Based Tests of Randomization with Satellite Imagery
 
 
3
  # ==============================================================================
4
- # Performs conditional randomization tests to audit experimental integrity
5
- # using pre-treatment satellite imagery features (NDVI, Nightlight)
6
  # ==============================================================================
7
-
8
- # For Hugging Face deployment, set secrets GEE_PROJECT, GEE_EMAIL, GEE_KEY (the service account key JSON as string)
9
- options(shiny.maxRequestSize = 50 * 1024^2)
10
- options(error = NULL)
11
-
12
- suppressPackageStartupMessages({
13
- library(shiny)
14
- library(bslib)
15
- library(DT)
16
- library(shinyWidgets)
17
- library(reticulate)
18
- library(dplyr)
19
- library(xgboost)
20
- library(future)
21
- library(future.apply)
22
- library(parallel)
23
- })
24
-
25
- # ============================================================================
26
- # Helper Functions
27
- # ============================================================================
28
-
29
# Clip probabilities into the closed interval [eps, 1 - eps] so that
# downstream log() calls never see exactly 0 or 1.
.clamp01 <- function(p, eps = 1e-6) {
  lower <- eps
  upper <- 1 - eps
  pmin(pmax(p, lower), upper)
}
30
-
31
# Bernoulli log-likelihood gain of the fitted probabilities `phat` over the
# constant baseline probability `a_bar`, evaluated on the 0/1 vector `A`.
# Both probability inputs are clamped away from 0 and 1 before taking logs.
.lik_improvement <- function(A, phat, a_bar) {
  bernoulli_loglik <- function(prob) {
    prob <- .clamp01(prob)
    sum(A * log(prob) + (1 - A) * log(1 - prob))
  }
  bernoulli_loglik(phat) - bernoulli_loglik(a_bar)
}
38
-
39
# Cross-fitted (out-of-fold) XGBoost predictions of P(A = 1 | X).
#
# Arguments:
#   A      0/1 vector of length n.
#   X      matrix or data frame of features with n rows.
#   K      number of folds; fold labels are expected to be 1..K.
#   folds  optional integer vector of fold labels; drawn at random when NULL.
#   ntree  number of boosting rounds.
#   mtry   optional number of features sampled per tree; defaults to
#          floor(sqrt(p)) and is translated into `colsample_bytree`.
#
# Returns list(phat = clamped out-of-fold predictions, folds = labels used).
# Units whose fold label never matches 1..K keep NA in `phat`.
.cf_xgboost_phat <- function(A, X, K = 5, folds = NULL, ntree = 300L, mtry = NULL) {
  n <- length(A)
  if (is.null(folds)) folds <- sample(rep(seq_len(K), length.out = n))
  phat <- rep(NA_real_, n)

  # detectCores() may return NA on some platforms; fall back to one thread.
  n_threads <- parallel::detectCores()
  if (is.na(n_threads)) n_threads <- 1L

  for (k in seq_len(K)) {
    idx_te <- which(folds == k)
    # An empty fold (e.g. K > n) has nothing to predict; skip it instead of
    # handing a 0-row matrix to predict().
    if (length(idx_te) == 0L) next
    idx_tr <- which(folds != k)

    A_tr <- A[idx_tr]
    X_tr <- as.matrix(X[idx_tr, , drop = FALSE])
    X_te <- as.matrix(X[idx_te, , drop = FALSE])

    # Drop features that are constant (or non-finite) within the training fold.
    sdv <- apply(X_tr, 2, sd)
    keep <- is.finite(sdv) & (sdv > 1e-12)
    if (!any(keep)) {
      # No usable feature: fall back to the training-fold treatment rate.
      phat[idx_te] <- mean(A_tr)
      next
    }

    # Build train and test design matrices the same way so that column names
    # seen at prediction time match the ones used for fitting.
    X_mat <- model.matrix(~ . - 1, data = as.data.frame(X_tr[, keep, drop = FALSE]))
    X_te_mat <- model.matrix(~ . - 1, data = as.data.frame(X_te[, keep, drop = FALSE]))
    y_num <- as.numeric(A_tr)

    p <- ncol(X_mat)
    mtry_use <- if (is.null(mtry)) {
      max(1L, floor(sqrt(p)))
    } else {
      max(1L, min(as.integer(mtry), p))
    }
    colsample_frac <- min(1, as.numeric(mtry_use) / max(1, p))

    params <- list(
      objective = "binary:logistic",
      eval_metric = "logloss",
      eta = 0.1,
      max_depth = 6,
      subsample = 0.8,
      colsample_bytree = colsample_frac,
      nthread = n_threads
    )

    dtrain <- xgboost::xgb.DMatrix(data = X_mat, label = y_num)

    fit <- xgboost::xgb.train(
      params = params,
      data = dtrain,
      nrounds = as.integer(ntree),
      verbose = 0
    )

    phat[idx_te] <- predict(fit, X_te_mat)
  }

  list(phat = .clamp01(phat), folds = folds)
}
92
-
93
# Draw one complete-randomization assignment: an integer 0/1 vector of
# length `n` with exactly `m` ones placed uniformly at random.
.draw_assign_fixed_m <- function(n, m) {
  treated_idx <- sample.int(n, m)
  assignment <- integer(n)
  assignment[treated_idx] <- 1L
  assignment
}
98
-
99
# Randomization test of whether `A` is predictable from `X`.
#
# The test statistic is the out-of-fold log-likelihood improvement of a
# cross-fitted XGBoost model over the constant treatment rate. Its null
# distribution is built by re-drawing B assignments with the same number of
# treated units and recomputing the statistic on each draw.
#
# Arguments:
#   A              0/1 assignment (or response-indicator) vector.
#   X              feature matrix / data frame with length(A) rows.
#   K              number of cross-fitting folds (capped at the sample size).
#   B              number of null resamples.
#   seed           seed used for the fold split; note this calls set.seed().
#   label          free-text label carried through to the result.
#   xgboost_ntree  boosting rounds passed to the learner.
#   xgboost_mtry   optional per-tree feature count passed to the learner.
#
# Returns a list with T_obs, T_null, p_value, a_bar, n, treated, K, B, label
# and learner.
remote_audit_crt <- function(A, X,
                             K = 5,
                             B = 1000,
                             seed = 123,
                             label = "",
                             xgboost_ntree = 300L,
                             xgboost_mtry = NULL) {
  stopifnot(length(A) == nrow(X))

  # Keep only rows where the assignment and every feature are finite.
  X <- as.matrix(X)
  keep <- is.finite(A) & (rowSums(!is.finite(X)) == 0)
  A <- as.integer(A[keep])
  X <- X[keep, , drop = FALSE]

  n <- length(A)
  a_bar <- .clamp01(mean(A))
  m <- sum(A)

  # More folds than units would leave some folds empty.
  K <- max(1L, min(as.integer(K), n))

  set.seed(seed)
  folds <- sample(rep(seq_len(K), length.out = n))

  obs <- .cf_xgboost_phat(A, X, K = K, folds = folds,
                          ntree = xgboost_ntree, mtry = xgboost_mtry)
  T_obs <- .lik_improvement(A, obs$phat, a_bar)

  # Run the resamples in parallel and always restore the caller's plan, even
  # when a worker fails, so background sessions are not left running.
  old_plan <- plan(multisession, workers = availableCores())
  on.exit(plan(old_plan), add = TRUE)

  T_null <- future_sapply(seq_len(B), future.seed = TRUE, FUN = function(b) {
    A_b <- .draw_assign_fixed_m(n, m)
    ph <- .cf_xgboost_phat(A_b, X, K = K, folds = folds,
                           ntree = xgboost_ntree, mtry = xgboost_mtry)$phat
    .lik_improvement(A_b, ph, a_bar)
  })

  # Add-one correction keeps the Monte Carlo p-value strictly positive.
  pval <- (1 + sum(T_null >= T_obs)) / (B + 1)

  list(
    T_obs = T_obs,
    T_null = T_null,
    p_value = pval,
    a_bar = a_bar,
    n = n,
    treated = m,
    K = K,
    B = B,
    label = label,
    learner = "xgboost"
  )
}
144
-
145
- # ============================================================================
146
- # UI
147
- # ============================================================================
148
-
149
# Bootswatch theme applied to the whole page.
theme <- bs_theme(bootswatch = "flatly")

# Page layout: a sidebar holding every input control, and a main area with
# the data preview plus a results card that is shown once an audit finishes.
# Widgets are built as named pieces first and then assembled in order.
ui <- local({
  page_heading <- div(
    span("Remote Audit", style = "font-weight:700;"),
    span(" with Satellite Imagery", style = "color: #888;")
  )

  # --- Data input ----------------------------------------------------------
  source_picker <- radioButtons(
    "data_source", NULL,
    choices = c("Upload CSV" = "upload",
                "Use Example (Begum et al. 2022)" = "example"),
    selected = "example"
  )
  upload_panel <- conditionalPanel(
    "input.data_source == 'upload'",
    fileInput("file_csv", "Upload CSV", accept = ".csv")
  )

  # --- Audit configuration -------------------------------------------------
  audit_picker <- selectInput(
    "audit_type", "Audit Type",
    choices = c("Randomization" = "randomization",
                "Missingness" = "missingness"),
    selected = "randomization"
  )
  randomization_panel <- conditionalPanel(
    "input.audit_type == 'randomization'",
    selectInput("treat_col", "Treatment Column", choices = NULL),
    numericInput("control_val", "Control Value", value = 1, step = 1),
    numericInput("treat_val", "Treatment Value", value = 2, step = 1)
  )
  missingness_panel <- conditionalPanel(
    "input.audit_type == 'missingness'",
    selectInput("missing_col", "Variable to Check", choices = NULL)
  )
  lat_picker <- selectInput("lat_col", "Latitude Column", choices = NULL)
  long_picker <- selectInput("long_col", "Longitude Column", choices = NULL)
  start_year_input <- numericInput("start_year", "Start Year",
                                   value = 2010, min = 1990, max = 2026)
  end_year_input <- numericInput("end_year", "End Year",
                                 value = 2011, min = 1990, max = 2026)
  feature_picker <- checkboxGroupInput(
    "features", "Features",
    choices = c("NDVI Median" = "ndvi_median",
                "Nightlight Median" = "ntl_median"),
    selected = c("ndvi_median", "ntl_median")
  )

  # --- Test parameters -----------------------------------------------------
  folds_input <- numericInput("K", "K-Folds", value = 5, min = 2, max = 10)
  resamples_input <- numericInput("B", "Resamples", value = 1000,
                                  min = 100, max = 5000, step = 100)
  seed_input <- numericInput("seed", "Random Seed", value = 123)
  trees_input <- numericInput("ntree", "Number of Trees", value = 300,
                              min = 50, max = 1000)

  run_button <- actionButton(
    "run_audit", "Run Audit",
    class = "btn-primary btn-lg",
    icon = icon("play"),
    style = "width: 100%;"
  )
  details_link <- tags$a(
    href = "https://connorjerzak.com/linkorgs-summary/",
    target = "_blank",
    icon("circle-question"), " Technical Details"
  )

  controls <- sidebar(
    width = 360,
    h5("Data Input"),
    source_picker,
    upload_panel,
    h5("Audit Configuration"),
    audit_picker,
    randomization_panel,
    missingness_panel,
    lat_picker,
    long_picker,
    start_year_input,
    end_year_input,
    feature_picker,
    h5("Parameters"),
    folds_input,
    resamples_input,
    seed_input,
    trees_input,
    run_button,
    details_link
  )

  # --- Main area -----------------------------------------------------------
  preview_card <- card(
    card_header("Data Preview"),
    card_body(
      DTOutput("data_preview")
    )
  )
  results_panel <- conditionalPanel(
    "output.audit_complete",
    card(
      card_header("Audit Results"),
      card_body(
        uiOutput("results_summary"),
        plotOutput("audit_plot", height = "400px"),
        downloadButton("download_results", "Download Results",
                       class = "btn-success")
      )
    )
  )
  main_area <- layout_columns(
    col_widths = c(12),
    preview_card,
    results_panel
  )

  page_sidebar(
    tags$head(tags$title("Remote Audit")),
    title = page_heading,
    theme = theme,
    sidebar = controls,
    main_area
  )
})
245
-
246
- # ============================================================================
247
- # Server
248
- # ============================================================================
249
-
250
# Shiny server: loads the data, fills the column pickers, fetches missing
# satellite features from Google Earth Engine (via reticulate), runs the
# audit, and renders / exports the results.
server <- function(input, output, session) {

  # Latest successful audit result; NULL until an audit has completed.
  audit_results <- reactiveVal(NULL)

  # First column name matching `pattern` (case-insensitive), or NULL if none.
  # Indexing an empty grep() result with [1] would give NA, not NULL.
  first_match <- function(pattern, cols) {
    hits <- grep(pattern, cols, value = TRUE, ignore.case = TRUE)
    if (length(hits) > 0) hits[[1]] else NULL
  }

  # Environment variable value, or NULL when unset/empty. Sys.getenv() requires
  # `unset` to be a single string, so NULL cannot be used as the default.
  # NULL is what gets passed to Python as None.
  env_or_null <- function(name) {
    value <- Sys.getenv(name, unset = "")
    if (nzchar(value)) value else NULL
  }

  # Python helpers evaluated through reticulate. `satellite_stats` samples
  # NDVI (MODIS) and night-light (DMSP rescaled to VIIRS over their overlap)
  # summaries at the supplied points and returns a pandas DataFrame.
  gee_python_src <- "
import ee, pandas as pd, json

def _ee_init(project, email=None, key_data=None):
    # Service-account auth needs both the e-mail and the key.
    if email is None or key_data is None:
        try:
            ee.Initialize(project=project)
        except Exception:
            ee.Authenticate()
            ee.Initialize(project=project)
    else:
        json.loads(key_data)  # fail early if the secret is not valid JSON
        # key_data is passed as the raw JSON string.
        credentials = ee.ServiceAccountCredentials(email, key_data=key_data)
        ee.Initialize(credentials=credentials, project=project)

def satellite_stats(points, start, end, scale=250):
    feats = [ee.Feature(ee.Geometry.Point([float(p['lon']), float(p['lat'])]),
                        {'rowid': str(p['rowid'])}) for p in points]
    fc = ee.FeatureCollection(feats)

    def mask_modis(img):
        qa = img.select('SummaryQA')
        mask = qa.eq(0)
        return img.updateMask(mask).select('NDVI').multiply(0.0001).copyProperties(img, img.propertyNames())
    modis = (ee.ImageCollection('MODIS/061/MOD13Q1')
             .filterDate(start, end)
             .map(mask_modis))
    ndvi_mean = modis.select('NDVI').mean().rename('ndvi_mean')
    ndvi_median = modis.reduce(ee.Reducer.median()).rename('ndvi_median')
    ndvi_max = modis.select('NDVI').max().rename('ndvi_max')

    dmsp = ee.ImageCollection('NOAA/DMSP-OLS/NIGHTTIME_LIGHTS').select('stable_lights')
    viirs = ee.ImageCollection('NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG').select('avg_rad')

    dmsp_window = dmsp.filterDate(start, end)
    viirs_window = viirs.filterDate(start, end)

    overlap_start = ee.Date('2012-01-01')
    overlap_end = ee.Date('2014-12-31')
    dmsp_ov_img = dmsp.filterDate(overlap_start, overlap_end).mean()
    viirs_ov_img = viirs.filterDate(overlap_start, overlap_end).mean()

    def _buffer_feat(f):
        f = ee.Feature(f)
        return f.buffer(5000)
    fc_buffer = fc.map(_buffer_feat)
    region_geom = fc_buffer.geometry()

    dmsp_ov_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_ov_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13
    ).get('avg_rad'))

    dmsp_global_mean = ee.Number(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('stable_lights'))
    viirs_global_mean = ee.Number(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13
    ).get('avg_rad'))

    dmsp_use = ee.Algorithms.If(dmsp_ov_mean, dmsp_ov_mean, dmsp_global_mean)
    viirs_use = ee.Algorithms.If(viirs_ov_mean, viirs_ov_mean, viirs_global_mean)

    dmsp_use = ee.Number(dmsp_use)
    viirs_use = ee.Number(viirs_use)

    # DMSP -> VIIRS calibration ratio. Kept separate from the `scale`
    # argument, which is the sampling resolution used by sampleRegions below.
    calib_factor = ee.Algorithms.If(dmsp_use.gt(0), viirs_use.divide(dmsp_use), 1)
    calib_factor = ee.Number(calib_factor)

    def calib_img(img):
        return img.multiply(calib_factor).toFloat()

    dmsp_equiv = dmsp_window.map(lambda img: calib_img(img.select('stable_lights').rename('ntl')))
    viirs_prep = viirs_window.map(lambda img: img.select('avg_rad').rename('ntl').toFloat())
    ntl_window = dmsp_equiv.merge(viirs_prep)

    ntl_mean = ntl_window.mean().rename('ntl_mean')
    ntl_median = ntl_window.reduce(ee.Reducer.median()).rename('ntl_median')
    ntl_max = ntl_window.max().rename('ntl_max')

    stacked = (ndvi_mean
               .addBands([ndvi_median, ndvi_max,
                          ntl_mean, ntl_median, ntl_max]))

    samples = stacked.sampleRegions(collection=fc, properties=['rowid'], scale=scale)
    info = samples.getInfo()
    rows = []
    for f in info.get('features', []):
        p = f.get('properties', {}) or {}
        rows.append({
            'rowid': p.get('rowid'),
            'ndvi_mean': p.get('ndvi_mean'),
            'ndvi_median': p.get('ndvi_median'),
            'ndvi_max': p.get('ndvi_max'),
            'ntl_mean': p.get('ntl_mean'),
            'ntl_median': p.get('ntl_median'),
            'ntl_max': p.get('ntl_max')
        })
    return pd.DataFrame(rows)
"

  # ---- Data loading --------------------------------------------------------

  # Data frame chosen by the user: bundled example or uploaded CSV.
  # Returns NULL (after notifying the user) when loading fails.
  data_loaded <- reactive({
    if (input$data_source == "example") {
      example_path <- "./Islam2019_WithGeocodesAndSatData.Rdata"
      if (!file.exists(example_path)) {
        showNotification("Example data file not found. Please upload your own CSV.",
                         type = "error", duration = 10)
        return(NULL)
      }
      # Load into a scratch environment so a missing `data` object cannot
      # silently resolve to the `data()` function from utils.
      example_env <- new.env()
      load(example_path, envir = example_env)
      if (!exists("data", envir = example_env, inherits = FALSE)) {
        showNotification("Example data file does not contain a 'data' object.",
                         type = "error", duration = 10)
        return(NULL)
      }
      example_env$data
    } else {
      req(input$file_csv)
      tryCatch({
        read.csv(input$file_csv$datapath, stringsAsFactors = FALSE)
      }, error = function(e) {
        showNotification(paste("Error reading CSV:", e$message),
                         type = "error", duration = 10)
        NULL
      })
    }
  })

  # Populate the column pickers whenever new data arrives.
  observe({
    df <- data_loaded()
    req(df)

    cols <- names(df)

    updateSelectInput(session, "treat_col", choices = cols,
                      selected = if ("begum_treat" %in% cols) "begum_treat" else cols[1])
    updateSelectInput(session, "missing_col", choices = cols,
                      selected = cols[1])
    updateSelectInput(session, "lat_col", choices = cols,
                      selected = first_match("lat", cols))
    updateSelectInput(session, "long_col", choices = cols,
                      selected = first_match("lon|long", cols))
  })

  # Clear the pickers when switching to upload mode before a file is chosen.
  observeEvent(input$data_source, {
    if (input$data_source == "upload" && is.null(input$file_csv)) {
      updateSelectInput(session, "treat_col", choices = character(0))
      updateSelectInput(session, "missing_col", choices = character(0))
      updateSelectInput(session, "lat_col", choices = character(0))
      updateSelectInput(session, "long_col", choices = character(0))
    }
  })

  output$data_preview <- renderDT({
    df <- data_loaded()
    req(df)

    datatable(
      head(df, 100),
      options = list(pageLength = 10, scrollX = TRUE, dom = 'tip'),
      rownames = FALSE
    )
  })

  # ---- Run audit -----------------------------------------------------------

  observeEvent(input$run_audit, {
    df <- data_loaded()
    req(df)

    # Fetch any selected feature that is not already a column of the data.
    missing_feats <- setdiff(input$features, names(df))
    if (length(missing_feats) > 0) {
      showNotification("Fetching satellite features from GEE...", type = "message")

      req(input$lat_col %in% names(df), input$long_col %in% names(df))

      if (input$start_year > input$end_year) {
        showNotification("Start year must be <= end year", type = "error")
        return()
      }

      df$rowid <- seq_len(nrow(df))
      pts_all <- df %>%
        filter(is.finite(.data[[input$lat_col]]),
               is.finite(.data[[input$long_col]])) %>%
        transmute(rowid = as.character(rowid),
                  lon = .data[[input$long_col]],
                  lat = .data[[input$lat_col]])

      if (nrow(pts_all) == 0) {
        showNotification("No valid geocoordinates found", type = "error")
        return()
      }

      # Half-open date window [start, end): the end year is included in full.
      start <- sprintf("%d-01-01", as.integer(input$start_year))
      end <- sprintf("%d-01-01", as.integer(input$end_year) + 1L)

      # Any Python / Earth Engine failure is reported to the user instead of
      # propagating out of the observer and ending the session.
      sat_all <- tryCatch({
        py_run_string(gee_python_src)
        py$`_ee_init`(project = env_or_null("GEE_PROJECT"),
                      email = env_or_null("GEE_EMAIL"),
                      key_data = env_or_null("GEE_KEY"))

        # Query in batches of points.
        batch_size <- 200L
        batches <- split(seq_len(nrow(pts_all)),
                         ceiling(seq_len(nrow(pts_all)) / batch_size))
        fetched <- list()
        for (ii in batches) {
          chunk <- pts_all[ii, , drop = FALSE]
          points <- lapply(seq_len(nrow(chunk)), function(i) {
            list(
              rowid = chunk$rowid[i],
              lon = chunk$lon[i],
              lat = chunk$lat[i]
            )
          })

          df_chunk <- py$satellite_stats(points, start, end, 250L)
          if (!is.null(df_chunk) && nrow(df_chunk) > 0) {
            fetched[[length(fetched) + 1L]] <- df_chunk
          }
        }
        fetched
      }, error = function(e) {
        showNotification(paste("Failed to fetch satellite data:", conditionMessage(e)),
                         type = "error", duration = 10)
        NULL
      })
      if (is.null(sat_all)) return()

      if (length(sat_all) > 0) {
        sat_df <- bind_rows(sat_all) %>% mutate(rowid = as.integer(rowid))
        # Only bring in columns the data does not already have, so the join
        # never produces .x/.y duplicates of an existing feature.
        new_cols <- c("rowid", setdiff(names(sat_df), names(df)))
        sat_df <- sat_df[, new_cols, drop = FALSE]
        df <- left_join(df, sat_df, by = "rowid") %>% select(-rowid)
      } else {
        showNotification("Failed to fetch satellite data", type = "error")
        return()
      }

      missing_feats <- setdiff(input$features, names(df))
      if (length(missing_feats) > 0) {
        showNotification(paste("Could not fetch:", paste(missing_feats, collapse = ", ")),
                         type = "error")
        return()
      }
    }

    withProgress(message = "Running audit...", value = 0, {

      incProgress(0.2, detail = "Preparing data...")

      if (input$audit_type == "randomization") {
        req(input$treat_col)

        if (!(input$treat_col %in% names(df))) {
          showNotification("Treatment column not found", type = "error")
          return()
        }

        # Restrict to units coded as either control or treatment.
        tt <- df[[input$treat_col]]
        mask <- (tt %in% c(input$control_val, input$treat_val))

        if (sum(mask) == 0) {
          showNotification("No units match control/treatment values",
                           type = "error")
          return()
        }

        A <- ifelse(tt[mask] == input$treat_val, 1L, 0L)
        X <- as.matrix(df[mask, input$features, drop = FALSE])

        keep <- apply(X, 1, function(r) all(is.finite(r)))
        A <- A[keep]
        X <- X[keep, , drop = FALSE]

        if (length(A) < 10) {
          showNotification("Too few complete cases (need >= 10)",
                           type = "error")
          return()
        }

      } else {
        req(input$missing_col)

        if (!(input$missing_col %in% names(df))) {
          showNotification("Missing column not found", type = "error")
          return()
        }

        # R = 1 when the variable is observed, 0 when it is missing.
        R <- as.integer(!is.na(df[[input$missing_col]]))

        if (all(R == 1)) {
          showNotification(
            "No missingness detected in selected variable. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }

        if (all(R == 0)) {
          showNotification(
            "All values are missing. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }

        A <- R
        X <- as.matrix(df[, input$features, drop = FALSE])

        keep <- apply(X, 1, function(r) all(is.finite(r)))
        A <- A[keep]
        X <- X[keep, , drop = FALSE]
      }

      incProgress(0.4, detail = "Running conditional randomization test...")

      results <- tryCatch({
        remote_audit_crt(
          A = A,
          X = X,
          K = input$K,
          B = input$B,
          seed = input$seed,
          label = if (input$audit_type == "randomization") input$treat_col else input$missing_col,
          xgboost_ntree = input$ntree
        )
      }, error = function(e) {
        showNotification(paste("Audit failed:", e$message),
                         type = "error", duration = 10)
        NULL
      })

      incProgress(1.0, detail = "Complete!")

      if (!is.null(results)) {
        audit_results(results)
        showNotification("Audit complete!", type = "message", duration = 3)
      }
    })
  })

  # ---- Outputs -------------------------------------------------------------

  output$results_summary <- renderUI({
    res <- audit_results()
    req(res)

    HTML(sprintf(
      "<h4>%s Audit Results</h4>
 <p><strong>Learner:</strong> %s</p>
 <p><strong>Sample size:</strong> %d (Treated: %d, Control: %d)</p>
 <p><strong>Test statistic (T):</strong> %.4f</p>
 <p><strong>P-value:</strong> %.4f</p>
 <p><strong>Interpretation:</strong> %s</p>",
      tools::toTitleCase(input$audit_type),
      toupper(res$learner),
      res$n,
      res$treated,
      res$n - res$treated,
      res$T_obs,
      res$p_value,
      if (res$p_value < 0.05) {
        "⚠️ Assignment is MORE predictable from satellite features than expected under random assignment (p < 0.05). This suggests potential deviation from the stated randomization mechanism."
      } else {
        "✓ Assignment is NOT significantly more predictable from satellite features than expected under random assignment (p >= 0.05). No evidence of deviation detected."
      }
    ))
  })

  # Histogram of the null statistics with the observed value marked.
  output$audit_plot <- renderPlot({
    res <- audit_results()
    req(res)

    hist(res$T_null, breaks = 50,
         main = sprintf("%s Audit: %s Learner",
                        tools::toTitleCase(input$audit_type),
                        toupper(res$learner)),
         xlab = "Out-of-sample log-likelihood improvement (T)",
         ylab = "Count",
         col = "lightblue",
         border = "white")
    abline(v = res$T_obs, col = "red", lwd = 3, lty = 2)
    legend("topright",
           legend = c("Null distribution", "Observed"),
           col = c("lightblue", "red"),
           lwd = c(10, 3),
           lty = c(1, 2))
    mtext(sprintf("n=%d, treated=%d (%.1f%%), B=%d, p=%.4f",
                  res$n, res$treated, 100 * res$a_bar, res$B, res$p_value),
          side = 3, line = 0.5, cex = 0.9)
  })

  # One-row CSV summary of the latest audit.
  output$download_results <- downloadHandler(
    filename = function() {
      sprintf("remote_audit_results_%s.csv", format(Sys.time(), "%Y%m%d_%H%M%S"))
    },
    content = function(file) {
      res <- audit_results()
      req(res)

      summary_df <- data.frame(
        audit_type = input$audit_type,
        learner = res$learner,
        n = res$n,
        treated = res$treated,
        treatment_rate = res$a_bar,
        K = res$K,
        B = res$B,
        T_observed = res$T_obs,
        p_value = res$p_value,
        seed = input$seed,
        features = paste(input$features, collapse = ";")
      )

      write.csv(summary_df, file, row.names = FALSE)
    }
  )

  # Flag read by the UI's conditionalPanel; it must keep updating while the
  # results card is still hidden.
  output$audit_complete <- reactive({
    !is.null(audit_results())
  })
  outputOptions(output, "audit_complete", suspendWhenHidden = FALSE)
}
664
-
665
- shinyApp(ui, server)
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1
# Image for the Remote Audit Shiny app: R (rocker/r2u) plus a system Python
# with the Earth Engine client, served on the port given by $PORT (7860).
FROM rocker/r2u:22.04
WORKDIR /code
ARG DEBIAN_FRONTEND=noninteractive
# ==============================================================================
# System dependencies
# ==============================================================================
RUN apt-get update -y && apt-get install -y --no-install-recommends \
    wget bzip2 git unzip ca-certificates locales tzdata \
    build-essential gfortran \
    libcurl4-openssl-dev libssl-dev libxml2-dev libgit2-dev \
    libopenblas-dev liblapack-dev \
    python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*
ENV LC_ALL=C.UTF-8 LANG=C.UTF-8 HF_HUB_DISABLE_TELEMETRY=1
# ==============================================================================
# Python packages
# ==============================================================================
RUN pip3 install --no-cache-dir pandas earthengine-api
# Point reticulate at the interpreter that received the packages above, so it
# does not look for (or create) a different Python environment at runtime.
ENV RETICULATE_PYTHON=/usr/bin/python3
# ==============================================================================
# Install R packages (prioritize apt when available for speed)
# ==============================================================================
ARG APT_R_PKGS="\
    r-cran-shiny r-cran-dplyr r-cran-dt \
    r-cran-data.table r-cran-foreach r-cran-doparallel"
# apt failures are tolerated here on purpose: anything still missing is
# installed (and verified) by the CRAN step below.
RUN set -eux; \
    apt-get update -y; \
    for pkg in $APT_R_PKGS; do \
      if apt-cache show "$pkg" >/dev/null 2>&1; then \
        echo "Installing $pkg via apt ..."; \
        apt-get install -y --no-install-recommends "$pkg" || true; \
      fi; \
    done; \
    rm -rf /var/lib/apt/lists/*
# ==============================================================================
# R packages via CRAN (fallback for those not in apt)
# ==============================================================================
RUN Rscript - <<'RSCRIPT'
options(Ncpus = parallel::detectCores())
cran <- "https://cloud.r-project.org"
req <- c(
  "shiny", "dplyr", "DT", "data.table",
  "bslib", "shinyWidgets", "xgboost",
  "reticulate", "future", "future.apply"
)
installed <- rownames(installed.packages())
need <- setdiff(req, installed)
if (length(need)) {
  if (!requireNamespace("pak", quietly = TRUE)) {
    install.packages("pak", repos = "https://r-lib.github.io/p/pak/stable")
  }
  ok <- tryCatch({
    pak::pak(need)
    TRUE
  }, error = function(e) FALSE)
  if (!ok) install.packages(need, repos = cran)
}
# install.packages() only warns on failure, so check explicitly and fail the
# build rather than ship an image that cannot start the app.
still_missing <- setdiff(req, rownames(installed.packages()))
if (length(still_missing) > 0) {
  stop("Failed to install R packages: ", paste(still_missing, collapse = ", "))
}
RSCRIPT
# ==============================================================================
# Copy application files
# ==============================================================================
COPY . /code/
# ==============================================================================
# Shiny entrypoint
# ==============================================================================
EXPOSE 7860
CMD ["R", "--quiet", "-e", "port <- as.integer(Sys.getenv('PORT', '7860')); shiny::runApp('/code', host='0.0.0.0', port=port)"]