Spaces:

cjerzak
/

ra

Running

File size: 23,132 Bytes

# setwd("~/Dropbox/ImageDeconfoundAid/BrokenExperiment/ShinyApp/"); Sys.setenv(RETICULATE_PYTHON = "/Users/cjerzak/miniconda3/bin/python")
# app.R — Remote Audit: Design-Based Tests of Randomization with Satellite Imagery
# ==============================================================================
# Performs conditional randomization tests to audit experimental integrity
# using pre-treatment satellite imagery features (NDVI, Nightlight)
# ==============================================================================

# For Hugging Face deployment, set secrets GEE_PROJECT, GEE_EMAIL, GEE_KEY (the service account key JSON as string)
options(shiny.maxRequestSize = 50 * 1024^2)
options(error = NULL)

suppressPackageStartupMessages({
  library(shiny)
  library(bslib)
  library(DT)
  library(shinyWidgets)
  library(reticulate)
  library(dplyr)
  library(xgboost)
  library(future)
  library(future.apply)
  library(parallel)
})

# ============================================================================
# Helper Functions
# ============================================================================

.clamp01 <- function(p, eps = 1e-6) pmin(pmax(p, eps), 1 - eps)

.lik_improvement <- function(A, phat, a_bar) {
  phat <- .clamp01(phat)
  a_bar <- .clamp01(a_bar)
  L <- sum(A * log(phat) + (1 - A) * log(1 - phat))
  L0 <- sum(A * log(a_bar) + (1 - A) * log(1 - a_bar))
  L - L0
}

.cf_xgboost_phat <- function(A, X, K = 5, folds = NULL, ntree = 300L, mtry = NULL) {
  n <- length(A)
  if (is.null(folds)) folds <- sample(rep(seq_len(K), length.out = n))
  phat <- rep(NA_real_, n)
  
  for (k in seq_len(K)) {
    idx_te <- which(folds == k)
    idx_tr <- which(folds != k)
    
    A_tr <- A[idx_tr]; X_tr <- as.matrix(X[idx_tr, , drop = FALSE])
    X_te <- as.matrix(X[idx_te, , drop = FALSE])
    
    sdv <- apply(X_tr, 2, sd)
    keep <- is.finite(sdv) & (sdv > 1e-12)
    if (!any(keep)) {
      phat[idx_te] <- mean(A_tr)
      next
    }
    
    X_tr_k <- X_tr[, keep, drop = FALSE]
    X_te_k <- X_te[, keep, drop = FALSE]
    
    X_df <- as.data.frame(X_tr_k)
    X_mat <- model.matrix(~ . -1, data = X_df)
    y_num <- as.numeric(A_tr)
    
    p <- ncol(X_mat)
    mtry_use <- if (is.null(mtry)) max(1L, floor(sqrt(p))) else max(1L, min(as.integer(mtry), p))
    colsample_frac <- min(1, as.numeric(mtry_use) / max(1, p))
    
    params <- list(
      objective = "binary:logistic",
      eval_metric = "logloss",
      eta = 0.1,
      max_depth = 6,
      subsample = 0.8,
      colsample_bytree = colsample_frac,
      nthread = parallel::detectCores()
    )
    
    dtrain <- xgboost::xgb.DMatrix(data = X_mat, label = y_num)
    
    fit <- xgboost::xgb.train(
      params = params,
      data = dtrain,
      nrounds = as.integer(ntree),
      verbose = 0
    )
    
    phat[idx_te] <- predict(fit, X_te_k)
  }
  list(phat = .clamp01(phat), folds = folds)
}

.draw_assign_fixed_m <- function(n, m) {
  A <- rep.int(0L, n)
  A[sample.int(n, m)] <- 1L
  A
}

remote_audit_crt <- function(A, X,
                             K = 5,
                             B = 1000,
                             seed = 123,
                             label = "",
                             xgboost_ntree = 300L,
                             xgboost_mtry = NULL) {
  stopifnot(length(A) == nrow(X))
  keep <- is.finite(A) & apply(as.matrix(X), 1, function(r) all(is.finite(r)))
  A <- as.integer(A[keep])
  X <- as.matrix(X[keep, , drop = FALSE])
  
  n <- length(A)
  a_bar <- .clamp01(mean(A))
  m <- sum(A)
  
  set.seed(seed)
  folds <- sample(rep(seq_len(K), length.out = n))
  
  obs <- .cf_xgboost_phat(A, X, K = K, folds = folds, ntree = xgboost_ntree, mtry = xgboost_mtry)
  T_obs <- .lik_improvement(A, obs$phat, a_bar)
  
  plan(multisession(workers = availableCores()))
  T_null <- future_sapply(seq_len(B), future.seed = TRUE, FUN = function(b) {
    A_b <- .draw_assign_fixed_m(n, m)
    ph <- .cf_xgboost_phat(A_b, X, K = K, folds = folds, ntree = xgboost_ntree, mtry = xgboost_mtry)$phat
    .lik_improvement(A_b, ph, a_bar)
  })
  plan(sequential)
  
  pval <- (1 + sum(T_null >= T_obs)) / (B + 1)
  
  list(
    T_obs = T_obs,
    T_null = T_null,
    p_value = pval,
    a_bar = a_bar,
    n = n,
    treated = m,
    K = K,
    B = B,
    label = label,
    learner = "xgboost"
  )
}

# ============================================================================
# UI
# ============================================================================

theme <- bs_theme(bootswatch = "flatly")

ui <- page_sidebar(
  tags$head(tags$title("Remote Audit")),
  title = div(
    span("Remote Audit", style = "font-weight:700;"),
    span(" with Satellite Imagery", style = "color: #888;")
  ),
  theme = theme,
  
  sidebar = sidebar(
    width = 360,
    
    h5("Data Input"),
    radioButtons("data_source", NULL,
                 choices = c("Upload CSV" = "upload",
                             "Use Example (Begum et al. 2022)" = "example"),
                 selected = "example"),
    
    conditionalPanel(
      "input.data_source == 'upload'",
      fileInput("file_csv", "Upload CSV", accept = ".csv")
    ),
    
    h5("Audit Configuration"),
    
    selectInput("audit_type", "Audit Type",
                choices = c("Randomization" = "randomization",
                            "Missingness" = "missingness"),
                selected = "randomization"),
    
    conditionalPanel(
      "input.audit_type == 'randomization'",
      selectInput("treat_col", "Treatment Column", choices = NULL),
      numericInput("control_val", "Control Value", value = 1, step = 1),
      numericInput("treat_val", "Treatment Value", value = 2, step = 1)
    ),
    
    conditionalPanel(
      "input.audit_type == 'missingness'",
      selectInput("missing_col", "Variable to Check", choices = NULL)
    ),
    
    selectInput("lat_col", "Latitude Column", choices = NULL),
    selectInput("long_col", "Longitude Column", choices = NULL),
    
    numericInput("start_year", "Start Year", value = 2010, min = 1990, max = 2026),
    numericInput("end_year", "End Year", value = 2011, min = 1990, max = 2026),
    
    checkboxGroupInput("features", "Features",
                       choices = c("NDVI Median" = "ndvi_median",
                                   "Nightlight Median" = "ntl_median"),
                       selected = c("ndvi_median", "ntl_median")),
    
    h5("Parameters"),
    numericInput("K", "K-Folds", value = 10, min = 2, max = 20),
    numericInput("B", "Resamples", value = 1000, min = 100, max = 5000, step = 100),
    numericInput("seed", "Random Seed", value = 987),
    numericInput("ntree", "Number of Trees", value = 300, min = 50, max = 1000),
    
    actionButton("run_audit", "Run Audit",
                 class = "btn-primary btn-lg",
                 icon = icon("play"),
                 style = "width: 100%;"),
    
    tags$a(
      href = "https://connorjerzak.com/linkorgs-summary/",
      target = "_blank",
      icon("circle-question"), " Technical Details"
    )
  ),
  
  layout_columns(
    col_widths = c(12),
    
    card(
      card_header("Data Preview"),
      card_body(
        DTOutput("data_preview")
      )
    ),
    
    conditionalPanel(
      "output.audit_complete",
      card(
        card_header("Audit Results"),
        card_body(
          uiOutput("results_summary"),
          plotOutput("audit_plot", height = "400px"),
          downloadButton("download_results", "Download Results",
                         class = "btn-success")
        )
      )
    )
  )
)

# ============================================================================
# Server
# ============================================================================

server <- function(input, output, session) {
  
  audit_results <- reactiveVal(NULL)
  
  data_loaded <- reactive({ 
    if (input$data_source == "example") {
      print(list.files())
      {
        load("./Islam2019_WithGeocodesAndSatData.Rdata")
        return(data)
      } 
    } else {
      req(input$file_csv)
      tryCatch({
        read.csv(input$file_csv$datapath, stringsAsFactors = FALSE)
      }, error = function(e) {
        showNotification(paste("Error reading CSV:", e$message),
                         type = "error", duration = 10)
        NULL
      })
    }
  })
  
  observe({
    df <- data_loaded()
    req(df)
    
    cols <- names(df)
    
    updateSelectInput(session, "treat_col", choices = cols,
                      selected = if ("begum_treat" %in% cols) "begum_treat" else cols[1])
    updateSelectInput(session, "missing_col", choices = cols,
                      selected = cols[1])
    updateSelectInput(session, "lat_col", choices = cols,
                      selected = grep("lat", cols, value = TRUE, ignore.case = TRUE)[1] %||% NULL)
    updateSelectInput(session, "long_col", choices = cols,
                      selected = grep("lon|long", cols, value = TRUE, ignore.case = TRUE)[1] %||% NULL)
  })
  
  observeEvent(input$data_source, {
    if (input$data_source == "upload" && is.null(input$file_csv)) {
      updateSelectInput(session, "treat_col", choices = character(0))
      updateSelectInput(session, "missing_col", choices = character(0))
      updateSelectInput(session, "lat_col", choices = character(0))
      updateSelectInput(session, "long_col", choices = character(0))
    }
  })
  
  output$data_preview <- renderDT({
    df <- data_loaded()
    req(df)
    
    datatable(
      head(df, 100),
      options = list(pageLength = 10, scrollX = TRUE, dom = 'tip'),
      rownames = FALSE
    )
  })
  
  observeEvent(input$run_audit, {
    df <- data_loaded()
    req(df)
    
    missing_feats <- setdiff(input$features, names(df))
    if (length(missing_feats) > 0) {
      showNotification("Fetching satellite features from GEE...", type = "message")
      
      req(input$lat_col %in% names(df), input$long_col %in% names(df))
      
      if (input$start_year > input$end_year) {
        showNotification("Start year must be <= end year", type = "error")
        return()
      }
      
      gee_project <- Sys.getenv("GEE_PROJECT")
      gee_email <- Sys.getenv("GEE_EMAIL")
      gee_key <- Sys.getenv("GEE_KEY")
      
      {
        py_run_string("
import ee
import pandas as pd
import json
import os

def _ee_init(project, email=None, key_data=None):
    # Normalize empty strings to None
    project = project or None
    email = (email or None) if email else None
    key_data = (key_data or None) if key_data else None

    # Prefer service account if provided; otherwise try ADC (non-interactive)
    if key_data:
        # key_data must be a JSON string
        key_json = key_data if isinstance(key_data, str) else json.dumps(key_data)
        credentials = ee.ServiceAccountCredentials(email, key_data=key_json)
        ee.Initialize(credentials=credentials, project=project)
    else:
        try:
            ee.Initialize(project=project)
        except Exception as e:
            raise RuntimeError(
                'No service-account key provided and ADC not available. '
                'Set GOOGLE_APPLICATION_CREDENTIALS or provide GEE_EMAIL+GEE_KEY.'
            ) from e

def satellite_stats(points, start, end, sample_scale=250):
    # Build FeatureCollection from input points
    feats = [ee.Feature(ee.Geometry.Point([float(p['lon']), float(p['lat'])]),
                        {'rowid': str(p['rowid'])}) for p in points]
    fc = ee.FeatureCollection(feats)

    # MODIS NDVI (scaled by 0.0001), mask to SummaryQA == 0
    def mask_modis(img):
        qa = img.select('SummaryQA')
        return (img.updateMask(qa.eq(0))
                   .select('NDVI').multiply(0.0001)
                   .copyProperties(img, img.propertyNames()))

    modis = (ee.ImageCollection('MODIS/061/MOD13Q1')
             .filterDate(start, end)
             .map(mask_modis))

    ndvi_mean   = modis.select('NDVI').mean().rename('ndvi_mean')
    ndvi_median = modis.select('NDVI').median().rename('ndvi_median')
    ndvi_max    = modis.select('NDVI').max().rename('ndvi_max')

    # Night lights: DMSP (pre-2014) and VIIRS (2012+)
    dmsp  = ee.ImageCollection('NOAA/DMSP-OLS/NIGHTTIME_LIGHTS').select('stable_lights')
    #viirs = ee.ImageCollection('NOAA/VIIRS/DNB/MONTHLY_V1/VCMSLCFG').select('avg_rad') # old
    viirs = ee.ImageCollection('NOAA/VIIRS/DNB/MONTHLY_V1/VCMCFG').select('avg_rad') # new 

    dmsp_window  = dmsp.filterDate(start, end)
    viirs_window = viirs.filterDate(start, end)

    # Overlap window for simple cross-calibration (DMSP↔VIIRS)
    #overlap_start = ee.Date('2012-01-01') # old
    overlap_start = ee.Date('2012-04-01') # new 
    overlap_end   = ee.Date('2013-12-31')  # DMSP coverage effectively ends 2013

    dmsp_ov_img  = dmsp.filterDate(overlap_start, overlap_end).mean()
    viirs_ov_img = viirs.filterDate(overlap_start, overlap_end).mean()

    # Buffer features to form a region-of-interest for overlap means
    fc_buffer   = fc.map(lambda f: ee.Feature(f).buffer(5000))
    region_geom = fc_buffer.geometry()

    # Null-safe reducers via dictionaries + contains()
    dmsp_ov_dict = ee.Dictionary(dmsp_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13))
    viirs_ov_dict = ee.Dictionary(viirs_ov_img.reduceRegion(
        reducer=ee.Reducer.mean(), geometry=region_geom, scale=5000, maxPixels=1e13))

    dmsp_global_mean = ee.Number(ee.Image(dmsp_ov_img).reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13).get('stable_lights'))
    viirs_global_mean = ee.Number(ee.Image(viirs_ov_img).reduceRegion(
        reducer=ee.Reducer.mean(), geometry=ee.Geometry.Rectangle([-180, -90, 180, 90]),
        scale=50000, maxPixels=1e13).get('avg_rad'))

    dmsp_has  = dmsp_ov_dict.contains('stable_lights')
    viirs_has = viirs_ov_dict.contains('avg_rad')

    dmsp_use  = ee.Number(ee.Algorithms.If(dmsp_has,  dmsp_ov_dict.get('stable_lights'), dmsp_global_mean))
    viirs_use = ee.Number(ee.Algorithms.If(viirs_has, viirs_ov_dict.get('avg_rad'),        viirs_global_mean))

    # Guard divide-by-zero; produce a VIIRS-per-DMSP scale factor (ratio)
    k_viirs_per_dmsp = ee.Number(ee.Algorithms.If(dmsp_use.gt(0), viirs_use.divide(dmsp_use), 1))

    # Build a merged NTL series in VIIRS-equivalent units
    dmsp_equiv = dmsp_window.map(
        lambda img: img.select('stable_lights').multiply(k_viirs_per_dmsp).rename('ntl').toFloat()
    )
    viirs_prep = viirs_window.map(
        lambda img: img.select('avg_rad').rename('ntl').toFloat()
    )
    ntl_window = dmsp_equiv.merge(viirs_prep)

    ntl_mean   = ntl_window.mean().rename('ntl_mean')
    ntl_median = ntl_window.median().rename('ntl_median')
    ntl_max    = ntl_window.max().rename('ntl_max')

    # Stack all bands
    stacked = (ndvi_mean
               .addBands([ndvi_median, ndvi_max,
                          ntl_mean, ntl_median, ntl_max]))

    # IMPORTANT: use the intended pixel size (meters) for sampling
    samples = stacked.sampleRegions(collection=fc, properties=['rowid'], scale=sample_scale)

    # Bring results client-side
    info = samples.getInfo()
    rows = []
    for f in info.get('features', []):
        p = f.get('properties', {}) or {}
        rows.append({
            'rowid':       p.get('rowid'),
            'ndvi_mean':   p.get('ndvi_mean'),
            'ndvi_median': p.get('ndvi_median'),
            'ndvi_max':    p.get('ndvi_max'),
            'ntl_mean':    p.get('ntl_mean'),
            'ntl_median':  p.get('ntl_median'),
            'ntl_max':     p.get('ntl_max')
        })
    return pd.DataFrame(rows)
")
      }

      py$`_ee_init`(project = gee_project, email = gee_email, key_data = gee_key)
      
      df$rowid <- seq_len(nrow(df))
      pts_all <- df %>%
        filter(is.finite(!!sym(input$lat_col)), is.finite(!!sym(input$long_col))) %>%
        transmute(rowid = as.character(rowid),
                  lon = !!sym(input$long_col),
                  lat = !!sym(input$lat_col))
      
      if (nrow(pts_all) == 0) {
        showNotification("No valid geocoordinates found", type = "error")
        return()
      }
      
      start <- sprintf("%d-01-01", input$start_year)
      end <- sprintf("%d-01-01", input$end_year + 1)
      
      batch_size <- 200L
      idx <- split(seq_len(nrow(pts_all)), ceiling(seq_len(nrow(pts_all)) / batch_size))
      sat_all <- list()
      
      for (ii in idx) {
        chunk <- pts_all[ii, , drop = FALSE]
        points <- lapply(seq_len(nrow(chunk)), function(i) {
          list(
            rowid = chunk$rowid[i],
            lon = chunk$lon[i],
            lat = chunk$lat[i]
          )
        })
        
        df_chunk <- py$satellite_stats(points, start, end, as.integer(250))
        if (!is.null(df_chunk) && nrow(df_chunk) > 0) {
          sat_all[[length(sat_all) + 1L]] <- df_chunk
        }
      }
      
      if (length(sat_all) > 0) {
        sat_df <- bind_rows(sat_all) %>% mutate(rowid = as.integer(rowid))
        df <- left_join(df, sat_df, by = "rowid") %>% select(-rowid)
      } else {
        showNotification("Failed to fetch satellite data", type = "error")
        return()
      }
      
      missing_feats <- setdiff(input$features, names(df))
      if (length(missing_feats) > 0) {
        showNotification(paste("Could not fetch:", paste(missing_feats, collapse = ", ")), type = "error")
        return()
      }
    }
    
    withProgress(message = "Running audit...", value = 0, {
      
      incProgress(0.2, detail = "Preparing data...")
      
      if (input$audit_type == "randomization") {
        req(input$treat_col)
        
        if (!(input$treat_col %in% names(df))) {
          showNotification("Treatment column not found", type = "error")
          return()
        }
        
        tt <- df[[input$treat_col]]
        mask <- (tt %in% c(input$control_val, input$treat_val))
        
        if (sum(mask) == 0) {
          showNotification("No units match control/treatment values",
                           type = "error")
          return()
        }
        
        A <- ifelse(tt[mask] == input$treat_val, 1L, 0L)
        X <- as.matrix(df[mask, input$features, drop = FALSE])
        
        keep <- apply(X, 1, function(r) all(is.finite(r)))
        A <- A[keep]
        X <- X[keep, , drop = FALSE]
        
        if (length(A) < 10) {
          showNotification("Too few complete cases (need >= 10)",
                           type = "error")
          return()
        }
        
      } else {
        req(input$missing_col)
        
        if (!(input$missing_col %in% names(df))) {
          showNotification("Missing column not found", type = "error")
          return()
        }
        
        R <- as.integer(!is.na(df[[input$missing_col]]))
        
        if (all(R == 1)) {
          showNotification(
            "No missingness detected in selected variable. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }
        
        if (all(R == 0)) {
          showNotification(
            "All values are missing. Audit cannot proceed.",
            type = "warning", duration = 10
          )
          return()
        }
        
        A <- R
        X <- as.matrix(df[, input$features, drop = FALSE])
        
        keep <- apply(X, 1, function(r) all(is.finite(r)))
        A <- A[keep]
        X <- X[keep, , drop = FALSE]
      }
      
      incProgress(0.4, detail = "Running conditional randomization test...")
      
      results <- tryCatch({
        remote_audit_crt(
          A = A,
          X = X,
          K = input$K,
          B = input$B,
          seed = input$seed,
          label = ifelse(input$audit_type == "randomization",yes = input$treat_col, no = input$missing_col),
          xgboost_ntree = input$ntree
        )
      }, error = function(e) {
        showNotification(paste("Audit failed:", e$message),
                         type = "error", duration = 10)
        NULL
      })
      
      incProgress(1.0, detail = "Complete!")
      
      if (!is.null(results)) {
        audit_results(results)
        showNotification("Audit complete!", type = "message", duration = 3)
      }
    })
  })
  
  output$results_summary <- renderUI({
    res <- audit_results()
    req(res)
    
    HTML(sprintf(
      "<h4>%s Audit Results</h4>
       <p><strong>Learner:</strong> %s</p>
       <p><strong>Sample size:</strong> %d (Treated: %d, Control: %d)</p>
       <p><strong>Test statistic (T):</strong> %.4f</p>
       <p><strong>P-value:</strong> %.4f</p>
       <p><strong>Interpretation:</strong> %s</p>",
      tools::toTitleCase(input$audit_type),
      toupper(res$learner),
      res$n,
      res$treated,
      res$n - res$treated,
      res$T_obs,
      res$p_value,
      if (res$p_value < 0.05) {
        "⚠️ Assignment is MORE predictable from satellite features than expected under random assignment (p < 0.05). This suggests potential deviation from the stated randomization mechanism."
      } else {
        "✓ Assignment is NOT significantly more predictable from satellite features than expected under random assignment (p >= 0.05). No evidence of deviation detected."
      }
    ))
  })
  
  if(FALSE){
  output$audit_plot <- renderPlot({
    res <- audit_results()
    req(res)
    
    hist(res$T_null, breaks = 50,
         main = sprintf("%s Audit: %s Learner",
                        tools::toTitleCase(input$audit_type),
                        toupper(res$learner)),
         xlab = "Out-of-sample log-likelihood improvement (T)",
         ylab = "Count",
         col = "lightblue",
         border = "white")
    abline(v = res$T_obs, col = "red", lwd = 3, lty = 2)
    legend("topright",
           legend = c("Null distribution", "Observed"),
           col = c("lightblue", "red"),
           lwd = c(10, 3),
           lty = c(1, 2))
    mtext(sprintf("n=%d, treated=%d (%.1f%%), B=%d, p=%.4f",
                  res$n, res$treated, 100 * res$a_bar, res$B, res$p_value),
          side = 3, line = 0.5, cex = 0.9)
  })
  }
  
  output$download_results <- downloadHandler(
    filename = function() {
      sprintf("remote_audit_results_%s.csv", format(Sys.time(), "%Y%m%d_%H%M%S"))
    },
    content = function(file) {
      res <- audit_results()
      req(res)
      
      summary_df <- data.frame(
        audit_type = input$audit_type,
        learner = res$learner,
        n = res$n,
        treated = res$treated,
        treatment_rate = res$a_bar,
        K = res$K,
        B = res$B,
        T_observed = res$T_obs,
        p_value = res$p_value,
        seed = input$seed,
        features = paste(input$features, collapse = ";")
      )
      
      write.csv(summary_df, file, row.names = FALSE)
    }
  )
  
  output$audit_complete <- reactive({
    !is.null(audit_results())
  })
  outputOptions(output, "audit_complete", suspendWhenHidden = FALSE)
}

shinyApp(ui, server)