# Spaces: DocForg / Document_Forgery_Detection (status: Sleeping)
# File size: 6,838 Bytes
# Revision: ff0e79e

# Hybrid Document Forgery Detection - Configuration

# System Settings
system:
  # Compute device: cuda or cpu
  device: 'cuda'
  # Lowered to avoid multiprocessing errors
  num_workers: 0
  pin_memory: true
  seed: 42

# Data Settings
data:
  image_size: 384
  # Lowered to fit in 16GB RAM
  batch_size: 8
  # Classes: copy_move, splicing, text_substitution
  num_classes: 3

  # Per-dataset location, storage type and pixel-mask availability.
  # min_region_area is written as a fraction; the percent form is noted inline.
  datasets:
    doctamper:
      path: 'datasets/DocTamper'
      type: lmdb
      has_pixel_mask: true
      min_region_area: 0.001  # = 0.1%

    rtm:
      path: 'datasets/RealTextManipulation'
      type: folder
      has_pixel_mask: true
      min_region_area: 0.0003  # = 0.03%

    casia:
      # Path contains spaces; quoted to keep it unambiguous
      path: 'datasets/CASIA 1.0 dataset'
      type: folder
      has_pixel_mask: false
      min_region_area: 0.001  # = 0.1%
      skip_deskew: true
      skip_denoising: true

    receipts:
      path: 'datasets/findit2'
      type: folder
      has_pixel_mask: true
      min_region_area: 0.0005  # = 0.05%

    fcd:
      path: 'datasets/DocTamper/DocTamperV1-FCD'
      type: lmdb
      has_pixel_mask: true
      # = 0.035% (larger forgeries, keep 99%)
      min_region_area: 0.00035

    scd:
      path: 'datasets/DocTamper/DocTamperV1-SCD'
      type: lmdb
      has_pixel_mask: true
      # = 0.009% (small forgeries, keep 91.5%)
      min_region_area: 0.00009

  # DocTamper is trained in chunks because of the RAM constraint
  chunked_training:
    enabled: true
    dataset: doctamper
    # Four consecutive quarter-sized slices, given as start/end fractions
    chunks:
      - start: 0.0
        end: 0.25
        name: chunk_1
      - start: 0.25
        end: 0.5
        name: chunk_2
      - start: 0.5
        end: 0.75
        name: chunk_3
      - start: 0.75
        end: 1.0
        name: chunk_4

    # Mixed dataset training (TrainingSet + FCD + SCD)
    mixing_ratios:
      # 70% TrainingSet (maintains baseline)
      doctamper: 0.70
      # 20% SCD (handles small forgeries, 0.88% avg)
      scd: 0.20
      # 10% FCD (adds diversity, 3.55% avg)
      fcd: 0.10

# Preprocessing
preprocessing:
  deskew: true
  normalize: true
  # Threshold on the Laplacian variance
  noise_threshold: 15.0
  median_filter_size: 3
  gaussian_sigma: 0.8

  # Dataset-aware preprocessing settings, keyed by dataset name
  dataset_specific:
    casia:
      deskew: false
      denoising: false

# Augmentation (Training only)
augmentation:
  enabled: true

  # Augmentations listed for all datasets; each entry has a type and a probability
  common:
    - type: noise
      prob: 0.3
    - type: motion_blur
      prob: 0.2
    - type: jpeg_compression
      prob: 0.3
      quality: [60, 95]
    - type: lighting
      prob: 0.3
    - type: perspective
      prob: 0.2

  # Extra augmentations listed for the receipts dataset
  receipts:
    - type: stain
      prob: 0.2
    - type: fold
      prob: 0.15

# Model Architecture
model:
  # Encoder backbone
  encoder:
    name: 'mobilenetv3_small_100'
    pretrained: true
    features_only: true

  # Decoder head
  decoder:
    name: 'unet_lite'
    # Described as the MobileNetV3-Small feature channels.
    # NOTE(review): confirm these match the channel counts the encoder
    # actually reports when features_only is enabled.
    channels: [16, 24, 40, 48, 96]
    upsampling: bilinear
    use_depthwise_separable: true

  # One output channel: binary forgery mask
  output_channels: 1

# Loss Function
loss:
  # Dataset-aware loss: Dice is only for datasets with pixel masks
  use_dice: true
  bce_weight: 1.0
  dice_weight: 1.0

# Training
training:
  # Epoch count is per chunk (increased for single-pass training)
  epochs: 30
  # Higher initial LR for faster convergence
  learning_rate: 0.001
  # Slightly increased for better regularization
  weight_decay: 0.0001

  # Optimizer
  optimizer: adamw

  # Learning-rate scheduler
  scheduler:
    type: cosine_annealing_warm_restarts
    # Length of the first cycle: the first restart comes after 10 epochs
    T_0: 10
    # Restart period doubles after each restart
    T_mult: 2
    # Warm up during the first 3 epochs
    warmup_epochs: 3
    # Floor at 1/100th of the initial LR
    min_lr: 0.00001

  # Early stopping
  early_stopping:
    enabled: true
    # Increased to allow more exploration
    patience: 10
    # Accept smaller improvements (0.05%)
    min_delta: 0.0005
    # Restore the best model when stopping
    restore_best_weights: true
    monitor: val_dice
    mode: max

  # Checkpointing
  checkpoint:
    save_best: true
    # Periodic save every 5 epochs
    save_every: 5
    # Also keep the last checkpoint
    save_last: true
    monitor: val_dice

# Mask Refinement
mask_refinement:
  # Threshold applied to the predicted mask
  threshold: 0.5
  morphology:
    closing_kernel: 5
    opening_kernel: 3

  # Adaptive minimum-region-area thresholds, keyed by dataset name.
  # Values mirror data.datasets.<name>.min_region_area.
  # Presumably datasets not listed here use `default` — verify against the consumer.
  min_region_area:
    rtm: 0.0003
    receipts: 0.0005
    # fcd and scd are listed explicitly so they do not pick up the larger
    # default, which disagrees with the values set for them under data.datasets.
    fcd: 0.00035
    scd: 0.00009
    default: 0.001

# Feature Extraction
features:
  # Deep features
  deep:
    enabled: true
    # gap = Global Average Pooling
    pooling: gap

  # Statistical and shape features
  statistical:
    enabled: true
    features:
      - area
      - perimeter
      - aspect_ratio
      - solidity
      - eccentricity
      - entropy

  # Frequency-domain features
  frequency:
    enabled: true
    features:
      - dct_coefficients
      - high_frequency_energy
      - wavelet_energy

  # Noise and ELA features
  noise:
    enabled: true
    features:
      - ela_mean
      - ela_variance
      - noise_residual

  # OCR-consistency features (text documents only)
  ocr:
    enabled: true
    # Gated: only for text documents
    gated: true
    features:
      - confidence_deviation
      - spacing_irregularity
      - stroke_width_variation

  # Feature normalization
  normalization:
    method: standard_scaler
    handle_missing: true

# LightGBM Classifier
classifier:
  model: lightgbm
  # Parameters for the LightGBM model
  params:
    objective: multiclass
    # Same value as data.num_classes
    num_class: 3
    boosting_type: gbdt
    num_leaves: 31
    learning_rate: 0.05
    n_estimators: 200
    max_depth: 7
    min_child_samples: 20
    subsample: 0.8
    # LightGBM applies row subsampling (bagging) only when the bagging
    # frequency is non-zero; without this key `subsample` has no effect.
    subsample_freq: 1
    colsample_bytree: 0.8
    reg_alpha: 0.1
    reg_lambda: 0.1
    # Same value as system.seed
    random_state: 42

  # Confidence threshold
  confidence_threshold: 0.6

# Metrics
metrics:
  # Localization metrics (only for datasets with pixel masks)
  localization:
    - iou
    - dice
    - precision
    - recall

  # Classification metrics
  classification:
    - accuracy
    - f1_score
    - precision
    - recall
    - confusion_matrix

  # Dataset-aware metric computation, keyed by dataset name.
  # Each flag matches that dataset's has_pixel_mask under data.datasets.
  compute_localization:
    doctamper: true
    rtm: true
    casia: false
    receipts: true
    # fcd and scd both declare has_pixel_mask: true and were missing here
    fcd: true
    scd: true

# Outputs
outputs:
  base_dir: 'outputs'

  # Subdirectories, written out in full including the base_dir prefix
  checkpoints: 'outputs/checkpoints'
  logs: 'outputs/logs'
  plots: 'outputs/plots'
  results: 'outputs/results'

  # Visualization
  visualization:
    save_mask: true
    save_overlay: true
    save_json: true
    overlay_alpha: 0.5
    colormap: jet

# Deployment
deployment:
  # ONNX export switch and destination file
  export_onnx: true
  onnx_path: 'outputs/model.onnx'
  quantization: false
  opset_version: 14

# Logging
logging:
  level: 'INFO'
  # Logging sinks
  tensorboard: true
  csv: true
  console: true