Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- scripts/evaluate.py +158 -0
- scripts/evaluate_full_testingset.py +277 -0
- scripts/export_model.py +92 -0
- scripts/inference_pipeline.py +556 -0
- scripts/run_inference.py +138 -0
- scripts/setup.py +135 -0
- scripts/train_chunked.py +222 -0
- scripts/train_classifier_doctamper_fixed.py +368 -0
scripts/evaluate.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model Evaluation Script
|
| 3 |
+
|
| 4 |
+
Evaluate trained model on validation/test sets with comprehensive metrics.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/evaluate.py --model outputs/checkpoints/best_doctamper.pth --dataset doctamper
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import json
|
| 14 |
+
import numpy as np
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
# Add src to path
|
| 19 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 20 |
+
|
| 21 |
+
from src.config import get_config
|
| 22 |
+
from src.models import get_model
|
| 23 |
+
from src.data import get_dataset
|
| 24 |
+
from src.training.metrics import SegmentationMetrics
|
| 25 |
+
from src.utils import plot_training_curves
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def parse_args(argv=None):
    """Parse command-line arguments for the evaluation script.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` — existing
            callers are therefore unaffected; passing a list makes the
            parser testable without touching the real command line.

    Returns:
        argparse.Namespace with ``model``, ``dataset``, ``split``,
        ``output`` and ``config`` attributes.
    """
    parser = argparse.ArgumentParser(description="Evaluate forgery detection model")

    parser.add_argument('--model', type=str, required=True,
                        help='Path to model checkpoint')

    parser.add_argument('--dataset', type=str, required=True,
                        choices=['doctamper', 'rtm', 'casia', 'receipts'],
                        help='Dataset to evaluate on')

    parser.add_argument('--split', type=str, default='val',
                        help='Data split (val/test)')

    parser.add_argument('--output', type=str, default='outputs/evaluation',
                        help='Output directory')

    parser.add_argument('--config', type=str, default='config.yaml',
                        help='Path to config file')

    return parser.parse_args(argv)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def main():
    """Evaluate a trained forgery-localization model on one dataset split.

    Loads the checkpoint given on the command line, runs the model over every
    sample of the requested split, accumulates segmentation metrics, prints a
    report and writes the results as JSON under the output directory.
    """
    args = parse_args()

    # Load config
    config = get_config(args.config)

    # Device: prefer GPU when available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("\n" + "="*60)
    print("Model Evaluation")
    print("="*60)
    print(f"Model: {args.model}")
    print(f"Dataset: {args.dataset}")
    print(f"Split: {args.split}")
    print(f"Device: {device}")
    print("="*60)

    # Load model weights; checkpoint may be either a full training
    # checkpoint (dict with 'model_state_dict') or a bare state dict.
    model = get_model(config).to(device)
    checkpoint = torch.load(args.model, map_location=device)

    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)

    model.eval()
    print("Model loaded")

    # Load dataset
    dataset = get_dataset(config, args.dataset, split=args.split)
    print(f"Dataset loaded: {len(dataset)} samples")

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Evaluate
    metrics = SegmentationMetrics()
    # Whether this dataset carries per-pixel ground-truth masks
    has_pixel_mask = config.has_pixel_mask(args.dataset)

    print(f"\nEvaluating...")

    all_ious = []
    all_dices = []

    with torch.no_grad():
        for i in tqdm(range(len(dataset)), desc="Evaluating"):
            try:
                # dataset[i] yields (image tensor, mask tensor, metadata dict)
                # — assumed from the unpacking here; confirm against the
                # dataset implementation.
                image, mask, metadata = dataset[i]

                # Move to device (add batch dimension of 1)
                image = image.unsqueeze(0).to(device)
                mask = mask.unsqueeze(0).to(device)

                # Forward pass: model returns (logits, auxiliary output)
                logits, _ = model(image)
                probs = torch.sigmoid(logits)

                # Update metrics — per-sample IoU/Dice are only meaningful
                # when the dataset provides pixel-level ground truth.
                if has_pixel_mask:
                    metrics.update(probs, mask, has_pixel_mask=True)

                    # Per-sample metrics (threshold at 0.5, smoothed with
                    # 1e-8 so empty masks never divide by zero)
                    pred_binary = (probs > 0.5).float()
                    intersection = (pred_binary * mask).sum().item()
                    union = pred_binary.sum().item() + mask.sum().item() - intersection

                    iou = intersection / (union + 1e-8)
                    dice = (2 * intersection) / (pred_binary.sum().item() + mask.sum().item() + 1e-8)

                    all_ious.append(iou)
                    all_dices.append(dice)

            except Exception as e:
                # Best-effort evaluation: report the failing sample and move on
                print(f"Error on sample {i}: {e}")
                continue

    # Compute final metrics
    results = metrics.compute()

    # Add per-sample statistics (mean/std over all successfully
    # evaluated samples)
    if has_pixel_mask and all_ious:
        results['iou_mean'] = np.mean(all_ious)
        results['iou_std'] = np.std(all_ious)
        results['dice_mean'] = np.mean(all_dices)
        results['dice_std'] = np.std(all_dices)

    # Print results
    print("\n" + "="*60)
    print("Evaluation Results")
    print("="*60)

    for key, value in results.items():
        if isinstance(value, float):
            print(f" {key}: {value:.4f}")

    # Save results
    results_path = output_dir / f'{args.dataset}_{args.split}_results.json'
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to: {results_path}")
    print("="*60)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
# Script entry point — only run the evaluation when executed directly.
if __name__ == '__main__':
    main()
|
scripts/evaluate_full_testingset.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Comprehensive Evaluation on Entire TestingSet
|
| 3 |
+
|
| 4 |
+
Evaluates the complete pipeline on all 30,000 samples from TestingSet
|
| 5 |
+
Calculates all metrics: Accuracy, Precision, Recall, F1, IoU, Dice, etc.
|
| 6 |
+
No visualizations - metrics only for speed
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python scripts/evaluate_full_testingset.py
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import numpy as np
|
| 15 |
+
import torch
|
| 16 |
+
from tqdm import tqdm
|
| 17 |
+
import json
|
| 18 |
+
from datetime import datetime
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 21 |
+
|
| 22 |
+
from src.config import get_config
|
| 23 |
+
from src.models import get_model
|
| 24 |
+
from src.data import get_dataset
|
| 25 |
+
from src.features import get_mask_refiner, get_region_extractor
|
| 26 |
+
from src.training.classifier import ForgeryClassifier
|
| 27 |
+
from src.data.preprocessing import DocumentPreprocessor
|
| 28 |
+
from src.data.augmentation import DatasetAwareAugmentation
|
| 29 |
+
|
| 30 |
+
# Class mapping
|
| 31 |
+
# Class mapping: forgery-type class index -> human-readable label
CLASS_NAMES = {0: 'Copy-Move', 1: 'Splicing', 2: 'Generation'}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def calculate_metrics(pred_mask, gt_mask):
    """Compute pixel-level segmentation metrics for a predicted mask.

    Both inputs are cast to boolean arrays; every ratio is smoothed with a
    small epsilon so fully-empty masks never cause a division by zero.

    Args:
        pred_mask: Predicted binary mask (any numeric/bool numpy array).
        gt_mask: Ground-truth binary mask of the same shape.

    Returns:
        dict with float metrics (iou, dice, precision, recall, f1,
        accuracy) and int pixel counts (tp, fp, fn, tn).
    """
    predicted = pred_mask.astype(bool)
    truth = gt_mask.astype(bool)

    eps = 1e-8

    # Confusion-matrix pixel counts
    true_pos = (predicted & truth).sum()
    false_pos = (predicted & ~truth).sum()
    false_neg = (~predicted & truth).sum()
    true_neg = (~predicted & ~truth).sum()

    overlap = (predicted & truth).sum()
    combined = (predicted | truth).sum()

    prec = true_pos / (true_pos + false_pos + eps)
    rec = true_pos / (true_pos + false_neg + eps)

    return {
        'iou': float(overlap / (combined + eps)),
        'dice': float((2 * overlap) / (predicted.sum() + truth.sum() + eps)),
        'precision': float(prec),
        'recall': float(rec),
        'f1': float(2 * prec * rec / (prec + rec + eps)),
        'accuracy': float((true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg + eps)),
        'tp': int(true_pos),
        'fp': int(false_pos),
        'fn': int(false_neg),
        'tn': int(true_neg),
    }
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def main():
    """Run the full-pipeline evaluation over the entire DocTamper TestingSet.

    Loads the localization model and the forgery-type classifier, iterates
    every sample in the validation split (= TestingSet), accumulates
    pixel-level segmentation metrics and image-level detection statistics,
    prints a report and writes a JSON summary to
    ``outputs/evaluation/evaluation_summary.json``.

    Fix vs. original: the final "Key Metrics Summary" printed the detection
    rate without the zero guard applied everywhere else, raising
    ZeroDivisionError when no sample contains a forgery. It now prints "N/A"
    in that case.
    """
    print("="*80)
    print("COMPREHENSIVE EVALUATION ON ENTIRE TESTINGSET")
    print("="*80)
    print("Dataset: DocTamper TestingSet (30,000 samples)")
    print("Mode: Metrics only (no visualizations)")
    print("="*80)

    config = get_config('config.yaml')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load models
    print("\n1. Loading models...")

    # Localization model (checkpoint stores a training dict with
    # 'model_state_dict' and the best validation metric)
    model = get_model(config).to(device)
    checkpoint = torch.load('outputs/checkpoints/best_doctamper.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print(f" ✓ Localization model loaded (Dice: {checkpoint.get('best_metric', 0):.2%})")

    # Classifier (loaded for pipeline parity; not invoked in the loop below)
    classifier = ForgeryClassifier(config)
    classifier.load('outputs/classifier')
    print(f" ✓ Classifier loaded")

    # Components — only mask_refiner is used in the evaluation loop;
    # the others are instantiated to mirror the inference pipeline setup.
    preprocessor = DocumentPreprocessor(config, 'doctamper')
    augmentation = DatasetAwareAugmentation(config, 'doctamper', is_training=False)
    mask_refiner = get_mask_refiner(config)
    region_extractor = get_region_extractor(config)

    # Load dataset
    print("\n2. Loading TestingSet...")
    dataset = get_dataset(config, 'doctamper', split='val')  # val = TestingSet
    total_samples = len(dataset)
    print(f" ✓ Loaded {total_samples} samples")

    # Initialize metrics storage
    all_metrics = []
    detection_stats = {
        'total': 0,
        'has_forgery': 0,
        'detected': 0,
        'missed': 0,
        'false_positives': 0,
        'true_negatives': 0
    }

    print("\n3. Running evaluation...")
    print("="*80)

    # Process all samples
    for idx in tqdm(range(total_samples), desc="Evaluating"):
        try:
            # Get sample from dataset
            image_tensor, mask_tensor, metadata = dataset[idx]

            # Ground truth: first channel of the mask tensor,
            # binarized at 0.5
            gt_mask = mask_tensor.numpy()[0]
            gt_mask_binary = (gt_mask > 0.5).astype(np.uint8)
            has_forgery = gt_mask_binary.sum() > 0

            # Run localization
            with torch.no_grad():
                image_batch = image_tensor.unsqueeze(0).to(device)
                logits, _ = model(image_batch)
                prob_map = torch.sigmoid(logits).cpu().numpy()[0, 0]

            # Generate the binary prediction and post-process it
            binary_mask = (prob_map > 0.5).astype(np.uint8)
            refined_mask = mask_refiner.refine(binary_mask)

            # Pixel-level metrics for this sample
            metrics = calculate_metrics(refined_mask, gt_mask_binary)
            metrics['sample_idx'] = idx
            metrics['has_forgery'] = has_forgery
            metrics['prob_max'] = float(prob_map.max())

            # Image-level detection bookkeeping: "detected" means at least
            # one predicted forged pixel survives refinement.
            detected = refined_mask.sum() > 0

            detection_stats['total'] += 1
            if has_forgery:
                detection_stats['has_forgery'] += 1
                if detected:
                    detection_stats['detected'] += 1
                else:
                    detection_stats['missed'] += 1
            else:
                if detected:
                    detection_stats['false_positives'] += 1
                else:
                    detection_stats['true_negatives'] += 1

            all_metrics.append(metrics)

        except Exception as e:
            # Best-effort: report (truncated) and continue with next sample
            print(f"\nError at sample {idx}: {str(e)[:100]}")
            continue

    # Calculate overall statistics
    print("\n" + "="*80)
    print("RESULTS")
    print("="*80)

    # Detection statistics
    print("\n📊 DETECTION STATISTICS:")
    print("-"*80)
    print(f"Total samples: {detection_stats['total']}")
    print(f"Samples with forgery: {detection_stats['has_forgery']}")
    print(f"Samples without forgery: {detection_stats['total'] - detection_stats['has_forgery']}")
    print()
    print(f"✅ Correctly detected: {detection_stats['detected']}")
    print(f"❌ Missed detections: {detection_stats['missed']}")
    print(f"⚠️ False positives: {detection_stats['false_positives']}")
    print(f"✓ True negatives: {detection_stats['true_negatives']}")
    print()

    # Detection rates (guarded against empty denominators)
    if detection_stats['has_forgery'] > 0:
        detection_rate = detection_stats['detected'] / detection_stats['has_forgery']
        miss_rate = detection_stats['missed'] / detection_stats['has_forgery']
        print(f"Detection Rate (Recall): {detection_rate:.2%} ⬆️ Higher is better")
        print(f"Miss Rate: {miss_rate:.2%} ⬇️ Lower is better")

    if detection_stats['detected'] + detection_stats['false_positives'] > 0:
        precision = detection_stats['detected'] / (detection_stats['detected'] + detection_stats['false_positives'])
        print(f"Precision: {precision:.2%} ⬆️ Higher is better")

    overall_accuracy = (detection_stats['detected'] + detection_stats['true_negatives']) / detection_stats['total']
    print(f"Overall Accuracy: {overall_accuracy:.2%} ⬆️ Higher is better")

    # Segmentation metrics (only for samples with forgery)
    forgery_metrics = [m for m in all_metrics if m['has_forgery']]

    if forgery_metrics:
        print("\n📈 SEGMENTATION METRICS (on samples with forgery):")
        print("-"*80)

        avg_iou = np.mean([m['iou'] for m in forgery_metrics])
        avg_dice = np.mean([m['dice'] for m in forgery_metrics])
        avg_precision = np.mean([m['precision'] for m in forgery_metrics])
        avg_recall = np.mean([m['recall'] for m in forgery_metrics])
        avg_f1 = np.mean([m['f1'] for m in forgery_metrics])
        avg_accuracy = np.mean([m['accuracy'] for m in forgery_metrics])

        print(f"IoU (Intersection over Union): {avg_iou:.4f} ⬆️ Higher is better (0-1)")
        print(f"Dice Coefficient: {avg_dice:.4f} ⬆️ Higher is better (0-1)")
        print(f"Pixel Precision: {avg_precision:.4f} ⬆️ Higher is better (0-1)")
        print(f"Pixel Recall: {avg_recall:.4f} ⬆️ Higher is better (0-1)")
        print(f"Pixel F1-Score: {avg_f1:.4f} ⬆️ Higher is better (0-1)")
        print(f"Pixel Accuracy: {avg_accuracy:.4f} ⬆️ Higher is better (0-1)")

        # Probability statistics
        avg_prob = np.mean([m['prob_max'] for m in forgery_metrics])
        print(f"\nAverage Max Probability: {avg_prob:.4f}")

    # Save results
    print("\n" + "="*80)
    print("SAVING RESULTS")
    print("="*80)

    output_dir = Path('outputs/evaluation')
    output_dir.mkdir(parents=True, exist_ok=True)

    # Summary — numpy scalars converted to plain floats for JSON
    summary = {
        'timestamp': datetime.now().isoformat(),
        'total_samples': detection_stats['total'],
        'detection_statistics': detection_stats,
        'detection_rate': detection_stats['detected'] / detection_stats['has_forgery'] if detection_stats['has_forgery'] > 0 else 0,
        'precision': detection_stats['detected'] / (detection_stats['detected'] + detection_stats['false_positives']) if (detection_stats['detected'] + detection_stats['false_positives']) > 0 else 0,
        'overall_accuracy': overall_accuracy,
        'segmentation_metrics': {
            'iou': float(avg_iou) if forgery_metrics else 0,
            'dice': float(avg_dice) if forgery_metrics else 0,
            'precision': float(avg_precision) if forgery_metrics else 0,
            'recall': float(avg_recall) if forgery_metrics else 0,
            'f1': float(avg_f1) if forgery_metrics else 0,
            'accuracy': float(avg_accuracy) if forgery_metrics else 0
        }
    }

    summary_path = output_dir / 'evaluation_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"✓ Summary saved to: {summary_path}")

    # Detailed metrics (optional - can be large)
    # detailed_path = output_dir / 'detailed_metrics.json'
    # with open(detailed_path, 'w') as f:
    #     json.dump(all_metrics, f, indent=2)
    # print(f"✓ Detailed metrics saved to: {detailed_path}")

    print("\n" + "="*80)
    print("✅ EVALUATION COMPLETE!")
    print("="*80)
    print(f"\nKey Metrics Summary:")
    # FIX: original divided unconditionally here and crashed when no sample
    # had a forgery; mirror the guard used above.
    if detection_stats['has_forgery'] > 0:
        print(f" Detection Rate: {detection_stats['detected'] / detection_stats['has_forgery']:.2%}")
    else:
        print(" Detection Rate: N/A")
    print(f" Overall Accuracy: {overall_accuracy:.2%}")
    print(f" Dice Score: {avg_dice:.4f}" if forgery_metrics else " Dice Score: N/A")
    print(f" IoU: {avg_iou:.4f}" if forgery_metrics else " IoU: N/A")
    print("="*80 + "\n")
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
# Script entry point — only run the evaluation when executed directly.
if __name__ == '__main__':
    main()
|
scripts/export_model.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model Export Script
|
| 3 |
+
|
| 4 |
+
Export trained model to ONNX format for deployment.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/export_model.py --model outputs/checkpoints/best_doctamper.pth --format onnx
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Add src to path
|
| 15 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
|
| 19 |
+
from src.config import get_config
|
| 20 |
+
from src.models import get_model
|
| 21 |
+
from src.utils import export_to_onnx, export_to_torchscript
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def parse_args(argv=None):
    """Parse command-line arguments for the model export script.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` — existing
            callers are therefore unaffected; passing a list makes the
            parser testable without touching the real command line.

    Returns:
        argparse.Namespace with ``model``, ``format``, ``output`` and
        ``config`` attributes.
    """
    parser = argparse.ArgumentParser(description="Export model for deployment")

    parser.add_argument('--model', type=str, required=True,
                        help='Path to model checkpoint')

    parser.add_argument('--format', type=str, default='onnx',
                        choices=['onnx', 'torchscript', 'both'],
                        help='Export format')

    parser.add_argument('--output', type=str, default='outputs/exported',
                        help='Output directory')

    parser.add_argument('--config', type=str, default='config.yaml',
                        help='Path to config file')

    return parser.parse_args(argv)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def main():
    """Export a trained checkpoint to ONNX and/or TorchScript.

    Loads the model from the checkpoint given on the command line and
    writes the exported artifact(s) into the output directory.
    """
    args = parse_args()

    # Load config
    config = get_config(args.config)

    print("\n" + "="*60)
    print("Model Export")
    print("="*60)
    print(f"Model: {args.model}")
    print(f"Format: {args.format}")
    print("="*60)

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Load model weights; checkpoint may be either a full training
    # checkpoint (dict with 'model_state_dict') or a bare state dict.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = get_model(config).to(device)

    checkpoint = torch.load(args.model, map_location=device)
    if 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)

    model.eval()
    print("Model loaded")

    # Get image size (square input assumed; falls back to 384)
    image_size = config.get('data.image_size', 384)

    # Export in the requested format(s)
    if args.format in ['onnx', 'both']:
        onnx_path = output_dir / 'model.onnx'
        export_to_onnx(model, str(onnx_path), input_size=(image_size, image_size))

    if args.format in ['torchscript', 'both']:
        ts_path = output_dir / 'model.pt'
        export_to_torchscript(model, str(ts_path), input_size=(image_size, image_size))

    print("\n" + "="*60)
    print("Export Complete!")
    print(f"Output: {output_dir}")
    print("="*60)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Script entry point — only run the export when executed directly.
if __name__ == '__main__':
    main()
|
scripts/inference_pipeline.py
ADDED
|
@@ -0,0 +1,556 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Complete Document Forgery Detection Pipeline
|
| 3 |
+
Implements Full Algorithm Steps 1-11
|
| 4 |
+
|
| 5 |
+
Features:
|
| 6 |
+
- ✅ Localization (WHERE is forgery?)
|
| 7 |
+
- ✅ Classification (WHAT type of forgery?)
|
| 8 |
+
- ✅ Confidence filtering
|
| 9 |
+
- ✅ Visualizations (heatmaps, overlays, bounding boxes)
|
| 10 |
+
- ✅ JSON output with detailed results
|
| 11 |
+
- ✅ Actual vs Predicted comparison (if ground truth available)
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python scripts/inference_pipeline.py --image path/to/document.jpg
|
| 15 |
+
python scripts/inference_pipeline.py --image path/to/document.jpg --ground_truth path/to/mask.png
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
import argparse
|
| 21 |
+
import numpy as np
|
| 22 |
+
import cv2
|
| 23 |
+
import torch
|
| 24 |
+
import json
|
| 25 |
+
from datetime import datetime
|
| 26 |
+
import matplotlib.pyplot as plt
|
| 27 |
+
import matplotlib.patches as patches
|
| 28 |
+
|
| 29 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 30 |
+
|
| 31 |
+
from src.config import get_config
|
| 32 |
+
from src.models import get_model
|
| 33 |
+
from src.features import get_feature_extractor, get_mask_refiner, get_region_extractor
|
| 34 |
+
from src.training.classifier import ForgeryClassifier
|
| 35 |
+
from src.data.preprocessing import DocumentPreprocessor
|
| 36 |
+
|
| 37 |
+
# Class mapping
|
| 38 |
+
# Class mapping: forgery-type class index -> human-readable label
CLASS_NAMES = {
    0: 'Copy-Move',
    1: 'Splicing',
    2: 'Generation'
}

# Per-class color tuples for visualization overlays.
# NOTE(review): presumably RGB order (images are converted to RGB upstream
# with cv2.cvtColor) — confirm against the drawing code before relying on it.
CLASS_COLORS = {
    0: (255, 0, 0),    # Red for Copy-Move
    1: (0, 255, 0),    # Green for Splicing
    2: (0, 0, 255)     # Blue for Generation
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class ForgeryDetectionPipeline:
|
| 52 |
+
"""
|
| 53 |
+
Complete forgery detection pipeline
|
| 54 |
+
Implements Algorithm Steps 1-11
|
| 55 |
+
"""
|
| 56 |
+
|
| 57 |
+
def __init__(self, config_path='config.yaml'):
|
| 58 |
+
"""Initialize pipeline with models"""
|
| 59 |
+
print("="*70)
|
| 60 |
+
print("Initializing Forgery Detection Pipeline")
|
| 61 |
+
print("="*70)
|
| 62 |
+
|
| 63 |
+
self.config = get_config(config_path)
|
| 64 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 65 |
+
|
| 66 |
+
# Load localization model (Steps 1-6)
|
| 67 |
+
print("\n1. Loading localization model...")
|
| 68 |
+
self.localization_model = get_model(self.config).to(self.device)
|
| 69 |
+
checkpoint = torch.load('outputs/checkpoints/best_doctamper.pth',
|
| 70 |
+
map_location=self.device)
|
| 71 |
+
self.localization_model.load_state_dict(checkpoint['model_state_dict'])
|
| 72 |
+
self.localization_model.eval()
|
| 73 |
+
print(f" ✓ Loaded (Val Dice: {checkpoint.get('best_metric', 0):.2%})")
|
| 74 |
+
|
| 75 |
+
# Load classifier (Step 8)
|
| 76 |
+
print("\n2. Loading forgery type classifier...")
|
| 77 |
+
self.classifier = ForgeryClassifier(self.config)
|
| 78 |
+
self.classifier.load('outputs/classifier')
|
| 79 |
+
print(" ✓ Loaded")
|
| 80 |
+
|
| 81 |
+
# Initialize components
|
| 82 |
+
print("\n3. Initializing components...")
|
| 83 |
+
self.preprocessor = DocumentPreprocessor(self.config, 'doctamper')
|
| 84 |
+
|
| 85 |
+
# Initialize augmentation for inference
|
| 86 |
+
from src.data.augmentation import DatasetAwareAugmentation
|
| 87 |
+
self.augmentation = DatasetAwareAugmentation(self.config, 'doctamper', is_training=False)
|
| 88 |
+
|
| 89 |
+
self.feature_extractor = get_feature_extractor(self.config, is_text_document=True)
|
| 90 |
+
self.mask_refiner = get_mask_refiner(self.config)
|
| 91 |
+
self.region_extractor = get_region_extractor(self.config)
|
| 92 |
+
print(" ✓ Ready")
|
| 93 |
+
|
| 94 |
+
print("\n" + "="*70)
|
| 95 |
+
print("Pipeline Initialized Successfully!")
|
| 96 |
+
print("="*70 + "\n")
|
| 97 |
+
|
| 98 |
+
def detect(self, image_path, ground_truth_path=None, output_dir='outputs/inference'):
    """
    Run complete detection pipeline

    Args:
        image_path: Path to input document image
        ground_truth_path: Optional path to ground truth mask
        output_dir: Directory to save outputs

    Returns:
        results: Dictionary with detection results
    """
    print(f"\n{'='*70}")
    print(f"Processing: {image_path}")
    print(f"{'='*70}\n")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Get base filename
    base_name = Path(image_path).stem

    # Step 1-2: Load and preprocess image (EXACTLY like dataset)
    print("Step 1-2: Loading and preprocessing...")
    image = cv2.imread(str(image_path))
    if image is None:
        raise ValueError(f"Could not load image: {image_path}")

    # OpenCV loads BGR; the rest of the pipeline works in RGB.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Create dummy mask for preprocessing: the preprocessor expects an
    # (image, mask) pair even at inference time.
    dummy_mask = np.zeros(image_rgb.shape[:2], dtype=np.uint8)

    # Step 1: Preprocess (like dataset line: image, mask = self.preprocessor(image, mask))
    preprocessed_img, preprocessed_mask = self.preprocessor(image_rgb, dummy_mask)

    # Step 2: Augment (like dataset line: augmented = self.augmentation(image, mask))
    augmented = self.augmentation(preprocessed_img, preprocessed_mask)

    # Step 3: Extract tensor (like dataset line: image = augmented['image'])
    image_tensor = augmented['image']

    print(f" ✓ Image shape: {image_rgb.shape}")
    print(f" ✓ Preprocessed tensor shape: {image_tensor.shape}")
    print(f" ✓ Tensor range: [{image_tensor.min():.4f}, {image_tensor.max():.4f}]")

    # Load ground truth if provided
    ground_truth = None
    if ground_truth_path:
        ground_truth = cv2.imread(str(ground_truth_path), cv2.IMREAD_GRAYSCALE)
        if ground_truth is not None:
            # Resize to match preprocessed size
            target_size = (image_tensor.shape[2], image_tensor.shape[1])  # (W, H)
            ground_truth = cv2.resize(ground_truth, target_size)
            print(f" ✓ Ground truth loaded")

    # Step 3-4: Localization (WHERE is forgery?)
    print("\nStep 3-4: Forgery localization...")
    image_batch = image_tensor.unsqueeze(0).to(self.device)

    with torch.no_grad():
        logits, decoder_features = self.localization_model(image_batch)
        # Single-image batch, single-channel logits -> 2-D probability map.
        prob_map = torch.sigmoid(logits).cpu().numpy()[0, 0]

    print(f" ✓ Probability map generated")
    print(f" ✓ Prob map range: [{prob_map.min():.4f}, {prob_map.max():.4f}]")

    # Step 5: Binary mask generation
    print("\nStep 5: Generating binary mask...")
    binary_mask = (prob_map > 0.5).astype(np.uint8)
    refined_mask = self.mask_refiner.refine(binary_mask)
    print(f" ✓ Mask refined")

    # Step 6: Region extraction
    print("\nStep 6: Extracting forgery regions...")
    # Convert tensor to numpy for region extraction and feature extraction
    preprocessed_numpy = image_tensor.permute(1, 2, 0).cpu().numpy()
    regions = self.region_extractor.extract(refined_mask, prob_map, preprocessed_numpy)
    print(f" ✓ Found {len(regions)} regions")

    if len(regions) == 0:
        print("\n⚠ No forgery regions detected!")
        # Still create visualizations if ground truth exists
        if ground_truth is not None:
            print("\nCreating comparison with ground truth...")
            self._create_comparison_visualization(
                image_rgb, prob_map, refined_mask, ground_truth,
                base_name, output_path
            )
        return self._create_clean_result(image_rgb, base_name, output_path, ground_truth)

    # Step 7-8: Feature extraction and classification
    print("\nStep 7-8: Classifying forgery types...")
    region_results = []

    for i, region in enumerate(regions):
        # Extract features (Step 7)
        features = self.feature_extractor.extract(
            preprocessed_numpy,
            region['region_mask'],
            [f.cpu() for f in decoder_features]
        )

        # Ensure correct dimension (526) — the classifier was trained on
        # fixed-length feature vectors, so pad or truncate to match.
        expected_dim = 526
        if len(features) < expected_dim:
            features = np.pad(features, (0, expected_dim - len(features)))
        elif len(features) > expected_dim:
            features = features[:expected_dim]

        features = features.reshape(1, -1)

        # Classify (Step 8)
        predictions, confidences = self.classifier.predict(features)
        forgery_type = int(predictions[0])
        confidence = float(confidences[0])

        region_results.append({
            'region_id': i + 1,
            'bounding_box': region['bounding_box'],
            'area': int(region['area']),
            'forgery_type': CLASS_NAMES[forgery_type],
            'forgery_type_id': forgery_type,
            'confidence': confidence,
            'mask_probability_mean': float(prob_map[region['region_mask'] > 0].mean())
        })

        print(f" Region {i+1}: {CLASS_NAMES[forgery_type]} "
              f"(confidence: {confidence:.2%})")

    # Step 9: False positive removal
    print("\nStep 9: Filtering low-confidence regions...")
    confidence_threshold = self.config.get('classification.confidence_threshold', 0.6)
    filtered_results = [r for r in region_results if r['confidence'] >= confidence_threshold]
    print(f" ✓ Kept {len(filtered_results)}/{len(region_results)} regions "
          f"(threshold: {confidence_threshold:.0%})")

    # Step 10-11: Generate outputs
    print("\nStep 10-11: Generating outputs...")

    # Calculate scale factors for coordinate conversion
    # Bounding boxes are in preprocessed coordinates (384x384)
    # Need to scale to original image coordinates
    orig_h, orig_w = image_rgb.shape[:2]
    prep_h, prep_w = prob_map.shape
    scale_x = orig_w / prep_w
    scale_y = orig_h / prep_h

    # Create visualizations
    self._create_visualizations(
        image_rgb, prob_map, refined_mask, filtered_results,
        ground_truth, base_name, output_path, scale_x, scale_y
    )

    # Create JSON output
    results = self._create_json_output(
        image_path, filtered_results, ground_truth, base_name, output_path
    )

    print(f"\n{'='*70}")
    print("✅ Detection Complete!")
    print(f"{'='*70}")
    print(f"Output directory: {output_path}")
    print(f"Detected {len(filtered_results)} forgery regions")
    print(f"{'='*70}\n")

    return results
+
def _create_visualizations(self, image, prob_map, mask, results,
                           ground_truth, base_name, output_path, scale_x, scale_y):
    """Create and save all visualization figures for one document.

    Saves up to four figures under ``output_path``:
      * ``{base_name}_heatmap.png``    — original / probability map / binary mask
      * ``{base_name}_overlay.png``    — detections drawn over the document
      * ``{base_name}_comparison.png`` — predicted vs ground-truth (if GT given)
      * ``{base_name}_regions.png``    — per-region crops with labels

    Args:
        image: Original RGB image (H, W, 3).
        prob_map: Per-pixel forgery probabilities at preprocessed resolution.
        mask: Refined binary (0/1) mask at preprocessed resolution.
        results: Filtered region dicts; bounding boxes in preprocessed coords.
        ground_truth: Optional grayscale GT mask resized to preprocessed
            resolution — assumes values in 0..255 (TODO confirm at call site).
        base_name: Output filename stem.
        output_path: Directory (Path) to write figures into.
        scale_x, scale_y: Preprocessed→original coordinate scale factors.
    """
    # 1. Probability heatmap
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.title('Original Document')
    plt.axis('off')

    plt.subplot(1, 3, 2)
    plt.imshow(prob_map, cmap='hot', vmin=0, vmax=1)
    plt.colorbar(label='Forgery Probability')
    plt.title('Probability Heatmap')
    plt.axis('off')

    plt.subplot(1, 3, 3)
    plt.imshow(mask, cmap='gray')
    plt.title('Binary Mask')
    plt.axis('off')

    plt.tight_layout()
    plt.savefig(output_path / f'{base_name}_heatmap.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(f" ✓ Saved heatmap")

    # 2. Overlay with bounding boxes and labels
    overlay = image.copy()
    alpha = 0.4

    # NOTE: an unused `mask_scaled = cv2.resize(...)` was removed here;
    # the colored overlay is built from the scaled bounding boxes only.
    colored_mask = np.zeros_like(image)

    for result in results:
        bbox = result['bounding_box']
        forgery_type = result['forgery_type_id']
        color = CLASS_COLORS[forgery_type]

        # Scale bounding box to original image coordinates
        x, y, w, h = bbox
        x_scaled = int(x * scale_x)
        y_scaled = int(y * scale_y)
        w_scaled = int(w * scale_x)
        h_scaled = int(h * scale_y)

        # Color the region
        colored_mask[y_scaled:y_scaled+h_scaled, x_scaled:x_scaled+w_scaled] = color

    # Blend with original
    overlay = cv2.addWeighted(overlay, 1-alpha, colored_mask, alpha, 0)

    # Draw bounding boxes and labels
    fig, ax = plt.subplots(1, figsize=(12, 8))
    ax.imshow(overlay)

    for result in results:
        bbox = result['bounding_box']
        x, y, w, h = bbox  # bbox is [x, y, w, h] in preprocessed coordinates

        # Scale to original image coordinates
        x_scaled = x * scale_x
        y_scaled = y * scale_y
        w_scaled = w * scale_x
        h_scaled = h * scale_y

        forgery_type = result['forgery_type']
        confidence = result['confidence']
        color_rgb = tuple(c/255 for c in CLASS_COLORS[result['forgery_type_id']])

        # Draw rectangle
        rect = patches.Rectangle((x_scaled, y_scaled), w_scaled, h_scaled,
                                 linewidth=2, edgecolor=color_rgb,
                                 facecolor='none')
        ax.add_patch(rect)

        # Add label
        label = f"{forgery_type}\n{confidence:.1%}"
        ax.text(x_scaled, y_scaled-10, label, color='white', fontsize=10,
                bbox=dict(boxstyle='round', facecolor=color_rgb, alpha=0.8))

    ax.axis('off')
    ax.set_title('Forgery Detection Results', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(output_path / f'{base_name}_overlay.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(f" ✓ Saved overlay")

    # 3. Comparison with ground truth (if available)
    if ground_truth is not None:
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        axes[0].imshow(image)
        axes[0].set_title('Original Document', fontsize=12)
        axes[0].axis('off')

        axes[1].imshow(ground_truth, cmap='gray')
        axes[1].set_title('Ground Truth', fontsize=12)
        axes[1].axis('off')

        axes[2].imshow(mask, cmap='gray')
        axes[2].set_title('Predicted Mask', fontsize=12)
        axes[2].axis('off')

        # Calculate metrics on binarized masks.
        gt_fg = ground_truth > 127
        pred_fg = mask > 0
        intersection = np.logical_and(gt_fg, pred_fg).sum()
        union = np.logical_or(gt_fg, pred_fg).sum()
        iou = intersection / (union + 1e-8)
        # BUG FIX: Dice previously divided by ground_truth.sum() + mask.sum(),
        # i.e. raw 0-255 GT values against a 0/1 mask, deflating the score by
        # up to ~255x. Use binarized pixel counts instead, consistent with
        # _create_comparison_visualization.
        dice = 2 * intersection / (gt_fg.sum() + pred_fg.sum() + 1e-8)

        fig.suptitle(f'Actual vs Predicted (IoU: {iou:.2%}, Dice: {dice:.2%})',
                     fontsize=14, fontweight='bold')

        plt.tight_layout()
        plt.savefig(output_path / f'{base_name}_comparison.png', dpi=150, bbox_inches='tight')
        plt.close()
        print(f" ✓ Saved comparison (IoU: {iou:.2%}, Dice: {dice:.2%})")

    # 4. Per-region visualization
    if len(results) > 0:
        n_regions = len(results)
        cols = min(4, n_regions)
        rows = (n_regions + cols - 1) // cols

        fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
        if n_regions == 1:
            axes = [axes]
        else:
            axes = axes.flatten()

        for i, result in enumerate(results):
            bbox = result['bounding_box']
            x, y, w, h = bbox  # bbox is [x, y, w, h] in preprocessed coordinates

            # Scale to original image coordinates
            x_scaled = int(x * scale_x)
            y_scaled = int(y * scale_y)
            w_scaled = int(w * scale_x)
            h_scaled = int(h * scale_y)

            region_img = image[y_scaled:y_scaled+h_scaled, x_scaled:x_scaled+w_scaled]

            axes[i].imshow(region_img)
            axes[i].set_title(f"Region {i+1}: {result['forgery_type']}\n"
                              f"Confidence: {result['confidence']:.1%}",
                              fontsize=10)
            axes[i].axis('off')

        # Hide unused subplots
        for i in range(n_regions, len(axes)):
            axes[i].axis('off')

        plt.tight_layout()
        plt.savefig(output_path / f'{base_name}_regions.png', dpi=150, bbox_inches='tight')
        plt.close()
        print(f" ✓ Saved region details")
+
def _create_json_output(self, image_path, results, ground_truth, base_name, output_path):
    """Assemble the detection summary and persist it as a JSON file.

    Returns the summary dict that was written to disk.
    """
    payload = {
        'image_path': str(image_path),
        'timestamp': datetime.now().isoformat(),
        'num_regions_detected': len(results),
        'regions': results,
    }

    # Record whether a ground-truth mask was available for this run.
    if ground_truth is not None:
        payload['has_ground_truth'] = True

    destination = output_path / f'{base_name}_results.json'
    with open(destination, 'w') as fh:
        json.dump(payload, fh, indent=2)

    print(" ✓ Saved JSON results")
    return payload
+
def _create_comparison_visualization(self, image, prob_map, mask, ground_truth,
                                     base_name, output_path):
    """Create comparison visualization between actual and predicted.

    Used when no regions were detected but a ground-truth mask exists:
    saves a 2x2 figure (original, GT, predicted mask, probability heatmap)
    annotated with IoU/Dice/precision/recall computed on binarized masks.
    """
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Original image
    axes[0, 0].imshow(image)
    axes[0, 0].set_title('Original Document', fontsize=14, fontweight='bold')
    axes[0, 0].axis('off')

    # Ground truth
    axes[0, 1].imshow(ground_truth, cmap='gray')
    axes[0, 1].set_title('Ground Truth (Actual)', fontsize=14, fontweight='bold')
    axes[0, 1].axis('off')

    # Predicted mask
    axes[1, 0].imshow(mask, cmap='gray')
    axes[1, 0].set_title('Predicted Mask', fontsize=14, fontweight='bold')
    axes[1, 0].axis('off')

    # Probability heatmap
    im = axes[1, 1].imshow(prob_map, cmap='hot', vmin=0, vmax=1)
    axes[1, 1].set_title('Probability Heatmap', fontsize=14, fontweight='bold')
    axes[1, 1].axis('off')
    plt.colorbar(im, ax=axes[1, 1], fraction=0.046, pad=0.04)

    # Calculate metrics: foreground = GT > 127 (assumes GT in 0..255 — TODO
    # confirm at call site) vs predicted mask > 0.
    intersection = np.logical_and(ground_truth > 127, mask > 0).sum()
    union = np.logical_or(ground_truth > 127, mask > 0).sum()
    gt_sum = (ground_truth > 127).sum()
    pred_sum = (mask > 0).sum()

    # Epsilon avoids division by zero when both masks are empty.
    iou = intersection / (union + 1e-8)
    dice = 2 * intersection / (gt_sum + pred_sum + 1e-8)
    precision = intersection / (pred_sum + 1e-8) if pred_sum > 0 else 0
    recall = intersection / (gt_sum + 1e-8) if gt_sum > 0 else 0

    fig.suptitle(f'Actual vs Predicted Comparison\n'
                 f'IoU: {iou:.2%} | Dice: {dice:.2%} | '
                 f'Precision: {precision:.2%} | Recall: {recall:.2%}',
                 fontsize=16, fontweight='bold')

    plt.tight_layout()
    plt.savefig(output_path / f'{base_name}_comparison.png', dpi=150, bbox_inches='tight')
    plt.close()
    print(f" ✓ Saved comparison (IoU: {iou:.2%}, Dice: {dice:.2%})")
+
def _create_clean_result(self, image, base_name, output_path, ground_truth=None):
    """Produce outputs for a document where no forgery regions were found.

    Saves a figure of the untouched document with a "clean" banner and a
    minimal JSON summary, then returns that summary dict.
    """
    # Figure: just the original document with a green "clean" title.
    plt.figure(figsize=(10, 8))
    plt.imshow(image)
    plt.title('No Forgery Detected', fontsize=14, fontweight='bold', color='green')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(output_path / f'{base_name}_clean.png', dpi=150, bbox_inches='tight')
    plt.close()

    # Minimal machine-readable result.
    summary = {
        'timestamp': datetime.now().isoformat(),
        'num_regions_detected': 0,
        'regions': [],
        'status': 'clean',
    }

    with open(output_path / f'{base_name}_results.json', 'w') as fh:
        json.dump(summary, fh, indent=2)

    return summary
+
|
| 524 |
+
def main():
    """CLI entry point: parse arguments, run the pipeline, print a summary."""
    cli = argparse.ArgumentParser(description='Document Forgery Detection Pipeline')
    cli.add_argument('--image', type=str, required=True,
                     help='Path to input document image')
    cli.add_argument('--ground_truth', type=str, default=None,
                     help='Path to ground truth mask (optional)')
    cli.add_argument('--output_dir', type=str, default='outputs/inference',
                     help='Output directory for results')
    cli.add_argument('--config', type=str, default='config.yaml',
                     help='Path to config file')
    opts = cli.parse_args()

    # Build the pipeline, then run detection on the requested image.
    detector = ForgeryDetectionPipeline(opts.config)
    results = detector.detect(
        opts.image,
        ground_truth_path=opts.ground_truth,
        output_dir=opts.output_dir,
    )

    # Human-readable summary of what was found.
    print("\nDetection Summary:")
    print(f" Regions detected: {results['num_regions_detected']}")
    if results['num_regions_detected'] > 0:
        for region in results['regions']:
            print(f" - {region['forgery_type']}: {region['confidence']:.1%} confidence")
|
| 555 |
+
if __name__ == '__main__':
|
| 556 |
+
main()
|
scripts/run_inference.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script for Document Forgery Detection
|
| 3 |
+
|
| 4 |
+
Run inference on single images or entire directories.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/run_inference.py --input path/to/image.jpg --model outputs/checkpoints/best_doctamper.pth
|
| 8 |
+
python scripts/run_inference.py --input path/to/folder/ --model outputs/checkpoints/best_doctamper.pth
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import json
|
| 15 |
+
|
| 16 |
+
# Add src to path
|
| 17 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 18 |
+
|
| 19 |
+
from src.config import get_config
|
| 20 |
+
from src.inference import get_pipeline
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def parse_args():
    """Collect command-line options for the inference run."""
    cli = argparse.ArgumentParser(description="Run forgery detection inference")

    # Mandatory inputs.
    for flag, text in (('--input', 'Input image or directory path'),
                       ('--model', 'Path to localization model checkpoint')):
        cli.add_argument(flag, type=str, required=True, help=text)

    # Optional settings with defaults.
    cli.add_argument('--classifier', type=str, default=None,
                     help='Path to classifier directory (optional)')
    cli.add_argument('--output', type=str, default='outputs/results',
                     help='Output directory')
    cli.add_argument('--is_text', action='store_true',
                     help='Enable OCR features for text documents')
    cli.add_argument('--config', type=str, default='config.yaml',
                     help='Path to config file')

    return cli.parse_args()
|
| 47 |
+
def process_file(pipeline, input_path: str, output_dir: str):
    """Run the detection pipeline on one file.

    Returns the pipeline's result dict, or None when processing fails —
    the error is reported but deliberately not re-raised so a batch run
    can continue with the remaining files.
    """
    try:
        return pipeline.run(input_path, output_dir)
    except Exception as exc:
        print(f"Error processing {input_path}: {exc}")
        return None
|
| 57 |
+
def main():
    """CLI entry point: run the detection pipeline over a single file or a
    directory of documents and write per-file results plus a JSON summary.
    """
    args = parse_args()

    # Load config
    config = get_config(args.config)

    print("\n" + "="*60)
    print("Hybrid Document Forgery Detection - Inference")
    print("="*60)
    print(f"Input: {args.input}")
    print(f"Model: {args.model}")
    print(f"Classifier: {args.classifier or 'None'}")
    print(f"Output: {args.output}")
    print("="*60)

    # Create pipeline
    pipeline = get_pipeline(
        config,
        model_path=args.model,
        classifier_path=args.classifier,
        is_text_document=args.is_text
    )

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Get input files: accept a single file or every supported document
    # in a directory (non-recursive).
    input_path = Path(args.input)

    if input_path.is_file():
        files = [input_path]
    elif input_path.is_dir():
        extensions = ['.jpg', '.jpeg', '.png', '.pdf', '.bmp', '.tiff']
        files = [f for f in input_path.iterdir()
                 if f.suffix.lower() in extensions]
    else:
        print(f"Invalid input path: {input_path}")
        return

    print(f"\nProcessing {len(files)} file(s)...")

    # Process files; process_file returns None on failure, which is skipped.
    all_results = []

    for file_path in files:
        result = process_file(pipeline, str(file_path), str(output_dir))
        if result:
            all_results.append(result)

            # Print summary
            status = "TAMPERED" if result['is_tampered'] else "AUTHENTIC"
            print(f"\n {file_path.name}: {status}")
            if result['is_tampered']:
                print(f" Regions detected: {result['num_regions']}")
                for region in result['regions'][:3]:  # Show first 3
                    print(f" - {region['forgery_type']} (conf: {region['confidence']:.2f})")

    # Save summary
    summary_path = output_dir / 'inference_summary.json'
    summary = {
        'total_files': len(files),
        'processed': len(all_results),
        'tampered': sum(1 for r in all_results if r['is_tampered']),
        'authentic': sum(1 for r in all_results if not r['is_tampered']),
        'results': all_results
    }

    # default=str makes non-JSON-native values (e.g. Paths) serializable.
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2, default=str)

    print("\n" + "="*60)
    print("Inference Complete!")
    print(f"Total: {summary['total_files']}, "
          f"Tampered: {summary['tampered']}, "
          f"Authentic: {summary['authentic']}")
    print(f"Results saved to: {output_dir}")
    print("="*60)
|
| 137 |
+
if __name__ == '__main__':
|
| 138 |
+
main()
|
scripts/setup.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Setup Script
|
| 3 |
+
|
| 4 |
+
Creates output directories and verifies installation.
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
python scripts/setup.py
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Add src to path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def create_directories():
    """Create the output directory tree used by training and inference."""
    root = Path(__file__).parent.parent

    # 'outputs' itself plus its expected subdirectories.
    subdirs = ['checkpoints', 'logs', 'plots', 'results', 'classifier', 'exported']
    targets = [root / 'outputs'] + [root / 'outputs' / name for name in subdirs]

    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created: {target}")
|
| 37 |
+
def verify_installation():
    """Import-check every required third-party package.

    Prints one status line per package plus a CUDA availability note, and
    returns the list of human-readable names that failed to import.
    """
    checks = [
        ('torch', 'PyTorch'),
        ('torchvision', 'TorchVision'),
        ('timm', 'TIMM'),
        ('lightgbm', 'LightGBM'),
        ('sklearn', 'Scikit-learn'),
        ('cv2', 'OpenCV'),
        ('PIL', 'Pillow'),
        ('numpy', 'NumPy'),
        ('pandas', 'Pandas'),
        ('matplotlib', 'Matplotlib'),
        ('seaborn', 'Seaborn'),
        ('albumentations', 'Albumentations'),
        ('tqdm', 'TQDM'),
        ('yaml', 'PyYAML'),
        ('pywt', 'PyWavelets'),
    ]

    print("\nVerifying installation...")
    print("-" * 40)

    missing = []

    for module_name, display_name in checks:
        try:
            __import__(module_name)
        except ImportError:
            print(f" ✗ {display_name} (MISSING)")
            missing.append(display_name)
        else:
            print(f" ✓ {display_name}")

    # Check CUDA
    print("-" * 40)
    try:
        import torch
        if torch.cuda.is_available():
            print(f" ✓ CUDA Available: {torch.cuda.get_device_name(0)}")
        else:
            print(" ⚠ CUDA Not Available (CPU mode)")
    except Exception as e:
        print(f" ✗ CUDA Check Failed: {e}")

    return missing
+
|
| 85 |
+
def verify_datasets():
    """Report whether each expected dataset directory exists on disk."""
    base_dir = Path(__file__).parent.parent

    dataset_dirs = {
        'DocTamper': base_dir / 'datasets' / 'DocTamper',
        'RTM': base_dir / 'datasets' / 'RealTextManipulation',
        'CASIA': base_dir / 'datasets' / 'CASIA 1.0 dataset',
        'Receipts': base_dir / 'datasets' / 'findit2',
    }

    print("\nVerifying datasets...")
    print("-" * 40)

    for label, location in dataset_dirs.items():
        line = (f" ✓ {label}: {location}" if location.exists()
                else f" ✗ {label}: NOT FOUND ({location})")
        print(line)
|
| 107 |
+
def main():
    """Run the full setup: create directories, verify packages and datasets."""
    print("\n" + "="*60)
    print("Hybrid Document Forgery Detection - Setup")
    print("="*60)

    # Create directories
    print("\nCreating directories...")
    print("-" * 40)
    create_directories()

    # Verify installation
    missing = verify_installation()

    # Verify datasets
    verify_datasets()

    # Summary
    print("\n" + "="*60)
    if missing:
        print("Setup complete with WARNINGS")
        print(f"Missing packages: {', '.join(missing)}")
        print("Run: pip install -r requirements.txt")
    else:
        print("Setup Complete! All checks passed.")
    # BUG FIX: the closing rule was previously printed only in the success
    # branch, leaving the warning output visually unterminated.
    print("="*60)
+
|
| 134 |
+
if __name__ == '__main__':
|
| 135 |
+
main()
|
scripts/train_chunked.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Chunked Training Script for Document Forgery Detection
|
| 3 |
+
|
| 4 |
+
Supports training on large datasets (DocTamper) in chunks to manage RAM constraints.
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/train_chunked.py --dataset doctamper --chunk 1
|
| 7 |
+
python scripts/train_chunked.py --dataset rtm
|
| 8 |
+
python scripts/train_chunked.py --dataset casia
|
| 9 |
+
python scripts/train_chunked.py --dataset receipts
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
# Add src to path
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import gc
|
| 22 |
+
|
| 23 |
+
from src.config import get_config
|
| 24 |
+
from src.training import get_trainer
|
| 25 |
+
from src.utils import plot_training_curves, plot_chunked_training_progress, generate_training_report
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def parse_args(argv=None):
    """Parse command-line arguments for training.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            preserves the original behavior of reading ``sys.argv[1:]``;
            passing an explicit list makes the function testable without
            mutating process state.

    Returns:
        argparse.Namespace with ``dataset``, ``chunk``, ``epochs``,
        ``resume`` and ``config`` attributes.
    """
    parser = argparse.ArgumentParser(description="Train forgery detection model")

    parser.add_argument('--dataset', type=str, default='doctamper',
                        choices=['doctamper', 'rtm', 'casia', 'receipts', 'fcd', 'scd'],
                        help='Dataset to train on')

    parser.add_argument('--chunk', type=int, default=None,
                        help='Chunk number (1-4) for DocTamper chunked training')

    parser.add_argument('--epochs', type=int, default=None,
                        help='Number of epochs (overrides config)')

    parser.add_argument('--resume', type=str, default=None,
                        help='Checkpoint to resume from')

    parser.add_argument('--config', type=str, default='config.yaml',
                        help='Path to config file')

    return parser.parse_args(argv)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def train_chunk(config, dataset_name: str, chunk_id: int, epochs: int = None, resume: str = None):
    """Run training for one chunk of a chunked dataset.

    Reads the chunk boundaries from config, optionally warm-starts from a
    checkpoint (an explicit ``resume`` path, or the previous chunk's final
    weights when they exist), trains, writes plots and a text report, then
    releases the trainer and cached GPU memory.

    Returns the training history returned by the trainer.
    """
    chunks = config.get('data.chunked_training.chunks', [])
    if chunk_id > len(chunks):
        raise ValueError(f"Invalid chunk ID: {chunk_id}. Max: {len(chunks)}")

    spec = chunks[chunk_id - 1]
    chunk_start = spec['start']
    chunk_end = spec['end']
    chunk_name = spec['name']

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training Chunk {chunk_id}: {chunk_name}")
    print(f"Range: {chunk_start*100:.0f}% - {chunk_end*100:.0f}%")
    print(f"{banner}")

    trainer = get_trainer(config, dataset_name)

    # Resume priority: explicit checkpoint wins; otherwise chain from the
    # previous chunk's final checkpoint when one is on disk. The epoch
    # counter is reset so each chunk trains its full schedule on new data.
    if resume:
        trainer.load_checkpoint(resume, reset_epoch=True)
    elif chunk_id > 1:
        prev_checkpoint = f'{dataset_name}_chunk{chunk_id-1}_final.pth'
        if (Path(config.get('outputs.checkpoints')) / prev_checkpoint).exists():
            print(f"Auto-resuming from previous chunk: {prev_checkpoint}")
            trainer.load_checkpoint(prev_checkpoint, reset_epoch=True)

    history = trainer.train(
        epochs=epochs,
        chunk_start=chunk_start,
        chunk_end=chunk_end,
        chunk_id=chunk_id,
        resume_from=None,  # checkpoint (if any) already loaded above
    )

    plot_dir = Path(config.get('outputs.plots', 'outputs/plots'))
    plot_dir.mkdir(parents=True, exist_ok=True)

    plot_training_curves(
        history,
        str(plot_dir / f'{dataset_name}_chunk{chunk_id}_curves.png'),
        title=f"{dataset_name.upper()} Chunk {chunk_id} Training",
    )

    generate_training_report(
        history,
        str(plot_dir / f'{dataset_name}_chunk{chunk_id}_report.txt'),
        f"{dataset_name} Chunk {chunk_id}",
    )

    # Free the trainer and cached CUDA blocks before the next chunk loads.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

    return history
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def train_full_dataset(config, dataset_name: str, epochs: int = None, resume: str = None):
    """Train on the entire dataset in a single pass (non-chunked datasets).

    Optionally warm-starts from ``resume`` with the epoch counter reset so
    the new dataset gets a full training schedule. Writes a curves plot and
    a text report, and returns the trainer's history.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Training on: {dataset_name.upper()}")
    print(f"{banner}")

    trainer = get_trainer(config, dataset_name)

    # Warm-start: load weights only; reset epochs for the new dataset.
    if resume:
        print(f"Loading weights from: {resume}")
        trainer.load_checkpoint(resume, reset_epoch=True)
        print("Epoch counter reset to 0 for new dataset training")

    history = trainer.train(
        epochs=epochs,
        chunk_id=0,
        resume_from=None,  # checkpoint (if any) already loaded above
    )

    plot_dir = Path(config.get('outputs.plots', 'outputs/plots'))
    plot_dir.mkdir(parents=True, exist_ok=True)

    plot_training_curves(
        history,
        str(plot_dir / f'{dataset_name}_training_curves.png'),
        title=f"{dataset_name.upper()} Training",
    )

    generate_training_report(
        history,
        str(plot_dir / f'{dataset_name}_report.txt'),
        dataset_name,
    )

    return history
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def main():
    """Entry point: dispatch to chunked or full-dataset training.

    Routes:
      * doctamper + --chunk N  -> train that single chunk
      * doctamper (no chunk)   -> train all 4 chunks sequentially
      * any other dataset      -> one full-dataset training run
    """
    args = parse_args()

    config = get_config(args.config)

    print("\n" + "="*60)
    print("Hybrid Document Forgery Detection - Training")
    print("="*60)
    print(f"Dataset: {args.dataset}")
    print(f"Device: {config.get('system.device')}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("="*60)

    # DocTamper: single requested chunk.
    if args.dataset == 'doctamper' and args.chunk is not None:
        train_chunk(
            config,
            args.dataset,
            args.chunk,
            epochs=args.epochs,
            resume=args.resume,
        )

    # DocTamper: all chunks sequentially. Chaining from the previous
    # chunk's final checkpoint happens inside train_chunk(), so resume is
    # always None here. (Was `None if chunk_id == 1 else None` — a dead
    # conditional that always evaluated to None.)
    elif args.dataset == 'doctamper':
        print("Training DocTamper in 4 chunks...")

        all_histories = []
        for chunk_id in range(1, 5):
            history = train_chunk(
                config,
                args.dataset,
                chunk_id,
                epochs=args.epochs,
                resume=None,
            )
            all_histories.append(history)

        # Combined progress plot across all chunks.
        plot_dir = Path(config.get('outputs.plots', 'outputs/plots'))
        plot_chunked_training_progress(
            all_histories,
            str(plot_dir / 'doctamper_all_chunks_progress.png'),
            title="DocTamper Chunked Training Progress",
        )

    # Other datasets: single full-dataset run.
    else:
        train_full_dataset(
            config,
            args.dataset,
            epochs=args.epochs,
            resume=args.resume,
        )

    print("\n" + "="*60)
    print("Training Complete!")
    print("="*60)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
scripts/train_classifier_doctamper_fixed.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LightGBM Classifier Training - DocTamper with Tampering Labels
|
| 3 |
+
FIXED VERSION with proper checkpointing and feature dimension handling
|
| 4 |
+
|
| 5 |
+
Implements Algorithm Steps 7-8:
|
| 6 |
+
7. Hybrid Feature Extraction
|
| 7 |
+
8. Region-wise Forgery Classification
|
| 8 |
+
|
| 9 |
+
Uses:
|
| 10 |
+
- Localization: best_doctamper.pth (Steps 1-6 complete)
|
| 11 |
+
- Training: DocTamper TrainingSet + tampering/DocTamperV1-TrainingSet.pk
|
| 12 |
+
- Testing: DocTamper TestingSet + tampering/DocTamperV1-TestingSet.pk
|
| 13 |
+
- Classes: Copy-Move (CM), Splicing (SP), Generation (GE)
|
| 14 |
+
|
| 15 |
+
Features:
|
| 16 |
+
- ✅ Checkpoint saving every 1000 samples
|
| 17 |
+
- ✅ Resume from checkpoint if interrupted
|
| 18 |
+
- ✅ Fixed feature dimension mismatch
|
| 19 |
+
- ✅ Robust error handling
|
| 20 |
+
|
| 21 |
+
Usage:
|
| 22 |
+
python scripts/train_classifier_doctamper_fixed.py
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
import sys
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
import numpy as np
|
| 28 |
+
import pickle
|
| 29 |
+
import lmdb
|
| 30 |
+
import cv2
|
| 31 |
+
import torch
|
| 32 |
+
from tqdm import tqdm
|
| 33 |
+
import json
|
| 34 |
+
|
| 35 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 36 |
+
|
| 37 |
+
from src.config import get_config
|
| 38 |
+
from src.models import get_model
|
| 39 |
+
from src.features import get_feature_extractor
|
| 40 |
+
from src.training.classifier import get_classifier
|
| 41 |
+
|
| 42 |
+
# --- Script configuration ---------------------------------------------------
# Localization model checkpoint (output of Steps 1-6).
MODEL_PATH = 'outputs/checkpoints/best_doctamper.pth'
# Where extracted features, checkpoints, and the trained classifier go.
OUTPUT_DIR = 'outputs/classifier'
# Effectively "no cap": process every available sample.
MAX_SAMPLES = 999999

# Forgery-type label encoding for Algorithm Step 8.2 (3 classes).
LABEL_MAP = {
    'CM': 0,  # Copy-Move
    'SP': 1,  # Splicing
    'GE': 2,  # Generation (AI-generated; distinct from Splicing)
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def load_tampering_labels(label_file):
    """Read the pickled forgery-type label mapping from ``label_file``.

    The DocTamper "tampering" files map a sample index to its forgery-type
    code (e.g. 'CM', 'SP', 'GE').
    """
    with open(label_file, 'rb') as fh:
        labels = pickle.load(fh)
    print(f"Loaded {len(labels)} labels from {label_file}")
    return labels
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def load_sample_from_lmdb(lmdb_env, index):
    """Fetch and decode the (image, mask) pair stored at ``index``.

    Returns ``(None, None)`` when either record is missing. The image is
    decoded to RGB; the mask to single-channel grayscale. DocTamper stores
    masks under 'label-' keys rather than 'mask-'.
    """
    txn = lmdb_env.begin()

    img_data = txn.get(f'image-{index:09d}'.encode('utf-8'))
    if not img_data:
        return None, None

    # Masks live under 'label-' keys in DocTamper's LMDB layout.
    mask_data = txn.get(f'label-{index:09d}'.encode('utf-8'))
    if not mask_data:
        return None, None

    image = cv2.imdecode(np.frombuffer(img_data, dtype=np.uint8), cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    mask = cv2.imdecode(np.frombuffer(mask_data, dtype=np.uint8), cv2.IMREAD_GRAYSCALE)

    return image, mask
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def extract_features(config, model, lmdb_path, tampering_labels,
                     max_samples, device, split_name):
    """
    Extract hybrid (handcrafted + deep) features with checkpoint/resume.

    Fixes over the previous version:
      * Checkpoints now record the last processed LMDB index
        (``last_index``). Resume previously used ``len(all_features)`` as
        the next LMDB index, which is wrong whenever samples are skipped
        (missing label, unknown forgery type, empty mask): the feature
        count lags the LMDB index, so resuming re-processed samples and
        produced duplicates. Legacy checkpoints without ``last_index``
        still load, falling back to the old behavior.
      * Checkpoint cleanup now sorts numerically by sample count instead
        of lexicographically (where '100000' < '20000'), matching the key
        used to select the resume checkpoint.

    Returns:
        (features_array, labels_array) as float32/int32 numpy arrays, or
        (None, None) if nothing was extracted.
    """

    print(f"\n{'='*60}")
    print(f"Extracting features from {split_name}")
    print(f"{'='*60}")

    # Setup checkpoint directory
    checkpoint_dir = Path(OUTPUT_DIR)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # Resume from the newest checkpoint when one exists.
    checkpoints = list(checkpoint_dir.glob(f'checkpoint_{split_name}_*.npz'))
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=lambda p: int(p.stem.split('_')[-1]))
        print(f"✓ Found checkpoint: {latest_checkpoint.name}")

        data = np.load(latest_checkpoint, allow_pickle=True)
        all_features = data['features'].tolist()
        all_labels = data['labels'].tolist()
        expected_dim = int(data['feature_dim'])
        if 'last_index' in data.files:
            # Correct resume point: one past the last processed LMDB index.
            start_idx = int(data['last_index']) + 1
        else:
            # Legacy checkpoint: fall back to the old (index == count) guess.
            start_idx = len(all_features)

        print(f"✓ Resuming from sample {start_idx}, feature_dim={expected_dim}")
    else:
        all_features = []
        all_labels = []
        expected_dim = None
        start_idx = 0

    # Open LMDB read-only (no lock: concurrent readers are fine).
    env = lmdb.open(lmdb_path, readonly=True, lock=False)

    feature_extractor = get_feature_extractor(config, is_text_document=True)

    # num_processed counts accepted samples (== len(all_features)), which
    # can be smaller than the LMDB index because of skips.
    num_processed = len(all_features)
    dim_mismatch_count = 0

    for i in tqdm(range(start_idx, min(len(tampering_labels), max_samples)),
                  desc=f"Processing {split_name}", initial=start_idx,
                  total=min(len(tampering_labels), max_samples)):
        try:
            # Skip samples without a label or with an unknown forgery type.
            if i not in tampering_labels:
                continue
            forgery_type = tampering_labels[i]
            if forgery_type not in LABEL_MAP:
                continue
            label = LABEL_MAP[forgery_type]

            image, mask = load_sample_from_lmdb(env, i)
            if image is None or mask is None:
                continue

            # Skip authentic samples: no forged region to classify.
            if mask.max() == 0:
                continue

            # HWC uint8 -> NCHW float in [0, 1] for the model.
            image_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
            image_tensor = image_tensor.unsqueeze(0).to(device)

            # Deep features from the frozen localization model.
            with torch.no_grad():
                logits, decoder_features = model(image_tensor)

            # Ground-truth mask drives region selection during training.
            mask_binary = (mask > 127).astype(np.uint8)

            features = feature_extractor.extract(
                image / 255.0,
                mask_binary,
                [f.cpu() for f in decoder_features]
            )

            # First valid sample fixes the expected feature width.
            if expected_dim is None:
                expected_dim = len(features)
                print(f"\n✓ Feature dimension set to: {expected_dim}")

            # Pad/truncate so every row has the same width.
            if len(features) != expected_dim:
                if len(features) < expected_dim:
                    features = np.pad(features, (0, expected_dim - len(features)), mode='constant')
                else:
                    features = features[:expected_dim]
                dim_mismatch_count += 1

            all_features.append(features)
            all_labels.append(label)
            num_processed += 1

            # Save a checkpoint every 10,000 accepted samples.
            if num_processed % 10000 == 0:
                checkpoint_path = checkpoint_dir / f'checkpoint_{split_name}_{num_processed}.npz'
                np.savez_compressed(checkpoint_path,
                                    features=np.array(all_features, dtype=np.float32),
                                    labels=np.array(all_labels, dtype=np.int32),
                                    feature_dim=expected_dim,
                                    last_index=i)  # enables correct resume
                print(f"\n✓ Checkpoint: {num_processed} samples (dim={expected_dim}, mismatches={dim_mismatch_count})")

                # Keep only the two newest checkpoints (numeric order).
                old_checkpoints = sorted(checkpoint_dir.glob(f'checkpoint_{split_name}_*.npz'),
                                         key=lambda p: int(p.stem.split('_')[-1]))
                if len(old_checkpoints) > 2:
                    for old_cp in old_checkpoints[:-2]:
                        old_cp.unlink()
                        print(f" Cleaned up: {old_cp.name}")

        except Exception as e:
            # Best-effort extraction: log and move on to the next sample.
            print(f"\n⚠ Error at sample {i}: {str(e)[:80]}")
            continue

    env.close()

    print(f"\n✓ Extracted {num_processed} samples")
    if dim_mismatch_count > 0:
        print(f"⚠ Fixed {dim_mismatch_count} dimension mismatches")

    # Save final features
    final_path = checkpoint_dir / f'features_{split_name}_final.npz'
    if len(all_features) > 0:
        features_array = np.array(all_features, dtype=np.float32)
        labels_array = np.array(all_labels, dtype=np.int32)

        np.savez_compressed(final_path,
                            features=features_array,
                            labels=labels_array,
                            feature_dim=expected_dim)
        print(f"✓ Final features saved: {final_path}")
        print(f" Shape: features={features_array.shape}, labels={labels_array.shape}")

        return features_array, labels_array

    return None, None
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def main():
    """End-to-end Steps 7-8: extract hybrid features, train LightGBM."""
    config = get_config('config.yaml')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    banner = "=" * 60
    print("\n" + banner)
    print("LightGBM Classifier Training - DocTamper (FIXED)")
    print("Implements Algorithm Steps 7-8")
    print(banner)
    print(f"Model: {MODEL_PATH}")
    print(f"Device: {device}")
    print(f"Max samples: {MAX_SAMPLES}")
    print(banner)
    print("\nForgery Type Classes (Step 8.2):")
    print(" 0: Copy-Move (CM)")
    print(" 1: Splicing (SP)")
    print(" 2: Generation (GE)")
    print(banner)

    # Load the trained localization model (Steps 1-6 output).
    print("\nLoading localization model...")
    model = get_model(config).to(device)
    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print(f"✓ Model loaded (Val Dice: {checkpoint.get('best_metric', 0):.4f})")

    # Forgery-type labels for both splits.
    train_labels = load_tampering_labels(
        'datasets/DocTamper/tampering/DocTamperV1-TrainingSet.pk'
    )
    test_labels = load_tampering_labels(
        'datasets/DocTamper/tampering/DocTamperV1-TestingSet.pk'
    )

    # Hybrid feature extraction (Step 7) for each split.
    X_train, y_train = extract_features(
        config, model,
        'datasets/DocTamper/DocTamperV1-TrainingSet',
        train_labels,
        MAX_SAMPLES,
        device,
        'TrainingSet'
    )
    X_test, y_test = extract_features(
        config, model,
        'datasets/DocTamper/DocTamperV1-TestingSet',
        test_labels,
        MAX_SAMPLES // 4,
        device,
        'TestingSet'
    )

    if X_train is None or X_test is None:
        print("\n❌ No features extracted!")
        return

    print("\n" + banner)
    print("Dataset Summary")
    print(banner)
    print(f"Training samples: {len(X_train):,}")
    print(f"Testing samples: {len(X_test):,}")
    print(f"Feature dimension: {X_train.shape[1]}")

    class_names = ['Copy-Move', 'Splicing', 'Generation']

    print(f"\nTraining class distribution:")
    for cls_idx, count in enumerate(np.bincount(y_train)):
        if cls_idx < len(class_names):
            print(f" {class_names[cls_idx]}: {count:,} ({count/len(y_train)*100:.1f}%)")

    print(f"\nTesting class distribution:")
    for cls_idx, count in enumerate(np.bincount(y_test)):
        if cls_idx < len(class_names):
            print(f" {class_names[cls_idx]}: {count:,} ({count/len(y_test)*100:.1f}%)")

    # Train the classifier (Step 8.1).
    print("\n" + banner)
    print("Training LightGBM Classifier (Step 8.1)")
    print(banner)

    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    classifier = get_classifier(config)
    feature_names = get_feature_extractor(config, is_text_document=True).get_feature_names()

    # The classifier does its own train/test split internally, so hand it
    # the combined pool.
    X_combined = np.vstack([X_train, X_test])
    y_combined = np.concatenate([y_train, y_test])

    metrics = classifier.train(X_combined, y_combined, feature_names=feature_names)

    # Persist classifier, metrics, and the class-id -> name mapping.
    classifier.save(str(output_dir))
    print(f"\n✓ Classifier saved to: {output_dir}")

    with open(output_dir / 'training_metrics.json', 'w') as fh:
        json.dump(metrics, fh, indent=2)

    class_mapping = {
        0: 'Copy-Move',
        1: 'Splicing',
        2: 'Generation'
    }
    with open(output_dir / 'class_mapping.json', 'w') as fh:
        json.dump(class_mapping, fh, indent=2)

    print("\n" + banner)
    print("✅ Classifier Training Complete!")
    print("Algorithm Steps 7-8: DONE")
    print(banner)
    print(f"\nResults:")
    print(f" Test Accuracy: {metrics.get('test_accuracy', 'N/A')}")
    print(f" Test F1 Score: {metrics.get('test_f1', 'N/A')}")
    print(f"\nOutput: {output_dir}")
    print("\nNext: Implement Steps 9-11 in inference pipeline")
    print(banner + "\n")
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|