"""Evaluate forensic detector on test datasets."""

import sys
import os
import glob
import json

import numpy as np

sys.path.insert(0, '.')  # so src.* resolves when the script is run from the repo root
from src.forensics.detector import ForensicDetector


def evaluate_dataset(detector, image_paths, label, threshold=0.5):
    """Run the detector over a set of images that share a known ground-truth label."""
    results = []
    for path in image_paths:
        try:
            result = detector.analyze(path)
            result['path'] = os.path.basename(path)
            result['true_label'] = label
            # Scores at or above the threshold are classified as fake.
            result['predicted'] = 'fake' if result['aggregate_score'] >= threshold else 'real'
            result['correct'] = result['predicted'] == label
            results.append(result)
        except Exception as e:
            print(f"Error processing {path}: {e}")
    return results


def print_analysis(all_results, threshold=0.5):
    """Print a detailed breakdown of results at the given threshold."""
    if not all_results:
        print("No results to analyze.")
        return []

    fake_results = [r for r in all_results if r['true_label'] == 'fake']
    real_results = [r for r in all_results if r['true_label'] == 'real']
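
    # Per-class accuracy: how many images of each ground-truth class were classified correctly.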
    fake_correct = sum(1 for r in fake_results if r['correct'])
    real_correct = sum(1 for r in real_results if r['correct'])

    print(f"\n{'='*60}")
    print(f"OVERALL RESULTS (threshold={threshold})")
    print(f"{'='*60}")
    print(f"FAKE images: {fake_correct}/{len(fake_results)} correct ({100*fake_correct/max(1, len(fake_results)):.1f}%)")
    print(f"REAL images: {real_correct}/{len(real_results)} correct ({100*real_correct/max(1, len(real_results)):.1f}%)")
    print(f"Total accuracy: {fake_correct + real_correct}/{len(all_results)} ({100*(fake_correct + real_correct)/max(1, len(all_results)):.1f}%)")
    features = [k for k in all_results[0].keys() if k.endswith('_score') and k != 'aggregate_score']

    print(f"\n{'='*60}")
    print("FEATURE DISCRIMINATION ANALYSIS")
    print("(Higher fake_mean - real_mean = better discriminator)")
    print(f"{'='*60}")

    discriminators = []
    for feat in features:
        fake_scores = [r[feat] for r in fake_results]
        real_scores = [r[feat] for r in real_results]
        fake_mean = np.mean(fake_scores)
        real_mean = np.mean(real_scores)
        discrimination = fake_mean - real_mean
        discriminators.append((feat, discrimination, fake_mean, real_mean,
                               np.std(fake_scores), np.std(real_scores)))
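
    # Strongest discriminators first: largest positive gap between fake and real means.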
    discriminators.sort(key=lambda x: x[1], reverse=True)

    print(f"\n{'Feature':<30} {'Discrim':>8} {'Fake μ':>8} {'Real μ':>8} {'Fake σ':>8} {'Real σ':>8}")
    print("-" * 78)
    for feat, disc, fake_m, real_m, fake_s, real_s in discriminators:
        print(f"{feat:<30} {disc:>+8.3f} {fake_m:>8.3f} {real_m:>8.3f} {fake_s:>8.3f} {real_s:>8.3f}")

    print(f"\n{'='*60}")
    print("AGGREGATE SCORE DISTRIBUTION")
    print(f"{'='*60}")
    fake_agg = [r['aggregate_score'] for r in fake_results]
    real_agg = [r['aggregate_score'] for r in real_results]
    print(f"FAKE: mean={np.mean(fake_agg):.3f}, std={np.std(fake_agg):.3f}, min={np.min(fake_agg):.3f}, max={np.max(fake_agg):.3f}")
    print(f"REAL: mean={np.mean(real_agg):.3f}, std={np.std(real_agg):.3f}, min={np.min(real_agg):.3f}, max={np.max(real_agg):.3f}")

    print(f"\n{'='*60}")
    print("MISCLASSIFIED EXAMPLES")
    print(f"{'='*60}")

    missed_fakes = [r for r in fake_results if not r['correct']]
    false_positives = [r for r in real_results if not r['correct']]

    print(f"\nMissed FAKE images (predicted as real): {len(missed_fakes)}")
    for r in missed_fakes[:10]:
        print(f"  {r['path']}: agg={r['aggregate_score']:.3f}")

    print(f"\nFalse positives (real predicted as fake): {len(false_positives)}")
    for r in false_positives[:10]:
        print(f"  {r['path']}: agg={r['aggregate_score']:.3f}")

    return discriminators


def main():
    detector = ForensicDetector()

    data_dir = '/home/omer_aims_ac_za/digital-integrity-challenge/data'
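
    # Known-fake images: AI-generated sets plus manipulated photos.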
    fake_paths = []
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated_v2/*.png'))
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.png'))
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.jpg'))
    fake_paths.extend(glob.glob(f'{data_dir}/manipulated/*.jpg'))
    fake_paths.extend(glob.glob(f'{data_dir}/test_subset/manip/*.jpg'))
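
    # Known-real images.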
    real_paths = []
    real_paths.extend(glob.glob(f'{data_dir}/real/*.jpg'))
    real_paths.extend(glob.glob(f'{data_dir}/test_subset/real/*.jpg'))

    print(f"Found {len(fake_paths)} fake images and {len(real_paths)} real images")

    print("\nProcessing fake images...")
    fake_results = evaluate_dataset(detector, fake_paths, 'fake')
    print(f"Processed {len(fake_results)} fake images")

    print("\nProcessing real images...")
    real_results = evaluate_dataset(detector, real_paths, 'real')
    print(f"Processed {len(real_results)} real images")

    all_results = fake_results + real_results
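
    # Predictions were made at the default 0.5 threshold inside evaluate_dataset,
    # so recompute them for each candidate threshold before printing the analysis.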
    for threshold in [0.35, 0.40, 0.45, 0.50]:
        for r in all_results:
            r['predicted'] = 'fake' if r['aggregate_score'] >= threshold else 'real'
            r['correct'] = r['predicted'] == r['true_label']

        print_analysis(all_results, threshold)
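
    # Persist per-image results for later inspection.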
    with open('/tmp/forensic_eval_results.json', 'w') as f:
        json.dump(all_results, f, indent=2)
    print("\nDetailed results saved to /tmp/forensic_eval_results.json")


if __name__ == "__main__":
    main()