#!/usr/bin/env python3
"""Evaluate forensic detector on test datasets."""
import sys
import os
import glob
import json

import numpy as np

sys.path.insert(0, '.')
from src.forensics.detector import ForensicDetector


def evaluate_dataset(detector, image_paths, label, threshold=0.5):
    """Evaluate detector on a set of images with known label."""
    results = []
    for path in image_paths:
        try:
            result = detector.analyze(path)
            result['path'] = os.path.basename(path)
            result['true_label'] = label
            result['predicted'] = 'fake' if result['aggregate_score'] >= threshold else 'real'
            result['correct'] = result['predicted'] == label
            results.append(result)
        except Exception as e:
            print(f"Error processing {path}: {e}")
    return results


def print_analysis(all_results, threshold=0.5):
    """Print detailed analysis of results."""
    if not all_results:
        print("No results to analyze")
        return []

    fake_results = [r for r in all_results if r['true_label'] == 'fake']
    real_results = [r for r in all_results if r['true_label'] == 'real']

    # Calculate accuracy
    fake_correct = sum(1 for r in fake_results if r['correct'])
    real_correct = sum(1 for r in real_results if r['correct'])

    print(f"\n{'='*60}")
    print(f"OVERALL RESULTS (threshold={threshold})")
    print(f"{'='*60}")
    print(f"FAKE images: {fake_correct}/{len(fake_results)} correct "
          f"({100*fake_correct/max(1, len(fake_results)):.1f}%)")
    print(f"REAL images: {real_correct}/{len(real_results)} correct "
          f"({100*real_correct/max(1, len(real_results)):.1f}%)")
    print(f"Total accuracy: {fake_correct + real_correct}/{len(all_results)} "
          f"({100*(fake_correct + real_correct)/max(1, len(all_results)):.1f}%)")

    # Per-feature analysis
    features = [k for k in all_results[0].keys()
                if k.endswith('_score') and k != 'aggregate_score']

    print(f"\n{'='*60}")
    print("FEATURE DISCRIMINATION ANALYSIS")
    print("(Higher fake_mean - real_mean = better discriminator)")
    print(f"{'='*60}")

    discriminators = []
    for feat in features:
        fake_scores = [r[feat] for r in fake_results]
        real_scores = [r[feat] for r in real_results]
        fake_mean = np.mean(fake_scores)
        real_mean = np.mean(real_scores)
        discrimination = fake_mean - real_mean  # Positive = good (fake scores higher)
        discriminators.append((feat, discrimination, fake_mean, real_mean,
                               np.std(fake_scores), np.std(real_scores)))

    # Sort by discrimination power
    discriminators.sort(key=lambda x: x[1], reverse=True)

    print(f"\n{'Feature':<30} {'Discrim':>8} {'Fake μ':>8} {'Real μ':>8} {'Fake σ':>8} {'Real σ':>8}")
    print("-" * 78)
    for feat, disc, fake_m, real_m, fake_s, real_s in discriminators:
        print(f"{feat:<30} {disc:>+8.3f} {fake_m:>8.3f} {real_m:>8.3f} {fake_s:>8.3f} {real_s:>8.3f}")

    # Aggregate score distribution
    print(f"\n{'='*60}")
    print("AGGREGATE SCORE DISTRIBUTION")
    print(f"{'='*60}")
    fake_agg = [r['aggregate_score'] for r in fake_results]
    real_agg = [r['aggregate_score'] for r in real_results]
    print(f"FAKE: mean={np.mean(fake_agg):.3f}, std={np.std(fake_agg):.3f}, "
          f"min={np.min(fake_agg):.3f}, max={np.max(fake_agg):.3f}")
    print(f"REAL: mean={np.mean(real_agg):.3f}, std={np.std(real_agg):.3f}, "
          f"min={np.min(real_agg):.3f}, max={np.max(real_agg):.3f}")

    # Show misclassified examples
    print(f"\n{'='*60}")
    print("MISCLASSIFIED EXAMPLES")
    print(f"{'='*60}")
    missed_fakes = [r for r in fake_results if not r['correct']]
    false_positives = [r for r in real_results if not r['correct']]

    print(f"\nMissed FAKE images (predicted as real): {len(missed_fakes)}")
    for r in missed_fakes[:10]:
        print(f"  {r['path']}: agg={r['aggregate_score']:.3f}")

    print(f"\nFalse positives (real predicted as fake): {len(false_positives)}")
    for r in false_positives[:10]:
        print(f"  {r['path']}: agg={r['aggregate_score']:.3f}")

    return discriminators


def main():
    detector = ForensicDetector()

    # Collect image paths
    data_dir = '/home/omer_aims_ac_za/digital-integrity-challenge/data'

    # AI-generated and manipulated images (fake)
    fake_paths = []
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated_v2/*.png'))
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.png'))
    fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.jpg'))
    fake_paths.extend(glob.glob(f'{data_dir}/manipulated/*.jpg'))
    fake_paths.extend(glob.glob(f'{data_dir}/test_subset/manip/*.jpg'))

    # Real images
    real_paths = []
    real_paths.extend(glob.glob(f'{data_dir}/real/*.jpg'))
    real_paths.extend(glob.glob(f'{data_dir}/test_subset/real/*.jpg'))

    print(f"Found {len(fake_paths)} fake images and {len(real_paths)} real images")

    # Run evaluation
    print("\nProcessing fake images...")
    fake_results = evaluate_dataset(detector, fake_paths, 'fake')
    print(f"Processed {len(fake_results)} fake images")

    print("\nProcessing real images...")
    real_results = evaluate_dataset(detector, real_paths, 'real')
    print(f"Processed {len(real_results)} real images")

    all_results = fake_results + real_results

    # Test different thresholds
    for threshold in [0.35, 0.40, 0.45, 0.50]:
        # Recalculate predictions with the new threshold
        for r in all_results:
            r['predicted'] = 'fake' if r['aggregate_score'] >= threshold else 'real'
            r['correct'] = r['predicted'] == r['true_label']
        print_analysis(all_results, threshold)

    # Save detailed results; default=float covers numpy scalars, which
    # json.dump cannot serialize on its own
    with open('/tmp/forensic_eval_results.json', 'w') as f:
        json.dump(all_results, f, indent=2, default=float)
    print("\nDetailed results saved to /tmp/forensic_eval_results.json")


if __name__ == "__main__":
    main()