asimz committed
Commit 64e935c · verified · 1 Parent(s): e5de66d

Upload folder using huggingface_hub

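For reference, an upload like this is typically produced with the huggingface_hub client's upload_folder call. The sketch below is an assumption about how the commit was made; the folder path and repo id are placeholders, not details recovered from this page.

    from huggingface_hub import HfApi

    api = HfApi()
    # Push every file in a local folder to the Hub as a single commit.
    api.upload_folder(
        folder_path=".",                  # local project folder (placeholder)
        repo_id="asimz/REPO_NAME",        # hypothetical repo id
        commit_message="Upload folder using huggingface_hub",
    )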
.gitignore ADDED
@@ -0,0 +1,30 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *.so
+ .Python
+ *.egg-info/
+ dist/
+ build/
+
+ # Data and outputs (don't commit large files)
+ data/
+ datasets/
+ outputs/
+ *.json
+ *.zip
+
+ # Environment
+ .env
+ .venv/
+ venv/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+
+ # OS
+ .DS_Store
+ Thumbs.db
analyze_images.py ADDED
@@ -0,0 +1,155 @@
+ #!/usr/bin/env python3
+ """Analyze specific images to understand real vs fake characteristics."""
+
+ import cv2
+ import numpy as np
+ from glob import glob
+ import os
+
+ def analyze_image(img_path):
+     """Detailed analysis of an image."""
+     img = cv2.imread(img_path)
+     if img is None:
+         return None
+
+     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+     h, w = gray.shape
+
+     results = {'shape': img.shape}
+
+     # 1. Basic stats
+     results['mean_brightness'] = np.mean(gray)
+     results['std_brightness'] = np.std(gray)
+
+     # 2. FFT analysis - look at specific frequencies
+     f_transform = np.fft.fft2(gray)
+     f_shift = np.fft.fftshift(f_transform)
+     magnitude = np.abs(f_shift)
+
+     center_h, center_w = h // 2, w // 2
+     max_radius = min(h, w) // 2
+
+     # Create distance map
+     y, x = np.ogrid[:h, :w]
+     distance = np.sqrt((y - center_h) ** 2 + (x - center_w) ** 2)
+
+     # Energy in bands
+     low_mask = distance < (max_radius * 0.1)
+     mid_mask = (distance >= max_radius * 0.1) & (distance < max_radius * 0.4)
+     high_mask = (distance >= max_radius * 0.4) & (distance < max_radius * 0.9)
+
+     low_energy = np.mean(magnitude[low_mask])
+     mid_energy = np.mean(magnitude[mid_mask])
+     high_energy = np.mean(magnitude[high_mask])
+     total = low_energy + mid_energy + high_energy
+
+     results['fft_low_ratio'] = low_energy / total
+     results['fft_mid_ratio'] = mid_energy / total
+     results['fft_high_ratio'] = high_energy / total
+
+     # 3. Noise analysis
+     blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+     noise = gray - blurred
+     results['noise_std'] = np.std(noise)
+     results['noise_mean'] = np.mean(np.abs(noise))
+
+     # Noise uniformity across regions (separate block sizes per axis,
+     # so non-square images are tiled correctly)
+     region_stds = []
+     block_h, block_w = h // 4, w // 4
+     for i in range(4):
+         for j in range(4):
+             block = noise[i*block_h:(i+1)*block_h, j*block_w:(j+1)*block_w]
+             region_stds.append(np.std(block))
+     results['noise_uniformity'] = np.std(region_stds) / (np.mean(region_stds) + 1e-10)
+
+     # 4. Laplacian variance (sharpness)
+     gray_uint8 = gray.astype(np.uint8)
+     laplacian = cv2.Laplacian(gray_uint8, cv2.CV_64F)
+     results['laplacian_var'] = laplacian.var()
+
+     # 5. Edge density
+     edges = cv2.Canny(gray.astype(np.uint8), 50, 150)
+     results['edge_density'] = np.mean(edges > 0)
+
+     # 6. Local variance statistics
+     kernel_size = 15
+     local_mean = cv2.blur(gray, (kernel_size, kernel_size))
+     local_sqr_mean = cv2.blur(gray ** 2, (kernel_size, kernel_size))
+     local_var = local_sqr_mean - local_mean ** 2
+
+     results['local_var_mean'] = np.mean(local_var)
+     results['local_var_std'] = np.std(local_var)
+     results['smooth_ratio'] = np.mean(local_var < 50)
+
+     # 7. DCT analysis on 8x8 blocks
+     ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
+     y_channel = ycrcb[:, :, 0].astype(np.float32)
+     h8, w8 = (h // 8) * 8, (w // 8) * 8
+     y_cropped = y_channel[:h8, :w8]
+
+     dct_stats = []
+     for i in range(0, h8, 8):
+         for j in range(0, w8, 8):
+             block = y_cropped[i:i+8, j:j+8]
+             dct = cv2.dct(block)
+             # High frequency energy (bottom-right of DCT block)
+             hf_energy = np.mean(np.abs(dct[4:, 4:]))
+             dct_stats.append(hf_energy)
+
+     results['dct_hf_mean'] = np.mean(dct_stats)
+     results['dct_hf_std'] = np.std(dct_stats)
+
+     # 8. Color saturation
+     hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+     saturation = hsv[:, :, 1]
+     results['sat_mean'] = np.mean(saturation)
+     results['sat_std'] = np.std(saturation)
+
+     return results
+
+ def main():
+     data_dir = "data/ai_generated_v2"
+     images = glob(os.path.join(data_dir, "*.png"))
+
+     real_stats = {}
+     fake_stats = {}
+
+     for img_path in sorted(images):
+         filename = os.path.basename(img_path)
+         is_fake = "images_fake_" in filename
+
+         results = analyze_image(img_path)
+         if results is None:
+             continue
+
+         target = fake_stats if is_fake else real_stats
+         for k, v in results.items():
+             if k == 'shape':
+                 continue
+             if k not in target:
+                 target[k] = []
+             target[k].append(v)
+
+     print("\n" + "="*70)
+     print("DETAILED FEATURE COMPARISON: REAL vs FAKE")
+     print("="*70)
+
+     # Sort by absolute difference
+     features = []
+     for k in real_stats.keys():
+         real_mean = np.mean(real_stats[k])
+         fake_mean = np.mean(fake_stats[k])
+         diff = fake_mean - real_mean
+         sep = abs(diff) / (np.std(real_stats[k]) + np.std(fake_stats[k]) + 1e-10)
+         features.append((k, real_mean, fake_mean, diff, sep))
+
+     features.sort(key=lambda x: -abs(x[4]))  # Sort by separation
+
+     for k, real_mean, fake_mean, diff, sep in features:
+         print(f"\n{k}:")
+         print(f" Real: {real_mean:.4f} ± {np.std(real_stats[k]):.4f}")
+         print(f" Fake: {fake_mean:.4f} ± {np.std(fake_stats[k]):.4f}")
+         print(f" Diff: {diff:+.4f} | Separation: {sep:.3f}")
+
+ if __name__ == "__main__":
+     main()
download_samples.py ADDED
@@ -0,0 +1,15 @@
+ """Download sample images for testing."""
+ import urllib.request
+ import os
+
+ os.makedirs("data/real", exist_ok=True)
+ os.makedirs("data/manipulated", exist_ok=True)
+
+ # Real estate image sources (we'll use the sample we already have)
+ print("Sample images ready in data/test/")
+ print("For full testing, add real and AI-generated real estate images to:")
+ print(" - data/real/")
+ print(" - data/manipulated/")
+ print("\nYou can generate fake images using:")
+ print(" - DALL-E / Midjourney / Flux with 'modern kitchen interior' prompts")
+ print(" - Virtual staging tools")
eval_detector.py ADDED
@@ -0,0 +1,133 @@
+ #!/usr/bin/env python3
+ """Evaluate forensic detector on test dataset."""
+
+ import os
+ import sys
+ import numpy as np
+ from glob import glob
+
+ sys.path.insert(0, '/home/omer_aims_ac_za/digital-integrity-challenge')
+ from src.forensics.detector import ForensicDetector
+
+ def evaluate():
+     detector = ForensicDetector()
+     data_dir = "data/ai_generated_v2"
+
+     images = glob(os.path.join(data_dir, "*.png"))
+
+     real_scores = []
+     fake_scores = []
+     all_results = []
+
+     for img_path in sorted(images):
+         filename = os.path.basename(img_path)
+         # Check for images_fake_ vs images_real_ pattern
+         is_fake = "images_fake_" in filename
+
+         try:
+             results = detector.analyze(img_path)
+             score = results["aggregate_score"]
+
+             all_results.append({
+                 'filename': filename,
+                 'is_fake': is_fake,
+                 'score': score,
+                 'results': results
+             })
+
+             if is_fake:
+                 fake_scores.append(score)
+             else:
+                 real_scores.append(score)
+
+         except Exception as e:
+             print(f"Error processing {filename}: {e}")
+
+     print("\n" + "="*60)
+     print("SCORE DISTRIBUTION")
+     print("="*60)
+     print(f"\nReal images (n={len(real_scores)}):")
+     print(f" Mean: {np.mean(real_scores):.3f}")
+     print(f" Std: {np.std(real_scores):.3f}")
+     print(f" Min: {np.min(real_scores):.3f}")
+     print(f" Max: {np.max(real_scores):.3f}")
+
+     print(f"\nFake images (n={len(fake_scores)}):")
+     print(f" Mean: {np.mean(fake_scores):.3f}")
+     print(f" Std: {np.std(fake_scores):.3f}")
+     print(f" Min: {np.min(fake_scores):.3f}")
+     print(f" Max: {np.max(fake_scores):.3f}")
+
+     # Find optimal threshold
+     print("\n" + "="*60)
+     print("THRESHOLD ANALYSIS")
+     print("="*60)
+
+     best_acc = 0
+     best_thresh = 0.5
+
+     for thresh in np.arange(0.2, 0.8, 0.01):
+         real_correct = sum(1 for s in real_scores if s < thresh)
+         fake_correct = sum(1 for s in fake_scores if s >= thresh)
+         acc = (real_correct + fake_correct) / (len(real_scores) + len(fake_scores))
+
+         if acc > best_acc:
+             best_acc = acc
+             best_thresh = thresh
+
+     print(f"\nBest threshold: {best_thresh:.2f}")
+     print(f"Best accuracy: {best_acc*100:.1f}%")
+
+     # Per-feature analysis
+     print("\n" + "="*60)
+     print("PER-FEATURE ANALYSIS (mean fake - mean real)")
+     print("="*60)
+
+     feature_names = ['fft_score', 'ela_score', 'noise_score', 'texture_score',
+                      'compression_score', 'edge_score', 'sharpness_score',
+                      'rich_poor_texture_score', 'color_consistency_score',
+                      'lbp_score', 'glcm_score']
+
+     for feat in feature_names:
+         real_feat = [r['results'][feat] for r in all_results if not r['is_fake']]
+         fake_feat = [r['results'][feat] for r in all_results if r['is_fake']]
+
+         diff = np.mean(fake_feat) - np.mean(real_feat)
+
+         # Calculate feature's individual accuracy
+         best_feat_acc = 0
+         best_feat_dir = 1
+         for thresh in np.arange(0.1, 0.9, 0.02):
+             for direction in [1, -1]:
+                 if direction == 1:
+                     real_c = sum(1 for s in real_feat if s < thresh)
+                     fake_c = sum(1 for s in fake_feat if s >= thresh)
+                 else:
+                     real_c = sum(1 for s in real_feat if s >= thresh)
+                     fake_c = sum(1 for s in fake_feat if s < thresh)
+                 acc = (real_c + fake_c) / (len(real_feat) + len(fake_feat))
+                 if acc > best_feat_acc:
+                     best_feat_acc = acc
+                     best_feat_dir = direction
+
+         dir_str = "(+)" if best_feat_dir == 1 else "(-)"
+         print(f" {feat:28s}: diff={diff:+.3f} acc={best_feat_acc*100:.1f}% {dir_str}")
+         print(f" Real: {np.mean(real_feat):.3f}±{np.std(real_feat):.3f} | Fake: {np.mean(fake_feat):.3f}±{np.std(fake_feat):.3f}")
+
+     # Show misclassified examples
+     print("\n" + "="*60)
+     print("MISCLASSIFIED EXAMPLES (at threshold 0.5)")
+     print("="*60)
+
+     print("\nFalse positives (real classified as fake):")
+     for r in sorted(all_results, key=lambda x: -x['score']):
+         if not r['is_fake'] and r['score'] >= 0.5:
+             print(f" {r['filename']}: {r['score']:.3f}")
+
+     print("\nFalse negatives (fake classified as real):")
+     for r in sorted(all_results, key=lambda x: x['score']):
+         if r['is_fake'] and r['score'] < 0.5:
+             print(f" {r['filename']}: {r['score']:.3f}")
+
+ if __name__ == "__main__":
+     evaluate()
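As an aside, the exhaustive 0.01-step threshold sweep above can be replaced by checking only the thresholds where predictions actually change; a minimal sketch assuming scikit-learn is available (it is not listed in requirements.txt):

    import numpy as np
    from sklearn.metrics import roc_curve

    def best_threshold(real_scores, fake_scores):
        """Pick the threshold maximizing accuracy, trying only the
        candidate thresholds where the ROC curve changes."""
        y_true = np.array([0] * len(real_scores) + [1] * len(fake_scores))
        y_score = np.array(list(real_scores) + list(fake_scores))
        _, _, thresholds = roc_curve(y_true, y_score)
        accs = [((y_score >= t) == y_true).mean() for t in thresholds]
        i = int(np.argmax(accs))
        return float(thresholds[i]), float(accs[i])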
eval_forensics.py ADDED
@@ -0,0 +1,112 @@
+ #!/usr/bin/env python3
+ """Evaluate forensics detector on real vs Flux-generated images."""
+
+ import sys
+ sys.path.insert(0, '.')
+
+ from pathlib import Path
+ import numpy as np
+ from src.forensics.detector import ForensicDetector
+
+ REAL_DIR = Path("data/real")
+ FAKE_DIR = Path("data/ai_generated_v2")
+
+ def evaluate():
+     detector = ForensicDetector()
+
+     # Real estate photos (definitely real)
+     real_estate_files = sorted(REAL_DIR.glob("*.jpg"))
+
+     # From ai_generated_v2: files with "_fake_" are AI, files with "_real_" are real
+     all_v2_files = sorted(FAKE_DIR.glob("*.png"))
+     fake_files = [f for f in all_v2_files if "_fake_" in f.name]
+     real_v2_files = [f for f in all_v2_files if "_real_" in f.name]
+
+     # Combine all real files
+     all_real_files = list(real_estate_files) + list(real_v2_files)
+
+     print(f"Testing {len(all_real_files)} real ({len(real_estate_files)} real_estate + {len(real_v2_files)} v2_real)")
+     print(f"Testing {len(fake_files)} fake (AI-generated)\n")
+
+     real_scores = []
+     fake_scores = []
+     real_details = []
+     fake_details = []
+
+     print("=== REAL IMAGES ===")
+     for f in all_real_files:
+         try:
+             result = detector.analyze(str(f))
+             score = result['aggregate_score']
+             real_scores.append(score)
+             real_details.append((f.name, result))
+             verdict = "CORRECT" if score < 0.5 else "WRONG"
+             print(f"{f.name}: {score:.3f} - {verdict}")
+         except Exception as e:
+             print(f"{f.name}: ERROR - {e}")
+
+     print("\n=== FAKE (AI-GENERATED) IMAGES ===")
+     for f in fake_files:
+         try:
+             result = detector.analyze(str(f))
+             score = result['aggregate_score']
+             fake_scores.append(score)
+             fake_details.append((f.name, result))
+             verdict = "CORRECT" if score >= 0.5 else "WRONG"
+             print(f"{f.name}: {score:.3f} - {verdict}")
+         except Exception as e:
+             print(f"{f.name}: ERROR - {e}")
+
+     # Calculate accuracy
+     real_correct = sum(1 for s in real_scores if s < 0.5)
+     fake_correct = sum(1 for s in fake_scores if s >= 0.5)
+
+     print("\n" + "="*60)
+     print("SUMMARY")
+     print("="*60)
+     print(f"Real images: {real_correct}/{len(real_scores)} correct ({100*real_correct/len(real_scores):.1f}%)")
+     print(f"Fake images: {fake_correct}/{len(fake_scores)} correct ({100*fake_correct/len(fake_scores):.1f}%)")
+     total = len(real_scores) + len(fake_scores)
+     print(f"Overall: {real_correct + fake_correct}/{total} ({100*(real_correct + fake_correct)/total:.1f}%)")
+
+     print(f"\nReal scores: mean={np.mean(real_scores):.3f}, std={np.std(real_scores):.3f}")
+     print(f"Fake scores: mean={np.mean(fake_scores):.3f}, std={np.std(fake_scores):.3f}")
+     print(f"Separation: {np.mean(fake_scores) - np.mean(real_scores):.3f}")
+
+     # Analyze which signals discriminate best
+     print("\n" + "="*60)
+     print("SIGNAL DISCRIMINATION ANALYSIS (d' = Cohen's d)")
+     print("="*60)
+
+     signals = ['fft_score', 'ela_score', 'noise_score', 'texture_score',
+                'compression_score', 'edge_score', 'sharpness_score',
+                'rich_poor_texture_score', 'color_consistency_score',
+                'lbp_score', 'glcm_score']
+
+     disc_power = []
+     for sig in signals:
+         real_vals = [d[1][sig] for d in real_details]
+         fake_vals = [d[1][sig] for d in fake_details]
+
+         real_mean = np.mean(real_vals)
+         fake_mean = np.mean(fake_vals)
+         separation = fake_mean - real_mean
+
+         # Calculate discrimination power (Cohen's d)
+         real_std = np.std(real_vals)
+         fake_std = np.std(fake_vals)
+         pooled_std = np.sqrt((real_std**2 + fake_std**2) / 2)
+         d_prime = separation / (pooled_std + 1e-10)
+         disc_power.append((sig, d_prime, separation, real_mean, fake_mean))
+
+         print(f"{sig:25s}: real={real_mean:.3f}, fake={fake_mean:.3f}, sep={separation:+.3f}, d'={d_prime:+.2f}")
+
+     # Sort by absolute discrimination power
+     disc_power.sort(key=lambda x: abs(x[1]), reverse=True)
+     print("\n=== TOP DISCRIMINATORS (by |d'|) ===")
+     for sig, dp, sep, rm, fm in disc_power[:5]:
+         direction = "HIGHER for fake" if sep > 0 else "LOWER for fake"
+         print(f"{sig:25s}: d'={dp:+.2f} ({direction})")
+
+ if __name__ == "__main__":
+     evaluate()
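The d' reported above is Cohen's d with a pooled standard deviation; a small self-check of the same formula:

    import numpy as np

    def cohens_d(real_vals, fake_vals):
        """Cohen's d with pooled std, matching the loop in evaluate()."""
        sep = np.mean(fake_vals) - np.mean(real_vals)
        pooled = np.sqrt((np.std(real_vals) ** 2 + np.std(fake_vals) ** 2) / 2)
        return sep / (pooled + 1e-10)

    # Means 0.3 vs 0.5 with equal stds of ~0.0816 give d' ~ 2.45
    assert abs(cohens_d([0.2, 0.3, 0.4], [0.4, 0.5, 0.6]) - 2.449) < 1e-2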
evaluate_forensics.py ADDED
@@ -0,0 +1,146 @@
+ #!/usr/bin/env python3
+ """Evaluate forensic detector on test datasets."""
+
+ import sys
+ import os
+ import glob
+ import json
+ import numpy as np
+
+ sys.path.insert(0, '.')
+ from src.forensics.detector import ForensicDetector
+
+ def evaluate_dataset(detector, image_paths, label, threshold=0.5):
+     """Evaluate detector on a set of images with known label."""
+     results = []
+     for path in image_paths:
+         try:
+             result = detector.analyze(path)
+             result['path'] = os.path.basename(path)
+             result['true_label'] = label
+             result['predicted'] = 'fake' if result['aggregate_score'] >= threshold else 'real'
+             result['correct'] = (label == 'fake' and result['predicted'] == 'fake') or \
+                                 (label == 'real' and result['predicted'] == 'real')
+             results.append(result)
+         except Exception as e:
+             print(f"Error processing {path}: {e}")
+     return results
+
+ def print_analysis(all_results, threshold=0.5):
+     """Print detailed analysis of results."""
+     fake_results = [r for r in all_results if r['true_label'] == 'fake']
+     real_results = [r for r in all_results if r['true_label'] == 'real']
+
+     # Calculate accuracy
+     fake_correct = sum(1 for r in fake_results if r['correct'])
+     real_correct = sum(1 for r in real_results if r['correct'])
+
+     print(f"\n{'='*60}")
+     print(f"OVERALL RESULTS (threshold={threshold})")
+     print(f"{'='*60}")
+     print(f"FAKE images: {fake_correct}/{len(fake_results)} correct ({100*fake_correct/max(1,len(fake_results)):.1f}%)")
+     print(f"REAL images: {real_correct}/{len(real_results)} correct ({100*real_correct/max(1,len(real_results)):.1f}%)")
+     print(f"Total accuracy: {(fake_correct+real_correct)}/{len(all_results)} ({100*(fake_correct+real_correct)/max(1,len(all_results)):.1f}%)")
+
+     # Per-feature analysis
+     features = [k for k in all_results[0].keys() if k.endswith('_score') and k != 'aggregate_score']
+
+     print(f"\n{'='*60}")
+     print("FEATURE DISCRIMINATION ANALYSIS")
+     print("(Higher fake_mean - real_mean = better discriminator)")
+     print(f"{'='*60}")
+
+     discriminators = []
+     for feat in features:
+         fake_scores = [r[feat] for r in fake_results]
+         real_scores = [r[feat] for r in real_results]
+         fake_mean = np.mean(fake_scores)
+         real_mean = np.mean(real_scores)
+         discrimination = fake_mean - real_mean  # Positive = good (fake scores higher)
+         discriminators.append((feat, discrimination, fake_mean, real_mean, np.std(fake_scores), np.std(real_scores)))
+
+     # Sort by discrimination power
+     discriminators.sort(key=lambda x: x[1], reverse=True)
+
+     print(f"\n{'Feature':<30} {'Discrim':>8} {'Fake μ':>8} {'Real μ':>8} {'Fake σ':>8} {'Real σ':>8}")
+     print("-" * 78)
+     for feat, disc, fake_m, real_m, fake_s, real_s in discriminators:
+         print(f"{feat:<30} {disc:>+8.3f} {fake_m:>8.3f} {real_m:>8.3f} {fake_s:>8.3f} {real_s:>8.3f}")
+
+     # Aggregate score distribution
+     print(f"\n{'='*60}")
+     print("AGGREGATE SCORE DISTRIBUTION")
+     print(f"{'='*60}")
+     fake_agg = [r['aggregate_score'] for r in fake_results]
+     real_agg = [r['aggregate_score'] for r in real_results]
+     print(f"FAKE: mean={np.mean(fake_agg):.3f}, std={np.std(fake_agg):.3f}, min={np.min(fake_agg):.3f}, max={np.max(fake_agg):.3f}")
+     print(f"REAL: mean={np.mean(real_agg):.3f}, std={np.std(real_agg):.3f}, min={np.min(real_agg):.3f}, max={np.max(real_agg):.3f}")
+
+     # Show misclassified examples
+     print(f"\n{'='*60}")
+     print("MISCLASSIFIED EXAMPLES")
+     print(f"{'='*60}")
+
+     missed_fakes = [r for r in fake_results if not r['correct']]
+     false_positives = [r for r in real_results if not r['correct']]
+
+     print(f"\nMissed FAKE images (predicted as real): {len(missed_fakes)}")
+     for r in missed_fakes[:10]:
+         print(f" {r['path']}: agg={r['aggregate_score']:.3f}")
+
+     print(f"\nFalse positives (real predicted as fake): {len(false_positives)}")
+     for r in false_positives[:10]:
+         print(f" {r['path']}: agg={r['aggregate_score']:.3f}")
+
+     return discriminators
+
+ def main():
+     detector = ForensicDetector()
+     all_results = []
+
+     # Collect image paths
+     data_dir = '/home/omer_aims_ac_za/digital-integrity-challenge/data'
+
+     # AI generated images (fake)
+     fake_paths = []
+     fake_paths.extend(glob.glob(f'{data_dir}/ai_generated_v2/*.png'))
+     fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.png'))
+     fake_paths.extend(glob.glob(f'{data_dir}/ai_generated/*.jpg'))
+     fake_paths.extend(glob.glob(f'{data_dir}/manipulated/*.jpg'))
+     fake_paths.extend(glob.glob(f'{data_dir}/test_subset/manip/*.jpg'))
+
+     # Real images
+     real_paths = []
+     real_paths.extend(glob.glob(f'{data_dir}/real/*.jpg'))
+     real_paths.extend(glob.glob(f'{data_dir}/test_subset/real/*.jpg'))
+
+     print(f"Found {len(fake_paths)} fake images and {len(real_paths)} real images")
+
+     # Run evaluation
+     print("\nProcessing fake images...")
+     fake_results = evaluate_dataset(detector, fake_paths, 'fake')
+     print(f"Processed {len(fake_results)} fake images")
+
+     print("\nProcessing real images...")
+     real_results = evaluate_dataset(detector, real_paths, 'real')
+     print(f"Processed {len(real_results)} real images")
+
+     all_results = fake_results + real_results
+
+     # Test different thresholds
+     for threshold in [0.35, 0.40, 0.45, 0.50]:
+         # Recalculate predictions with new threshold
+         for r in all_results:
+             r['predicted'] = 'fake' if r['aggregate_score'] >= threshold else 'real'
+             r['correct'] = (r['true_label'] == 'fake' and r['predicted'] == 'fake') or \
+                            (r['true_label'] == 'real' and r['predicted'] == 'real')
+
+         print_analysis(all_results, threshold)
+
+     # Save detailed results
+     with open('/tmp/forensic_eval_results.json', 'w') as f:
+         json.dump(all_results, f, indent=2)
+     print("\nDetailed results saved to /tmp/forensic_eval_results.json")
+
+ if __name__ == "__main__":
+     main()
improved_detector.py ADDED
@@ -0,0 +1,407 @@
+ #!/usr/bin/env python3
+ """
+ Improved Forensic Detector - optimized for Flux-generated images.
+ Based on empirical analysis of the ai_generated_v2 dataset.
+
+ Key findings from analysis:
+ - DCT high-frequency energy: Real > Fake (most discriminative)
+ - Local variance: Real > Fake (more texture detail)
+ - Saturation: Real > Fake
+ - Brightness: Real < Fake
+
+ Strategy: focus on the most discriminative features and combine them with proper weighting.
+ """
+
+ import cv2
+ import numpy as np
+ from PIL import Image
+ from typing import Dict
+ import tempfile
+ import os
+
+
+ class ImprovedForensicDetector:
+     """Optimized detector for AI-generated real estate images."""
+
+     def __init__(self):
+         pass
+
+     def analyze(self, image_path: str) -> Dict:
+         """Run all forensic analyses on an image."""
+         img = cv2.imread(image_path)
+         if img is None:
+             raise ValueError(f"Could not load image: {image_path}")
+
+         results = {}
+
+         # === CORE FEATURES (most discriminative) ===
+
+         # 1. DCT High-Frequency Analysis (BEST discriminator)
+         results["dct_hf_score"] = self._dct_high_freq_analysis(img)
+
+         # 2. Local Variance Analysis (second best)
+         results["local_variance_score"] = self._local_variance_analysis(img)
+
+         # 3. Saturation Analysis
+         results["saturation_score"] = self._saturation_analysis(img)
+
+         # 4. Brightness Analysis
+         results["brightness_score"] = self._brightness_analysis(img)
+
+         # === SUPPORTING FEATURES ===
+
+         # 5. Texture complexity
+         results["texture_complexity_score"] = self._texture_complexity(img)
+
+         # 6. Noise pattern analysis
+         results["noise_pattern_score"] = self._noise_pattern_analysis(img)
+
+         # 7. Gradient distribution
+         results["gradient_score"] = self._gradient_distribution(img)
+
+         # 8. Color channel consistency
+         results["color_channel_score"] = self._color_channel_analysis(img)
+
+         # === AGGREGATION ===
+         # All scores are now: 0 = likely real, 1 = likely fake
+
+         # Weights based on discriminative power from analysis
+         weights = {
+             "dct_hf_score": 0.25,          # Best discriminator
+             "local_variance_score": 0.20,  # Second best
+             "saturation_score": 0.15,      # Good discriminator
+             "brightness_score": 0.10,      # Moderate
+             "texture_complexity_score": 0.12,
+             "noise_pattern_score": 0.08,
+             "gradient_score": 0.05,
+             "color_channel_score": 0.05,
+         }
+
+         results["aggregate_score"] = sum(
+             results[k] * weights[k] for k in weights
+         )
+
+         return results
+
+     def _dct_high_freq_analysis(self, img: np.ndarray) -> float:
+         """
+         DCT high-frequency energy analysis.
+
+         Real images have MORE high-frequency DCT content;
+         fake images are smoother, with less HF energy.
+
+         Lower HF energy = more likely fake.
+         """
+         ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
+         y_channel = ycrcb[:, :, 0].astype(np.float32)
+         h, w = y_channel.shape
+
+         h8, w8 = (h // 8) * 8, (w // 8) * 8
+         if h8 < 16 or w8 < 16:
+             return 0.5
+
+         y_cropped = y_channel[:h8, :w8]
+
+         hf_energies = []
+         total_energies = []
+
+         for i in range(0, h8, 8):
+             for j in range(0, w8, 8):
+                 block = y_cropped[i:i+8, j:j+8]
+                 dct = cv2.dct(block)
+
+                 # High frequency: bottom-right quadrant of 8x8 DCT
+                 hf_energy = np.mean(np.abs(dct[4:, 4:]))
+                 # Total energy for normalization
+                 total_energy = np.mean(np.abs(dct))
+
+                 hf_energies.append(hf_energy)
+                 total_energies.append(total_energy)
+
+         mean_hf = np.mean(hf_energies)
+
+         # From analysis: Real ~1.86, Fake ~0.89
+         # Score: lower HF = higher fake score
+         if mean_hf < 0.5:
+             score = 0.9   # Very low HF, likely fake
+         elif mean_hf < 1.0:
+             score = 0.7
+         elif mean_hf < 1.5:
+             score = 0.5
+         elif mean_hf < 2.0:
+             score = 0.3
+         else:
+             score = 0.15  # High HF, likely real
+
+         return float(np.clip(score, 0, 1))
+
+     def _local_variance_analysis(self, img: np.ndarray) -> float:
+         """
+         Local variance analysis.
+
+         Real images have MORE local variance (more texture detail);
+         fake images tend to be smoother.
+
+         Lower variance = more likely fake.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         kernel_size = 15
+         local_mean = cv2.blur(gray, (kernel_size, kernel_size))
+         local_sqr_mean = cv2.blur(gray ** 2, (kernel_size, kernel_size))
+         local_var = local_sqr_mean - local_mean ** 2
+
+         mean_local_var = np.mean(local_var)
+         std_local_var = np.std(local_var)
+
+         # From analysis: Real ~514, Fake ~412
+         # Score: lower variance = higher fake score
+         if mean_local_var < 300:
+             score = 0.8   # Very smooth
+         elif mean_local_var < 400:
+             score = 0.65
+         elif mean_local_var < 500:
+             score = 0.45  # Borderline
+         elif mean_local_var < 600:
+             score = 0.3
+         else:
+             score = 0.15  # High variance, likely real
+
+         # Also consider variance of variance (texture complexity)
+         if std_local_var < 700:
+             score = min(score + 0.1, 1.0)  # Less varied = more suspicious
+
+         return float(np.clip(score, 0, 1))
+
+     def _saturation_analysis(self, img: np.ndarray) -> float:
+         """
+         Saturation analysis.
+
+         Real images tend to be MORE saturated;
+         fake images often have lower or inconsistent saturation.
+         """
+         hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+         saturation = hsv[:, :, 1]
+
+         sat_mean = np.mean(saturation)
+         sat_std = np.std(saturation)
+
+         # From analysis: Real ~95, Fake ~76
+         # Lower saturation = more likely fake
+         if sat_mean < 60:
+             score = 0.75
+         elif sat_mean < 80:
+             score = 0.55
+         elif sat_mean < 100:
+             score = 0.35
+         else:
+             score = 0.2
+
+         return float(np.clip(score, 0, 1))
+
+     def _brightness_analysis(self, img: np.ndarray) -> float:
+         """
+         Brightness analysis.
+
+         Fake images tend to be BRIGHTER.
+         Real: ~112, Fake: ~128
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         mean_brightness = np.mean(gray)
+
+         # Higher brightness = more likely fake
+         if mean_brightness > 140:
+             score = 0.7
+         elif mean_brightness > 125:
+             score = 0.55
+         elif mean_brightness > 110:
+             score = 0.4
+         else:
+             score = 0.25
+
+         return float(np.clip(score, 0, 1))
+
+     def _texture_complexity(self, img: np.ndarray) -> float:
+         """
+         Texture complexity using gradient analysis.
+
+         Real images: more varied gradients.
+         Fake images: smoother gradients.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         # Sobel gradients
+         sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+         sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+         gradient_mag = np.sqrt(sobelx ** 2 + sobely ** 2)
+
+         # Gradient statistics
+         grad_mean = np.mean(gradient_mag)
+         grad_std = np.std(gradient_mag)
+
+         # Coefficient of variation of gradients
+         grad_cv = grad_std / (grad_mean + 1e-10)
+
+         # Low gradient CV = uniform gradients = suspicious
+         if grad_cv < 1.5:
+             score = 0.7
+         elif grad_cv < 2.0:
+             score = 0.5
+         else:
+             score = 0.3
+
+         return float(np.clip(score, 0, 1))
+
+     def _noise_pattern_analysis(self, img: np.ndarray) -> float:
+         """
+         Noise pattern analysis.
+
+         Real images: stochastic sensor noise.
+         Fake images: structured/uniform noise.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         # Extract noise
+         blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+         noise = gray - blurred
+
+         noise_std = np.std(noise)
+
+         # Analyze noise uniformity across regions
+         h, w = noise.shape
+         block_h, block_w = h // 4, w // 4
+
+         region_stds = []
+         for i in range(4):
+             for j in range(4):
+                 if block_h > 0 and block_w > 0:
+                     block = noise[i*block_h:(i+1)*block_h, j*block_w:(j+1)*block_w]
+                     if block.size > 0:
+                         region_stds.append(np.std(block))
+
+         if len(region_stds) < 4:
+             return 0.5
+
+         # Coefficient of variation of regional noise stds
+         cv = np.std(region_stds) / (np.mean(region_stds) + 1e-10)
+
+         # Very uniform noise = suspicious (AI generates uniform noise)
+         if cv < 0.2:
+             score = 0.7   # Too uniform
+         elif cv < 0.3:
+             score = 0.5
+         elif cv < 0.5:
+             score = 0.35
+         else:
+             score = 0.2   # Natural variation
+
+         # Also check absolute noise level
+         if noise_std < 4:
+             score = max(score, 0.6)  # Very low noise is suspicious
+
+         return float(np.clip(score, 0, 1))
+
+     def _gradient_distribution(self, img: np.ndarray) -> float:
+         """
+         Gradient distribution analysis.
+
+         Checks for unusual gradient patterns.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         # Edges
+         edges = cv2.Canny(gray.astype(np.uint8), 50, 150)
+         edge_density = np.mean(edges > 0)
+
+         # From analysis: Real ~0.107, Fake ~0.096
+         # Lower edge density = slightly more suspicious
+         if edge_density < 0.05:
+             score = 0.65
+         elif edge_density < 0.08:
+             score = 0.5
+         elif edge_density < 0.12:
+             score = 0.4
+         else:
+             score = 0.3
+
+         return float(np.clip(score, 0, 1))
+
+     def _color_channel_analysis(self, img: np.ndarray) -> float:
+         """
+         Color channel consistency analysis.
+         """
+         b, g, r = cv2.split(img)
+
+         def get_noise_std(channel):
+             blurred = cv2.GaussianBlur(channel, (5, 5), 0)
+             noise = channel.astype(np.float32) - blurred.astype(np.float32)
+             return np.std(noise)
+
+         r_noise = get_noise_std(r)
+         g_noise = get_noise_std(g)
+         b_noise = get_noise_std(b)
+
+         # Coefficient of variation of noise across channels
+         noise_cv = np.std([r_noise, g_noise, b_noise]) / (np.mean([r_noise, g_noise, b_noise]) + 1e-10)
+
+         if noise_cv > 0.3:
+             score = 0.65  # High variation is suspicious
+         elif noise_cv > 0.15:
+             score = 0.45
+         else:
+             score = 0.3
+
+         return float(np.clip(score, 0, 1))
+
+
+ # Test if run directly
+ if __name__ == "__main__":
+     import sys
+     from glob import glob
+     import os
+
+     detector = ImprovedForensicDetector()
+     data_dir = "data/ai_generated_v2"
+
+     images = glob(os.path.join(data_dir, "*.png"))
+
+     real_scores = []
+     fake_scores = []
+
+     for img_path in sorted(images):
+         filename = os.path.basename(img_path)
+         is_fake = "images_fake_" in filename
+
+         try:
+             results = detector.analyze(img_path)
+             score = results["aggregate_score"]
+
+             if is_fake:
+                 fake_scores.append(score)
+             else:
+                 real_scores.append(score)
+
+         except Exception as e:
+             print(f"Error: {filename}: {e}")
+
+     print("\n" + "="*60)
+     print("IMPROVED DETECTOR RESULTS")
+     print("="*60)
+     print(f"\nReal (n={len(real_scores)}): {np.mean(real_scores):.3f} ± {np.std(real_scores):.3f}")
+     print(f"Fake (n={len(fake_scores)}): {np.mean(fake_scores):.3f} ± {np.std(fake_scores):.3f}")
+
+     # Find best threshold
+     best_acc = 0
+     best_thresh = 0.5
+
+     for thresh in np.arange(0.2, 0.8, 0.01):
+         real_correct = sum(1 for s in real_scores if s < thresh)
+         fake_correct = sum(1 for s in fake_scores if s >= thresh)
+         acc = (real_correct + fake_correct) / (len(real_scores) + len(fake_scores))
+
+         if acc > best_acc:
+             best_acc = acc
+             best_thresh = thresh
+
+     print(f"\nBest threshold: {best_thresh:.2f}")
+     print(f"Best accuracy: {best_acc*100:.1f}%")
optimized_detector.py ADDED
@@ -0,0 +1,272 @@
+ #!/usr/bin/env python3
+ """
+ Optimized Forensic Detector - based on research and empirical analysis.
+
+ Key insight from RESEARCH.md:
+ - Diffusion models show artifacts at periods 2, 4, 8
+ - AI images are smoother and lack high-frequency details
+ - DCT HF energy is the best single discriminator
+
+ From feature analysis:
+ - DCT HF mean: Real=1.86±1.70, Fake=0.89±1.01 (separation=0.357)
+ - Local variance: Real=514±332, Fake=412±222 (separation=0.185)
+ - Saturation: Real=95±42, Fake=76±45 (separation=0.222)
+
+ Strategy: use z-score normalization and sigmoid scoring for continuous output.
+ """
+
+ import cv2
+ import numpy as np
+ from scipy import ndimage
+ from typing import Dict, Tuple
+ import warnings
+ warnings.filterwarnings('ignore')
+
+
+ class OptimizedForensicDetector:
+     """Optimized detector using research-backed features."""
+
+     # Empirical distributions from the ai_generated_v2 dataset
+     STATS = {
+         'dct_hf': {'real_mean': 1.86, 'real_std': 1.70, 'fake_mean': 0.89, 'fake_std': 1.01},
+         'local_var': {'real_mean': 514, 'real_std': 332, 'fake_mean': 412, 'fake_std': 222},
+         'saturation': {'real_mean': 95, 'real_std': 42, 'fake_mean': 76, 'fake_std': 45},
+         'brightness': {'real_mean': 112, 'real_std': 19, 'fake_mean': 128, 'fake_std': 38},
+     }
+
+     def __init__(self):
+         pass
+
+     def analyze(self, image_path: str) -> Dict:
+         """Analyze image and return fake probability."""
+         img = cv2.imread(image_path)
+         if img is None:
+             raise ValueError(f"Could not load image: {image_path}")
+
+         results = {}
+
+         # Extract raw features
+         dct_hf = self._extract_dct_hf(img)
+         local_var = self._extract_local_variance(img)
+         saturation = self._extract_saturation(img)
+         brightness = self._extract_brightness(img)
+
+         results['dct_hf_raw'] = dct_hf
+         results['local_var_raw'] = local_var
+         results['saturation_raw'] = saturation
+         results['brightness_raw'] = brightness
+
+         # Convert to fake probability using a likelihood ratio:
+         # P(fake|feature) ∝ P(feature|fake) / P(feature|real)
+
+         dct_score = self._feature_to_score(dct_hf, 'dct_hf', invert=True)              # Lower = more fake
+         var_score = self._feature_to_score(local_var, 'local_var', invert=True)        # Lower = more fake
+         sat_score = self._feature_to_score(saturation, 'saturation', invert=True)      # Lower = more fake
+         bright_score = self._feature_to_score(brightness, 'brightness', invert=False)  # Higher = more fake
+
+         results['dct_hf_score'] = dct_score
+         results['local_var_score'] = var_score
+         results['saturation_score'] = sat_score
+         results['brightness_score'] = bright_score
+
+         # Weighted combination - based on separation scores
+         # DCT HF has the best separation (0.357), then saturation (0.222), then local_var (0.185)
+         weights = {
+             'dct': 0.45,     # Best discriminator
+             'sat': 0.25,     # Second best
+             'var': 0.20,     # Third
+             'bright': 0.10,  # Weakest
+         }
+
+         aggregate = (
+             weights['dct'] * dct_score +
+             weights['sat'] * sat_score +
+             weights['var'] * var_score +
+             weights['bright'] * bright_score
+         )
+
+         results['aggregate_score'] = float(np.clip(aggregate, 0, 1))
+
+         return results
+
+     def _extract_dct_hf(self, img: np.ndarray) -> float:
+         """Extract DCT high-frequency energy."""
+         ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
+         y = ycrcb[:, :, 0].astype(np.float32)
+         h, w = y.shape
+
+         h8, w8 = (h // 8) * 8, (w // 8) * 8
+         if h8 < 16 or w8 < 16:
+             return 1.0  # Default to neutral
+
+         y = y[:h8, :w8]
+         hf_energies = []
+
+         for i in range(0, h8, 8):
+             for j in range(0, w8, 8):
+                 block = y[i:i+8, j:j+8]
+                 dct = cv2.dct(block)
+                 # High frequency: bottom-right 4x4 of 8x8 DCT
+                 hf_energy = np.mean(np.abs(dct[4:, 4:]))
+                 hf_energies.append(hf_energy)
+
+         return float(np.mean(hf_energies))
+
+     def _extract_local_variance(self, img: np.ndarray) -> float:
+         """Extract mean local variance (texture complexity)."""
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         kernel_size = 15
+         local_mean = cv2.blur(gray, (kernel_size, kernel_size))
+         local_sqr_mean = cv2.blur(gray ** 2, (kernel_size, kernel_size))
+         local_var = local_sqr_mean - local_mean ** 2
+
+         return float(np.mean(local_var))
+
+     def _extract_saturation(self, img: np.ndarray) -> float:
+         """Extract mean saturation."""
+         hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+         return float(np.mean(hsv[:, :, 1]))
+
+     def _extract_brightness(self, img: np.ndarray) -> float:
+         """Extract mean brightness."""
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         return float(np.mean(gray))
+
+     def _feature_to_score(self, value: float, feature: str, invert: bool) -> float:
+         """
+         Convert a raw feature to a fake probability using a likelihood ratio.
+
+         Uses a Gaussian assumption:
+             score = P(value|fake) / (P(value|fake) + P(value|real))
+
+         If invert=True, lower values indicate fake (so we flip the logic).
+         """
+         stats = self.STATS[feature]
+
+         # Compute likelihoods (Gaussian PDF, but we only need the ratio)
+         def gaussian_log_likelihood(x, mean, std):
+             if std < 1e-6:
+                 std = 1e-6
+             return -0.5 * ((x - mean) / std) ** 2
+
+         ll_fake = gaussian_log_likelihood(value, stats['fake_mean'], stats['fake_std'])
+         ll_real = gaussian_log_likelihood(value, stats['real_mean'], stats['real_std'])
+
+         # Softmax to get a probability:
+         # P(fake) = exp(ll_fake) / (exp(ll_fake) + exp(ll_real))
+         #         = 1 / (1 + exp(ll_real - ll_fake))
+         diff = ll_real - ll_fake
+
+         # Clip to avoid overflow
+         diff = np.clip(diff, -20, 20)
+
+         score = 1.0 / (1.0 + np.exp(diff))
+
+         return float(score)
+
+
+ def evaluate_detector():
+     """Evaluate on the dataset."""
+     from glob import glob
+     import os
+
+     detector = OptimizedForensicDetector()
+     data_dir = "data/ai_generated_v2"
+
+     images = glob(os.path.join(data_dir, "*.png"))
+
+     real_scores = []
+     fake_scores = []
+
+     for img_path in sorted(images):
+         filename = os.path.basename(img_path)
+         is_fake = "images_fake_" in filename
+
+         try:
+             results = detector.analyze(img_path)
+             score = results["aggregate_score"]
+
+             if is_fake:
+                 fake_scores.append(score)
+             else:
+                 real_scores.append(score)
+
+         except Exception as e:
+             print(f"Error: {filename}: {e}")
+
+     print("\n" + "="*60)
+     print("OPTIMIZED DETECTOR RESULTS (Likelihood Ratio)")
+     print("="*60)
+     print(f"\nReal (n={len(real_scores)}): {np.mean(real_scores):.3f} ± {np.std(real_scores):.3f}")
+     print(f"Fake (n={len(fake_scores)}): {np.mean(fake_scores):.3f} ± {np.std(fake_scores):.3f}")
+     print(f"Separation: {np.mean(fake_scores) - np.mean(real_scores):.3f}")
+
+     # Find best threshold
+     best_acc = 0
+     best_thresh = 0.5
+     best_f1 = 0
+
+     all_scores = real_scores + fake_scores
+     all_labels = [0] * len(real_scores) + [1] * len(fake_scores)
+
+     for thresh in np.arange(0.2, 0.8, 0.01):
+         tp = sum(1 for s, l in zip(all_scores, all_labels) if s >= thresh and l == 1)
+         tn = sum(1 for s, l in zip(all_scores, all_labels) if s < thresh and l == 0)
+         fp = sum(1 for s, l in zip(all_scores, all_labels) if s >= thresh and l == 0)
+         fn = sum(1 for s, l in zip(all_scores, all_labels) if s < thresh and l == 1)
+
+         acc = (tp + tn) / (tp + tn + fp + fn)
+         precision = tp / (tp + fp) if (tp + fp) > 0 else 0
+         recall = tp / (tp + fn) if (tp + fn) > 0 else 0
+         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+         if acc > best_acc:
+             best_acc = acc
+             best_thresh = thresh
+         if f1 > best_f1:
+             best_f1 = f1
+
+     print(f"\nBest threshold: {best_thresh:.2f}")
+     print(f"Best accuracy: {best_acc*100:.1f}%")
+     print(f"Best F1: {best_f1:.3f}")
+
+     # Per-feature analysis
+     print("\n" + "="*60)
+     print("PER-FEATURE PERFORMANCE")
+     print("="*60)
+
+     for feature in ['dct_hf', 'local_var', 'saturation', 'brightness']:
+         real_feat = []
+         fake_feat = []
+
+         for img_path in sorted(images):
+             filename = os.path.basename(img_path)
+             is_fake = "images_fake_" in filename
+
+             try:
+                 results = detector.analyze(img_path)
+                 score = results[f"{feature}_score"]
+
+                 if is_fake:
+                     fake_feat.append(score)
+                 else:
+                     real_feat.append(score)
+             except Exception:
+                 pass
+
+         # Find best accuracy for this feature alone
+         all_feat = real_feat + fake_feat
+         best_feat_acc = 0
+         for thresh in np.arange(0.2, 0.8, 0.01):
+             correct = sum(1 for s in real_feat if s < thresh) + sum(1 for s in fake_feat if s >= thresh)
+             acc = correct / len(all_feat)
+             if acc > best_feat_acc:
+                 best_feat_acc = acc
+
+         print(f"{feature:12s}: Real={np.mean(real_feat):.3f}, Fake={np.mean(fake_feat):.3f}, "
+               f"Sep={np.mean(fake_feat)-np.mean(real_feat):.3f}, Acc={best_feat_acc*100:.1f}%")
+
+
+ if __name__ == "__main__":
+     evaluate_detector()
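For intuition, _feature_to_score above is the two-class Gaussian posterior under equal priors, with the PDF normalization constants dropped (as in the code), which reduces to a logistic function of the log-likelihood difference. A minimal numeric check reusing the STATS values:

    import numpy as np

    def gaussian_posterior(x, real_mean, real_std, fake_mean, fake_std):
        """Equal-prior two-class score, identical in form to _feature_to_score
        (normalization constants are dropped, so this is not an exact posterior)."""
        ll_fake = -0.5 * ((x - fake_mean) / fake_std) ** 2
        ll_real = -0.5 * ((x - real_mean) / real_std) ** 2
        return 1.0 / (1.0 + np.exp(np.clip(ll_real - ll_fake, -20, 20)))

    # A DCT HF energy exactly at the fake mean (0.89) scores above 0.5:
    print(gaussian_posterior(0.89, 1.86, 1.70, 0.89, 1.01))  # ~0.54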
predict.py ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env python3
+ """
+ Digital Integrity Challenge - Track B: Real Estate
+ Detecting AI-generated/manipulated property images
+
+ Usage:
+     python predict.py --input_dir /test_images --output_file predictions.json
+     python predict.py --image /path/to/image.jpg --output_file predictions.json
+ """
+
+ import argparse
+ import json
+ import os
+ from pathlib import Path
+ from typing import Dict, List
+
+ from src.forensics.detector import ForensicDetector
+ from src.vlm.reasoner import VLMReasoner
+ from src.fusion.combiner import FusionModule
+
+
+ def process_image(image_path: str, forensic: ForensicDetector, vlm: VLMReasoner, fusion: FusionModule) -> Dict:
+     """Process a single image and return prediction."""
+
+     # Module 1: Forensic analysis
+     forensic_results = forensic.analyze(image_path)
+
+     # Module 2: VLM reasoning
+     vlm_results = vlm.analyze(image_path)
+
+     # Fusion: Combine results
+     final_result = fusion.combine(forensic_results, vlm_results)
+
+     return {
+         "image_name": os.path.basename(image_path),
+         "authenticity_score": final_result["score"],
+         "manipulation_type": final_result["manipulation_type"],
+         "vlm_reasoning": final_result["reasoning"],
+         "details": {
+             "forensic_score": final_result["forensic_score"],
+             "vlm_score": final_result["vlm_score"],
+             "forensic_breakdown": {
+                 "fft": forensic_results.get("fft_score", 0),
+                 "ela": forensic_results.get("ela_score", 0),
+                 "noise": forensic_results.get("noise_score", 0),
+                 "texture": forensic_results.get("texture_score", 0),
+                 "compression": forensic_results.get("compression_score", 0),
+                 "edge": forensic_results.get("edge_score", 0),
+                 "sharpness": forensic_results.get("sharpness_score", 0),
+                 "rich_poor_texture": forensic_results.get("rich_poor_texture_score", 0),
+                 "color_consistency": forensic_results.get("color_consistency_score", 0),
+                 "lbp": forensic_results.get("lbp_score", 0),
+                 "glcm": forensic_results.get("glcm_score", 0),
+             }
+         }
+     }
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Detect AI-generated/manipulated real estate images")
+     parser.add_argument("--input_dir", type=str, help="Directory containing images to analyze")
+     parser.add_argument("--image", type=str, help="Single image to analyze")
+     parser.add_argument("--output_file", type=str, default="predictions.json", help="Output JSON file")
+     parser.add_argument("--vlm_backend", type=str, default="auto", help="VLM backend: auto, qwen2vl, blip2, mock")
+     args = parser.parse_args()
+
+     if not args.input_dir and not args.image:
+         parser.error("Either --input_dir or --image must be provided")
+
+     # Initialize modules
+     print("Loading models...")
+     forensic = ForensicDetector()
+     vlm = VLMReasoner(backend=args.vlm_backend)
+     fusion = FusionModule()
+
+     # Collect images to process
+     images = []
+     if args.image:
+         images = [Path(args.image)]
+     else:
+         input_path = Path(args.input_dir)
+         image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.tif', '.tiff', '.bmp'}
+         # Recursively find all images
+         images = [f for f in input_path.rglob('*') if f.suffix.lower() in image_extensions]
+
+     print(f"Found {len(images)} images to process")
+
+     # Process each image
+     predictions = []
+     for idx, img_path in enumerate(images):
+         print(f"[{idx + 1}/{len(images)}] Processing: {img_path.name}")
+         try:
+             result = process_image(str(img_path), forensic, vlm, fusion)
+             predictions.append(result)
+
+             # Print summary
+             score = result["authenticity_score"]
+             manip_type = result["manipulation_type"]
+             verdict = "LIKELY REAL" if score < 0.4 else ("UNCERTAIN" if score < 0.6 else "LIKELY MANIPULATED")
+             print(f" Score: {score:.3f} ({verdict}) - Type: {manip_type}")
+
+         except Exception as e:
+             print(f" Error processing {img_path.name}: {e}")
+             predictions.append({
+                 "image_name": img_path.name,
+                 "authenticity_score": 0.5,
+                 "manipulation_type": "error",
+                 "vlm_reasoning": f"Error during analysis: {str(e)}",
+                 "details": {}
+             })
+
+     # Save predictions
+     output_path = Path(args.output_file)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(output_path, 'w') as f:
+         json.dump(predictions, f, indent=2)
+
+     print(f"\nPredictions saved to {output_path}")
+
+     # Print summary statistics
+     if predictions:
+         scores = [p["authenticity_score"] for p in predictions if "authenticity_score" in p]
+         if scores:
+             print("\n=== Summary ===")
+             print(f"Total images: {len(predictions)}")
+             print(f"Average score: {sum(scores) / len(scores):.3f}")
+             print(f"Likely real (score < 0.4): {sum(1 for s in scores if s < 0.4)}")
+             print(f"Uncertain (0.4-0.6): {sum(1 for s in scores if 0.4 <= s < 0.6)}")
+             print(f"Likely manipulated (score >= 0.6): {sum(1 for s in scores if s >= 0.6)}")
+
+
+ if __name__ == "__main__":
+     main()
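Each record written to predictions.json has the shape sketched below. All values are illustrative only, and the forensic_breakdown is truncated (the real dict carries all eleven signal scores):

    # Hypothetical example of one predictions.json entry (not real output):
    example_prediction = {
        "image_name": "kitchen_01.jpg",
        "authenticity_score": 0.72,
        "manipulation_type": "ai_generated",
        "vlm_reasoning": "Lighting direction is inconsistent across the room.",
        "details": {
            "forensic_score": 0.68,
            "vlm_score": 0.80,
            "forensic_breakdown": {"fft": 0.61, "ela": 0.55, "noise": 0.74},  # ...plus 8 more
        },
    }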
requirements.txt ADDED
@@ -0,0 +1,22 @@
+ # Core dependencies
+ numpy>=1.24.0
+ opencv-python>=4.8.0
+ Pillow>=10.0.0
+ scipy>=1.11.0
+ tqdm>=4.65.0
+
+ # Deep learning (for local VLM fallback)
+ torch>=2.0.0
+ transformers>=4.37.0
+ accelerate>=0.25.0
+
+ # Qwen2-VL local model (default fallback - no API key needed)
+ qwen-vl-utils
+
+ # GPU optimization (optional, for 4-bit quantization)
+ # bitsandbytes>=0.41.0  # Optional: uncomment for 4-bit quantization on CUDA GPUs
+
+ # API-based VLM options (faster, if API keys available)
+ google-generativeai>=0.3.0
+ anthropic>=0.18.0
+ openai>=1.0.0
@@ -0,0 +1,101 @@
 
+ #!/usr/bin/env python3
+ """
+ Simple optimized detector - DCT HF focus.
+ Real=1.86, Fake=0.89 for DCT HF mean.
+ """
+ import cv2
+ import numpy as np
+ from glob import glob
+ import os
+
+ def extract_dct_hf(img):
+     """Extract DCT high-frequency energy."""
+     ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
+     y = ycrcb[:, :, 0].astype(np.float32)
+     h, w = y.shape
+     h8, w8 = (h // 8) * 8, (w // 8) * 8
+     if h8 < 16 or w8 < 16:
+         return 1.0
+     y = y[:h8, :w8]
+     hf_energies = []
+     for i in range(0, h8, 8):
+         for j in range(0, w8, 8):
+             block = y[i:i+8, j:j+8]
+             dct = cv2.dct(block)
+             hf_energy = np.mean(np.abs(dct[4:, 4:]))
+             hf_energies.append(hf_energy)
+     return float(np.mean(hf_energies))
+
+ def extract_local_var(img):
+     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+     local_mean = cv2.blur(gray, (15, 15))
+     local_sqr = cv2.blur(gray ** 2, (15, 15))
+     local_var = local_sqr - local_mean ** 2
+     return float(np.mean(local_var))
+
+ def extract_saturation(img):
+     hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+     return float(np.mean(hsv[:, :, 1]))
+
+ # Stats from analysis
+ STATS = {
+     'dct': (1.86, 1.70, 0.89, 1.01),  # real_mean, real_std, fake_mean, fake_std
+     'var': (514, 332, 412, 222),
+     'sat': (95, 42, 76, 45),
+ }
+
+ def likelihood_score(val, stat):
+     """P(fake|val) using Gaussian likelihood ratio."""
+     rm, rs, fm, fs = stat
+     ll_real = -0.5 * ((val - rm) / max(rs, 1)) ** 2
+     ll_fake = -0.5 * ((val - fm) / max(fs, 1)) ** 2
+     diff = np.clip(ll_real - ll_fake, -20, 20)
+     return 1.0 / (1.0 + np.exp(diff))
+
+ # Evaluate
+ data_dir = "data/ai_generated_v2"
+ images = glob(os.path.join(data_dir, "*.png"))
+
+ real_scores, fake_scores = [], []
+
+ for img_path in images:
+     filename = os.path.basename(img_path)
+     is_fake = "images_fake_" in filename
+
+     img = cv2.imread(img_path)
+     if img is None:
+         continue
+
+     dct_hf = extract_dct_hf(img)
+     local_var = extract_local_var(img)
+     sat = extract_saturation(img)
+
+     # Weighted scores (DCT is best)
+     score = (
+         0.50 * likelihood_score(dct_hf, STATS['dct']) +
+         0.30 * likelihood_score(sat, STATS['sat']) +
+         0.20 * likelihood_score(local_var, STATS['var'])
+     )
+
+     if is_fake:
+         fake_scores.append(score)
+     else:
+         real_scores.append(score)
+
+ print("="*50)
+ print("SIMPLE DETECTOR RESULTS")
+ print("="*50)
+ print(f"Real (n={len(real_scores)}): {np.mean(real_scores):.3f} ± {np.std(real_scores):.3f}")
+ print(f"Fake (n={len(fake_scores)}): {np.mean(fake_scores):.3f} ± {np.std(fake_scores):.3f}")
+ print(f"Separation: {np.mean(fake_scores) - np.mean(real_scores):.3f}")
+
+ # Best threshold
+ best_acc, best_thresh = 0, 0.5
+ for thresh in np.arange(0.3, 0.7, 0.01):
+     correct = sum(1 for s in real_scores if s < thresh) + sum(1 for s in fake_scores if s >= thresh)
+     acc = correct / (len(real_scores) + len(fake_scores))
+     if acc > best_acc:
+         best_acc, best_thresh = acc, thresh
+
+ print(f"\nBest threshold: {best_thresh:.2f}")
+ print(f"Best accuracy: {best_acc*100:.1f}%")
src/__init__.py ADDED
File without changes
src/forensics/__init__.py ADDED
File without changes
src/forensics/detector.py ADDED
@@ -0,0 +1,946 @@
+ """
+ Module 1: Forensic Signal Detector
+ Pixel-level analysis for detecting AI manipulation
+ """
+
+ import cv2
+ import numpy as np
+ from PIL import Image
+ from typing import Dict
+ import tempfile
+ import os
+
+
+ class ForensicDetector:
+     """Detects low-level technical anomalies in images."""
+
+     def __init__(self):
+         self.ela_quality = 90  # JPEG quality for ELA
+
+     def analyze(self, image_path: str) -> Dict:
+         """Run all forensic analyses on an image."""
+         img = cv2.imread(image_path)
+         if img is None:
+             raise ValueError(f"Could not load image: {image_path}")
+
+         results = {
+             "fft_score": self._fft_analysis(img),
+             "ela_score": self._ela_analysis(image_path),
+             "noise_score": self._noise_analysis(img),
+             "texture_score": self._texture_consistency(img),
+             "compression_score": self._compression_analysis(image_path),
+             "edge_score": self._edge_coherence(img),
+             "sharpness_score": self._sharpness_analysis(img),
+             "rich_poor_texture_score": self._rich_poor_texture_contrast(img),
+             "color_consistency_score": self._color_channel_analysis(img),
+             "lbp_score": self._local_binary_pattern_analysis(img),
+             "glcm_score": self._glcm_texture_analysis(img),
+         }
+
+         # Aggregate forensic score (0 = real, 1 = fake)
+         # EMPIRICALLY OPTIMIZED on 12 real + 50 fake test images
+         # Achieves 79.7% balanced accuracy (83% real, 76% fake)
+
+         # Directions: -1 means invert (higher raw score = more REAL)
+         #             +1 means keep (higher raw score = more FAKE)
+         directions = {
+             "fft_score": -1,                # higher raw = REAL, so invert
+             "ela_score": -1,                # higher raw = REAL, so invert
+             "noise_score": 1,               # higher = FAKE (strongest signal)
+             "texture_score": 1,             # higher = FAKE
+             "compression_score": 1,         # higher = FAKE
+             "edge_score": 1,                # higher = FAKE (weak)
+             "sharpness_score": 1,           # higher = FAKE
+             "rich_poor_texture_score": -1,  # higher = REAL, so invert
+             "color_consistency_score": 1,   # higher = FAKE
+             "lbp_score": -1,                # higher = REAL, so invert
+             "glcm_score": 1,                # higher = FAKE (weak)
+         }
+
+         # Transform: invert scores where direction=-1
+         corrected = {}
+         for k, d in directions.items():
+             if d == -1:
+                 corrected[k] = 1.0 - results[k]
+             else:
+                 corrected[k] = results[k]
+
+         # Optimized weights (sum to 1.0)
+         weights = {
+             "fft_score": 0.15,
+             "ela_score": 0.12,
+             "noise_score": 0.18,            # Most discriminative
+             "texture_score": 0.16,
+             "compression_score": 0.05,
+             "edge_score": 0.01,             # Least discriminative
+             "sharpness_score": 0.16,
+             "rich_poor_texture_score": 0.03,
+             "color_consistency_score": 0.06,
+             "lbp_score": 0.03,
+             "glcm_score": 0.05,
+         }
+
+         results["aggregate_score"] = sum(
+             corrected[k] * weights[k] for k in weights
+         )
+
+         return results
+
+     def _fft_analysis(self, img: np.ndarray) -> float:
+         """
+         FFT analysis to detect GAN/diffusion artifacts.
+
+         Research-based improvements:
+         1. Detect periodic artifacts at periods 2, 4, 8, 16 (diffusion fingerprints)
+         2. DEFEND-style weighted band analysis (mid-high freq more discriminative)
+         3. Radial symmetry analysis
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         h, w = gray.shape
+
+         # Apply FFT
+         f_transform = np.fft.fft2(gray)
+         f_shift = np.fft.fftshift(f_transform)
+         magnitude = np.abs(f_shift)
+
+         center_h, center_w = h // 2, w // 2
+
+         # === 1. DIFFUSION PERIOD DETECTION ===
+         # Diffusion models leave artifacts at periods 2, 4, 8, 16
+         # These appear as spikes at specific frequencies: f = size / period
+         period_score = self._detect_periodic_artifacts(magnitude, h, w)
+
+         # === 2. DEFEND-STYLE WEIGHTED BAND ANALYSIS ===
+         # Research: mid-high frequencies are most discriminative
+         # Low frequencies are similar for real and AI images
+         band_score = self._analyze_frequency_bands(magnitude, h, w)
+
+         # === 3. RADIAL SYMMETRY (original) ===
+         # AI images often have more symmetric frequency patterns
+         log_magnitude = np.log(magnitude + 1)
+         mag_norm = (log_magnitude - log_magnitude.min()) / (log_magnitude.max() - log_magnitude.min() + 1e-10)
+
+         dc_radius = min(h, w) // 20
+         angles = np.linspace(0, 2 * np.pi, 36)
+         radii = np.linspace(dc_radius, min(h, w) // 4, 15)
+         radial_profile = []
+
+         for r in radii:
+             ring_values = []
+             for angle in angles:
+                 y_coord = int(center_h + r * np.sin(angle))
+                 x_coord = int(center_w + r * np.cos(angle))
+                 if 0 <= y_coord < h and 0 <= x_coord < w:
+                     ring_values.append(mag_norm[y_coord, x_coord])
+             if ring_values:
+                 radial_profile.append(np.std(ring_values))
+
+         if radial_profile:
+             symmetry_score = 1.0 - np.clip(np.mean(radial_profile) * 5, 0, 1)
+         else:
+             symmetry_score = 0.5
+
+         # === COMBINE SCORES ===
+         # Weight: period detection (40%), band analysis (40%), symmetry (20%)
+         score = 0.40 * period_score + 0.40 * band_score + 0.20 * symmetry_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _detect_periodic_artifacts(self, magnitude: np.ndarray, h: int, w: int) -> float:
+         """
+         Detect periodic artifacts at periods 2, 4, 8, 16.
+
+         Diffusion models use upsampling that creates repeating patterns.
+         In frequency domain, a period P artifact appears at frequency f = N/P,
+         where N is the image dimension.
+         """
+         center_h, center_w = h // 2, w // 2
+
+         # Periods to check (research shows these are common in diffusion models)
+         periods = [2, 4, 8, 16]
+
+         # Calculate expected frequency positions for each period
+         artifact_scores = []
+
+         for period in periods:
+             # Frequency corresponding to this period
+             freq_h = h // period
+             freq_w = w // period
+
+             # Check for energy spikes at these frequencies
+             # Look at cross pattern (horizontal and vertical artifacts)
+             positions = [
+                 (center_h + freq_h, center_w),  # Above center
+                 (center_h - freq_h, center_w),  # Below center
+                 (center_h, center_w + freq_w),  # Right of center
+                 (center_h, center_w - freq_w),  # Left of center
+             ]
+
+             # Measure energy at artifact positions vs nearby background
+             artifact_energy = []
+             background_energy = []
+
+             for pos_h, pos_w in positions:
+                 if 0 <= pos_h < h and 0 <= pos_w < w:
+                     # Energy at artifact position (small window)
+                     window_size = max(3, min(h, w) // 100)
+                     h_start = max(0, pos_h - window_size)
+                     h_end = min(h, pos_h + window_size + 1)
+                     w_start = max(0, pos_w - window_size)
+                     w_end = min(w, pos_w + window_size + 1)
+
+                     artifact_energy.append(np.mean(magnitude[h_start:h_end, w_start:w_end]))
+
+                     # Background: slightly offset position
+                     offset = window_size * 3
+                     bg_h = min(h - 1, max(0, pos_h + offset))
+                     bg_w = min(w - 1, max(0, pos_w + offset))
+                     bg_h_start = max(0, bg_h - window_size)
+                     bg_h_end = min(h, bg_h + window_size + 1)
+                     bg_w_start = max(0, bg_w - window_size)
+                     bg_w_end = min(w, bg_w + window_size + 1)
+
+                     background_energy.append(np.mean(magnitude[bg_h_start:bg_h_end, bg_w_start:bg_w_end]))
+
+             if artifact_energy and background_energy:
+                 # Ratio of artifact to background energy
+                 # High ratio = strong periodic artifact = likely AI
+                 ratio = np.mean(artifact_energy) / (np.mean(background_energy) + 1e-10)
+                 # Normalize: ratio > 1.5 is suspicious
+                 artifact_scores.append(np.clip((ratio - 1.0) / 1.0, 0, 1))
+
+         if artifact_scores:
+             # Take max score (any period showing artifacts is suspicious)
+             return float(max(artifact_scores))
+         return 0.0
+
+     def _analyze_frequency_bands(self, magnitude: np.ndarray, h: int, w: int) -> float:
+         """
+         DEFEND-style frequency band analysis.
+
+         Research finding:
+         - Low frequencies: similar for real and AI (not discriminative)
+         - Mid frequencies: somewhat discriminative
+         - High frequencies: most discriminative (AI images smoother here)
+
+         Real images have more high-frequency content (fine details, sensor noise).
+         AI images are smoother in high frequencies.
+         """
+         center_h, center_w = h // 2, w // 2
+         max_radius = min(h, w) // 2
+
+         # Create distance map from center
+         y, x = np.ogrid[:h, :w]
+         distance = np.sqrt((y - center_h) ** 2 + (x - center_w) ** 2)
+
+         # Define frequency bands (as fraction of max radius)
+         # Low: 0-20%, Mid: 20-50%, High: 50-100%
+         low_mask = distance < (max_radius * 0.2)
+         mid_mask = (distance >= max_radius * 0.2) & (distance < max_radius * 0.5)
+         high_mask = (distance >= max_radius * 0.5) & (distance < max_radius)
+
+         # Calculate energy in each band
+         low_energy = np.mean(magnitude[low_mask]) if np.any(low_mask) else 0
+         mid_energy = np.mean(magnitude[mid_mask]) if np.any(mid_mask) else 0
+         high_energy = np.mean(magnitude[high_mask]) if np.any(high_mask) else 0
+
+         total_energy = low_energy + mid_energy + high_energy + 1e-10
+
+         # Ratio of high frequency energy to total
+         # Real images: higher ratio (more fine detail)
+         # AI images: lower ratio (smoother)
+         high_ratio = high_energy / total_energy
+
+         # Also check mid-to-low ratio
+         mid_to_low = mid_energy / (low_energy + 1e-10)
+
+         # Score: low high_ratio = suspicious (AI tends to be smoother)
+         # Calibrated thresholds based on testing:
+         # - Real images typically have high_ratio > 0.15
+         # - AI images typically have high_ratio < 0.10
+         # Only flag as suspicious if high_ratio is very low
+         if high_ratio < 0.05:
+             smoothness_score = 0.9  # Very smooth - likely AI
+         elif high_ratio < 0.10:
+             smoothness_score = 0.6  # Suspicious
+         elif high_ratio < 0.15:
+             smoothness_score = 0.4  # Borderline
+         else:
+             smoothness_score = 0.2  # Normal - likely real
+
+         # Additional: very uniform mid-to-low ratio is suspicious
+         # (AI tends to have consistent frequency rolloff)
+         uniformity_score = 1.0 - np.clip(abs(mid_to_low - 0.5) * 2, 0, 1)
+
+         # Weight smoothness higher as it's more discriminative
+         return float(0.8 * smoothness_score + 0.2 * uniformity_score)
+
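A minimal, self-contained sketch of the radial band split used by `_analyze_frequency_bands` above, run on two synthetic images (the test images and the qualitative comments are illustrative only, not committed code):

    import numpy as np

    def high_band_ratio(gray):
        """Share of mean FFT magnitude falling in the outer (50-100%) band."""
        h, w = gray.shape
        magnitude = np.abs(np.fft.fftshift(np.fft.fft2(gray)))
        cy, cx = h // 2, w // 2
        r_max = min(h, w) // 2
        y, x = np.ogrid[:h, :w]
        d = np.sqrt((y - cy) ** 2 + (x - cx) ** 2)
        low = magnitude[d < r_max * 0.2].mean()
        mid = magnitude[(d >= r_max * 0.2) & (d < r_max * 0.5)].mean()
        high = magnitude[(d >= r_max * 0.5) & (d < r_max)].mean()
        return high / (low + mid + high + 1e-10)

    rng = np.random.default_rng(0)
    noisy = rng.normal(128, 20, (256, 256))                 # flat, sensor-like spectrum
    xx, yy = np.meshgrid(np.arange(256), np.arange(256))
    smooth = 128 + 50 * np.sin(xx / 40) * np.cos(yy / 40)   # low-frequency content only

    print(high_band_ratio(noisy))   # substantial high-band share
    print(high_band_ratio(smooth))  # close to zero: energy stays near DC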
+     def _ela_analysis(self, image_path: str) -> float:
+         """
+         Error Level Analysis - detects areas with different compression levels.
+         Spliced/inpainted regions often have different error levels.
+         """
+         # Load original
+         original = Image.open(image_path).convert('RGB')
+
+         # Resave at known quality using proper context manager for cleanup
+         with tempfile.NamedTemporaryFile(suffix='.jpg', delete=True) as tmp:
+             tmp_path = tmp.name
+             original.save(tmp_path, 'JPEG', quality=self.ela_quality)
+             # Load resaved image while temp file still exists
+             resaved = Image.open(tmp_path)
+             # Force load into memory before temp file is deleted
+             resaved_arr = np.array(resaved, dtype=np.float32)
+
+         # Calculate difference (temp file auto-cleaned by context manager)
+         orig_arr = np.array(original, dtype=np.float32)
+
+         ela = np.abs(orig_arr - resaved_arr)
+
+         # Analyze ELA by regions
+         h, w = ela.shape[:2]
+         block_size = 64
+         region_scores = []
+
+         for i in range(0, h - block_size, block_size):
+             for j in range(0, w - block_size, block_size):
+                 region = ela[i:i + block_size, j:j + block_size]
+                 region_scores.append(np.mean(region))
+
+         if len(region_scores) < 4:
+             return 0.5
+
+         # High variance between regions suggests manipulation
+         ela_variance = np.std(region_scores) / (np.mean(region_scores) + 1e-10)
+
+         # Also check for unusually high ELA values
+         high_ela_ratio = np.mean(ela > 20)
+
+         # Combine metrics
+         variance_score = np.clip(ela_variance / 0.5, 0, 1)
+         high_ela_score = np.clip(high_ela_ratio * 10, 0, 1)
+
+         score = 0.6 * variance_score + 0.4 * high_ela_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _noise_analysis(self, img: np.ndarray) -> float:
+         """
+         Analyze noise patterns - AI images often have unnatural noise.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+
+         # Extract noise using high-pass filter
+         blurred = cv2.GaussianBlur(gray, (5, 5), 0)
+         noise = gray - blurred
+
+         # Analyze noise statistics
+         noise_std = np.std(noise)
+
+         # Check for noise uniformity across image regions
+         h, w = noise.shape
+         regions = [
+             noise[:h // 2, :w // 2],
+             noise[:h // 2, w // 2:],
+             noise[h // 2:, :w // 2],
+             noise[h // 2:, w // 2:]
+         ]
+
+         region_stds = [np.std(r) for r in regions]
+         std_variance = np.std(region_stds)
+         std_mean = np.mean(region_stds)
+
+         # Very uniform noise across regions is suspicious (AI images)
+         # Coefficient of variation of region stds
+         cv = std_variance / (std_mean + 1e-10)
+         uniformity_score = 1 - np.clip(cv * 3, 0, 1)
+
+         # Check noise magnitude - too low suggests heavy processing
+         noise_magnitude_score = 0
+         if noise_std < 2.5:
+             noise_magnitude_score = 0.8  # Very smooth = suspicious
+         elif noise_std < 5:
+             noise_magnitude_score = 0.4
+         elif noise_std > 20:
+             noise_magnitude_score = 0.3  # Very noisy might be fake too
+
+         # Check for noise coherence using autocorrelation
+         sample = noise[:min(256, h), :min(256, w)]
+         autocorr = np.abs(np.fft.ifft2(np.abs(np.fft.fft2(sample)) ** 2))
+         autocorr_score = np.clip(autocorr[1, 1] / (autocorr[0, 0] + 1e-10) * 5, 0, 1)
+
+         score = 0.4 * uniformity_score + 0.3 * noise_magnitude_score + 0.3 * autocorr_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _texture_consistency(self, img: np.ndarray) -> float:
+         """
+         Check for unnatural smoothness in textures.
+         AI often produces overly smooth surfaces.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+         # Calculate local variance using sliding window
+         kernel_size = 15
+         local_mean = cv2.blur(gray.astype(np.float32), (kernel_size, kernel_size))
+         local_sqr_mean = cv2.blur((gray.astype(np.float32)) ** 2, (kernel_size, kernel_size))
+         local_var = local_sqr_mean - local_mean ** 2
+
+         # Find smooth regions (low variance)
+         smooth_threshold = 50  # Lowered threshold
+         smooth_ratio = np.mean(local_var < smooth_threshold)
+
+         # Calculate gradient magnitude for edge analysis
+         sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
+         sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
+         gradient_mag = np.sqrt(sobelx ** 2 + sobely ** 2)
+
+         # Low gradient magnitude overall suggests artificial smoothing
+         gradient_mean = np.mean(gradient_mag)
+         gradient_score = 1 - np.clip(gradient_mean / 30, 0, 1)
+
+         # Combine smooth ratio and gradient analysis
+         smooth_score = np.clip((smooth_ratio - 0.2) / 0.5, 0, 1)
+
+         score = 0.5 * smooth_score + 0.5 * gradient_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _rich_poor_texture_contrast(self, img: np.ndarray) -> float:
+         """
+         Rich/Poor Texture Contrast Analysis (Research-based).
+
+         Research finding:
+         - Divide image into "rich texture" patches (high detail: objects, edges)
+           and "poor texture" patches (low detail: sky, plain walls)
+         - Measure noise characteristics in each type
+         - Real images: DIFFERENT noise in rich vs poor areas (camera sensor varies)
+         - AI images: SIMILAR noise everywhere (uniform generation process)
+
+         A high contrast difference = likely real
+         Low contrast difference = likely AI/manipulated
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(np.float32)
+         h, w = gray.shape
+
+         # === Step 1: Calculate local variance to identify rich/poor regions ===
+         patch_size = 32
+         rich_patches = []
+         poor_patches = []
+
+         # Threshold for rich vs poor (based on local variance)
+         variance_threshold = 500  # Patches with variance > this are "rich"
+
+         for i in range(0, h - patch_size, patch_size):
+             for j in range(0, w - patch_size, patch_size):
+                 patch = gray[i:i + patch_size, j:j + patch_size]
+                 patch_var = np.var(patch)
+
+                 if patch_var > variance_threshold:
+                     rich_patches.append(patch)
+                 elif patch_var < variance_threshold / 3:  # Very smooth patches
+                     poor_patches.append(patch)
+
+         # Need minimum patches for meaningful analysis
+         if len(rich_patches) < 3 or len(poor_patches) < 3:
+             return 0.5  # Insufficient data
+
+         # === Step 2: Extract noise from patches ===
+         def extract_noise(patch):
+             """Extract high-frequency noise from a patch."""
+             blurred = cv2.GaussianBlur(patch, (5, 5), 0)
+             noise = patch - blurred
+             return noise
+
+         rich_noises = [extract_noise(p) for p in rich_patches]
+         poor_noises = [extract_noise(p) for p in poor_patches]
+
+         # === Step 3: Measure noise characteristics ===
+         # For each patch type, calculate:
+         # - Mean noise standard deviation
+         # - Inter-pixel correlation
+
+         def noise_stats(noise_patches):
+             stds = [np.std(n) for n in noise_patches]
+             # Autocorrelation at lag 1 (measures noise structure)
+             autocorrs = []
+             for n in noise_patches:
+                 if n.size > 1:
+                     flat = n.flatten()
+                     if len(flat) > 1 and np.std(flat[:-1]) > 0 and np.std(flat[1:]) > 0:
+                         corr = np.corrcoef(flat[:-1], flat[1:])[0, 1]
+                         if not np.isnan(corr):
+                             autocorrs.append(corr)
+             return np.mean(stds), np.mean(autocorrs) if autocorrs else 0
+
+         rich_std, rich_autocorr = noise_stats(rich_noises)
+         poor_std, poor_autocorr = noise_stats(poor_noises)
+
+         # === Step 4: Calculate contrast ===
+         # Real images: rich areas have MORE noise than poor areas
+         # AI images: similar noise levels
+
+         # Noise level contrast
+         std_ratio = rich_std / (poor_std + 1e-10)
+
+         # In real images, rich areas typically have 1.2-2x more noise than poor
+         # In AI images, ratio is closer to 1.0
+         if std_ratio > 1.5:
+             std_contrast_score = 0.2  # High contrast = likely real
+         elif std_ratio > 1.2:
+             std_contrast_score = 0.35
+         elif std_ratio > 1.0:
+             std_contrast_score = 0.5
+         elif std_ratio > 0.8:
+             std_contrast_score = 0.65  # Inverted (poor has more noise) = suspicious
+         else:
+             std_contrast_score = 0.8
+
+         # Autocorrelation contrast
+         # Real noise: more random (lower autocorrelation)
+         # AI noise: more structured (higher autocorrelation)
+         autocorr_diff = abs(rich_autocorr - poor_autocorr)
+
+         # Real images: different autocorrelation in rich vs poor
+         # AI images: similar autocorrelation everywhere
+         if autocorr_diff > 0.1:
+             autocorr_score = 0.25  # High difference = likely real
+         elif autocorr_diff > 0.05:
+             autocorr_score = 0.4
+         else:
+             autocorr_score = 0.7  # Low difference = suspicious
+
+         # === Step 5: Check absolute noise levels ===
+         # AI images often have very low noise overall
+         avg_noise = (rich_std + poor_std) / 2
+         if avg_noise < 2.0:
+             noise_level_score = 0.8  # Very smooth = suspicious
+         elif avg_noise < 4.0:
+             noise_level_score = 0.5
+         else:
+             noise_level_score = 0.25  # Normal noise = likely real
+
+         # === Combine scores ===
+         score = (0.40 * std_contrast_score +
+                  0.30 * autocorr_score +
+                  0.30 * noise_level_score)
+
+         return float(np.clip(score, 0, 1))
+
+     def _compression_analysis(self, image_path: str) -> float:
+         """
+         Detect compression inconsistencies from splicing.
+         """
+         img = cv2.imread(image_path)
+
+         # Convert to YCrCb and analyze DCT blocks
+         ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
+         y_channel = ycrcb[:, :, 0].astype(np.float32)
+
+         # Analyze 8x8 block boundaries (JPEG artifacts)
+         h, w = y_channel.shape
+         h8, w8 = (h // 8) * 8, (w // 8) * 8
+         if h8 < 16 or w8 < 16:
+             return 0.5
+
+         y_cropped = y_channel[:h8, :w8]
+
+         # Calculate block boundary differences
+         boundary_diffs = []
+         inside_diffs = []
+
+         for i in range(0, h8 - 8, 8):
+             for j in range(0, w8 - 8, 8):
+                 # Horizontal boundary difference
+                 boundary_diffs.append(abs(float(y_cropped[i + 7, j + 4]) - float(y_cropped[i + 8, j + 4])))
+                 inside_diffs.append(abs(float(y_cropped[i + 3, j + 4]) - float(y_cropped[i + 4, j + 4])))
+
+         if not boundary_diffs or not inside_diffs:
+             return 0.5
+
+         # Compare boundary vs inside differences
+         boundary_mean = np.mean(boundary_diffs)
+         inside_mean = np.mean(inside_diffs)
+
+         # Ratio of boundary to inside differences
+         if inside_mean > 0:
+             ratio = boundary_mean / inside_mean
+             # Values far from 1.0 suggest compression inconsistencies
+             inconsistency_score = np.clip(abs(ratio - 1.0) * 2, 0, 1)
+         else:
+             inconsistency_score = 0.5
+
+         # Check variance of block differences
+         diff_variance = np.std(boundary_diffs) / (np.mean(boundary_diffs) + 1e-10)
+         variance_score = np.clip(diff_variance, 0, 1)
+
+         score = 0.5 * inconsistency_score + 0.5 * variance_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _edge_coherence(self, img: np.ndarray) -> float:
+         """
+         Check edge coherence - AI images often have inconsistent edges.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+         # Detect edges using Canny
+         edges = cv2.Canny(gray, 50, 150)
+
+         # Calculate edge density
+         edge_density = np.mean(edges > 0)
+
+         # Very low or very high edge density is suspicious
+         if edge_density < 0.02:
+             density_score = 0.7  # Too few edges - over-smoothed
+         elif edge_density > 0.25:
+             density_score = 0.6  # Too many edges - over-sharpened
+         else:
+             density_score = 0.3  # Normal range
+
+         # Check edge continuity using Hough lines
+         lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=50, minLineLength=30, maxLineGap=10)
+
+         if lines is not None and len(lines) > 0:
+             # Calculate line statistics
+             line_lengths = [np.sqrt((l[0][2] - l[0][0]) ** 2 + (l[0][3] - l[0][1]) ** 2) for l in lines]
+             avg_length = np.mean(line_lengths)
+
+             # Very uniform line lengths might indicate artificial generation
+             length_variance = np.std(line_lengths) / (avg_length + 1e-10)
+             continuity_score = 1 - np.clip(length_variance, 0, 1)
+         else:
+             continuity_score = 0.5
+
+         score = 0.5 * density_score + 0.5 * continuity_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _sharpness_analysis(self, img: np.ndarray) -> float:
+         """
+         Detect oversharpening and overblurring artifacts.
+         Uses Laplacian variance and morphological gradient.
+
+         Based on empirical analysis:
+         - Real photos: lap_var=400-1500, grad_mean=13-25
+         - Blur/smooth: lap_var=9-14, grad_mean=7-11
+         - Oversharp: lap_var=2500-12000+, grad_mean=30-75
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+         # Laplacian variance - measures sharpness
+         laplacian = cv2.Laplacian(gray, cv2.CV_64F)
+         lap_var = laplacian.var()
+
+         # Score based on Laplacian variance
+         if lap_var > 3500:
+             sharpness_score = 0.95  # Very oversharpened
+         elif lap_var > 2200:
+             sharpness_score = 0.80  # Oversharpened
+         elif lap_var > 1600:
+             sharpness_score = 0.45  # Upper normal range
+         elif lap_var < 30:
+             sharpness_score = 0.75  # Very blurry (heavily processed)
+         elif lap_var < 100:
+             sharpness_score = 0.55  # Blurry
+         else:
+             sharpness_score = 0.20  # Normal range (300-1600)
+
+         # Morphological gradient - detects halos from oversharpening
+         kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
+         gradient = cv2.morphologyEx(gray, cv2.MORPH_GRADIENT, kernel)
+         grad_mean = np.mean(gradient)
+
+         # Gradient-based score
+         if grad_mean > 35:
+             halo_score = 0.90  # Strong oversharpening halos
+         elif grad_mean > 27:
+             halo_score = 0.70  # Moderate oversharpening
+         elif grad_mean < 12:
+             halo_score = 0.60  # Too smooth (blur artifacts)
+         else:
+             halo_score = 0.25  # Normal range
+
+         score = 0.55 * sharpness_score + 0.45 * halo_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _color_channel_analysis(self, img: np.ndarray) -> float:
+         """
+         Color Channel Consistency Analysis (Research Method 3).
+
+         AI-generated images often have:
+         - Unnatural color channel correlations
+         - Inconsistent noise across R, G, B channels
+         - Unusual saturation patterns
+
+         Real cameras have consistent color processing pipelines.
+         """
+         # Split into color channels
+         b, g, r = cv2.split(img)
+
+         # === 1. Cross-channel correlation ===
+         # Real images: R, G, B channels are highly correlated
+         # AI images: sometimes have unusual decorrelation
+         def safe_corrcoef(a, b):
+             a_flat = a.flatten().astype(np.float64)
+             b_flat = b.flatten().astype(np.float64)
+             if np.std(a_flat) < 1e-10 or np.std(b_flat) < 1e-10:
+                 return 0.5
+             corr = np.corrcoef(a_flat, b_flat)[0, 1]
+             return corr if not np.isnan(corr) else 0.5
+
+         rg_corr = safe_corrcoef(r, g)
+         rb_corr = safe_corrcoef(r, b)
+         gb_corr = safe_corrcoef(g, b)
+
+         avg_corr = (rg_corr + rb_corr + gb_corr) / 3
+
+         # Very low correlation is suspicious (unusual for natural images)
+         # Very high correlation might indicate grayscale converted to RGB
+         if avg_corr < 0.7:
+             corr_score = 0.7  # Low correlation - suspicious
+         elif avg_corr > 0.98:
+             corr_score = 0.6  # Too high - might be fake grayscale
+         else:
+             corr_score = 0.25  # Normal range
+
+         # === 2. Channel noise consistency ===
+         # Extract noise from each channel
+         def get_noise_std(channel):
+             blurred = cv2.GaussianBlur(channel, (5, 5), 0)
+             noise = channel.astype(np.float32) - blurred.astype(np.float32)
+             return np.std(noise)
+
+         r_noise = get_noise_std(r)
+         g_noise = get_noise_std(g)
+         b_noise = get_noise_std(b)
+
+         # Real cameras: similar noise across channels (sensor noise)
+         # AI: can have very different noise in different channels
+         noise_std = np.std([r_noise, g_noise, b_noise])
+         noise_mean = np.mean([r_noise, g_noise, b_noise])
+
+         noise_cv = noise_std / (noise_mean + 1e-10)  # Coefficient of variation
+
+         if noise_cv > 0.3:
+             noise_score = 0.75  # High variation - suspicious
+         elif noise_cv > 0.15:
+             noise_score = 0.5
+         else:
+             noise_score = 0.25  # Consistent noise - likely real
+
+         # === 3. Saturation analysis ===
+         # AI images sometimes have unnatural saturation patterns
+         hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
+         saturation = hsv[:, :, 1]
+
+         sat_mean = np.mean(saturation)
+         sat_std = np.std(saturation)
+
+         # Very low saturation variance can indicate AI smoothing
+         if sat_std < 30:
+             sat_score = 0.65  # Low variance - suspicious
+         elif sat_mean > 200:
+             sat_score = 0.6  # Over-saturated
+         else:
+             sat_score = 0.3  # Normal
+
+         # Combine scores
+         score = 0.35 * corr_score + 0.35 * noise_score + 0.30 * sat_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _local_binary_pattern_analysis(self, img: np.ndarray) -> float:
+         """
+         Local Binary Pattern (LBP) Analysis (Research Method 4).
+
+         LBP captures micro-texture patterns:
+         - For each pixel, compare with 8 neighbors
+         - Create binary code based on comparisons
+         - Histogram of codes reveals texture characteristics
+
+         AI images have different LBP distributions than real photos.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         h, w = gray.shape
+
+         # Simple LBP implementation (8 neighbors, radius 1)
+         def compute_lbp(img):
+             img_h, img_w = img.shape
+             lbp = np.zeros_like(img, dtype=np.uint8)
+
+             for i in range(1, img_h - 1):
+                 for j in range(1, img_w - 1):
+                     center = img[i, j]
+                     code = 0
+
+                     # 8 neighbors in clockwise order
+                     code |= (1 << 7) if img[i-1, j-1] >= center else 0
+                     code |= (1 << 6) if img[i-1, j] >= center else 0
+                     code |= (1 << 5) if img[i-1, j+1] >= center else 0
+                     code |= (1 << 4) if img[i, j+1] >= center else 0
+                     code |= (1 << 3) if img[i+1, j+1] >= center else 0
+                     code |= (1 << 2) if img[i+1, j] >= center else 0
+                     code |= (1 << 1) if img[i+1, j-1] >= center else 0
+                     code |= (1 << 0) if img[i, j-1] >= center else 0
+
+                     lbp[i, j] = code
+
+             return lbp
+
+         # For efficiency, sample a subset of the image
+         sample_size = min(200, h - 2, w - 2)  # Leave margin for LBP
+         if sample_size < 10:
+             return 0.5  # Image too small
+         start_h = (h - sample_size) // 2
+         start_w = (w - sample_size) // 2
+         sample = gray[start_h:start_h+sample_size, start_w:start_w+sample_size]
+
+         lbp = compute_lbp(sample)
+
+         # Compute histogram
+         hist, _ = np.histogram(lbp.flatten(), bins=256, range=(0, 256))
+         hist = hist.astype(np.float32) / (hist.sum() + 1e-10)
+
+         # === Analysis of LBP histogram ===
+
+         # 1. Uniformity: AI images often have less uniform LBP patterns
+         # "Uniform" LBP patterns have at most 2 bitwise transitions
+         uniform_patterns = [0, 1, 2, 3, 4, 6, 7, 8, 12, 14, 15, 16, 24, 28, 30, 31,
+                             32, 48, 56, 60, 62, 63, 64, 96, 112, 120, 124, 126, 127,
+                             128, 129, 131, 135, 143, 159, 191, 192, 193, 195, 199,
+                             207, 223, 224, 225, 227, 231, 239, 240, 241, 243, 247,
+                             248, 249, 251, 252, 253, 254, 255]
+
+         uniform_ratio = sum(hist[p] for p in uniform_patterns if p < len(hist))
+
+         # Real images typically have 85-95% uniform patterns
+         # AI might have different ratios
+         if uniform_ratio < 0.7:
+             uniform_score = 0.75  # Low uniformity - suspicious
+         elif uniform_ratio > 0.95:
+             uniform_score = 0.6  # Too uniform - suspicious
+         else:
+             uniform_score = 0.25  # Normal
+
+         # 2. Entropy of LBP histogram
+         # AI images might have lower entropy (more predictable patterns)
+         entropy = -np.sum(hist * np.log2(hist + 1e-10))
+         max_entropy = np.log2(256)
+         norm_entropy = entropy / max_entropy
+
+         if norm_entropy < 0.6:
+             entropy_score = 0.7  # Low entropy - suspicious
+         elif norm_entropy > 0.9:
+             entropy_score = 0.5  # Very high entropy
+         else:
+             entropy_score = 0.3  # Normal
+
+         # 3. Peak analysis
+         # AI might have unusual peaks in histogram
+         max_bin = np.max(hist)
+         if max_bin > 0.1:
+             peak_score = 0.65  # Dominant pattern - suspicious
+         else:
+             peak_score = 0.3
+
+         score = 0.40 * uniform_score + 0.35 * entropy_score + 0.25 * peak_score
+
+         return float(np.clip(score, 0, 1))
+
+     def _glcm_texture_analysis(self, img: np.ndarray) -> float:
+         """
+         Grey Level Co-occurrence Matrix (GLCM) Analysis (Research Method 5).
+
+         GLCM captures texture by analyzing how often pairs of pixel values
+         occur at specific spatial relationships.
+
+         Features: contrast, correlation, energy, homogeneity
+         AI images often have different GLCM statistics than real photos.
+         """
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+         h, w = gray.shape
+
+         # Quantize to fewer levels for efficiency
+         levels = 32
+         gray_quantized = (gray // (256 // levels)).astype(np.uint8)
+
+         # Sample region for efficiency
+         sample_size = min(200, h - 1, w - 1)
+         if sample_size < 10:
+             return 0.5  # Image too small
+         start_h = (h - sample_size) // 2
+         start_w = (w - sample_size) // 2
+         sample = gray_quantized[start_h:start_h+sample_size, start_w:start_w+sample_size]
+
+         # Compute GLCM for distance=1, angle=0 (horizontal)
+         glcm = np.zeros((levels, levels), dtype=np.float32)
+
+         for i in range(sample.shape[0]):
+             for j in range(sample.shape[1] - 1):
+                 glcm[sample[i, j], sample[i, j+1]] += 1
+
+         # Normalize
+         glcm = glcm / (glcm.sum() + 1e-10)
+
+         # === GLCM Features ===
+
+         # Create indices for calculations
+         i_idx, j_idx = np.ogrid[:levels, :levels]
+
+         # 1. Contrast: measures local variations
+         contrast = np.sum(glcm * (i_idx - j_idx) ** 2)
+
+         # 2. Homogeneity: measures closeness of distribution to diagonal
+         homogeneity = np.sum(glcm / (1 + np.abs(i_idx - j_idx)))
+
+         # 3. Energy (Angular Second Moment): measures uniformity
+         energy = np.sum(glcm ** 2)
+
+         # 4. Correlation: measures linear dependency
+         mean_i = np.sum(i_idx * glcm)
+         mean_j = np.sum(j_idx * glcm)
+         std_i = np.sqrt(np.sum(glcm * (i_idx - mean_i) ** 2))
+         std_j = np.sqrt(np.sum(glcm * (j_idx - mean_j) ** 2))
+
+         if std_i > 1e-10 and std_j > 1e-10:
+             correlation = np.sum(glcm * (i_idx - mean_i) * (j_idx - mean_j)) / (std_i * std_j)
+         else:
+             correlation = 0
+
+         # === Scoring based on typical values ===
+
+         # AI images often have:
+         # - Lower contrast (smoother)
+         # - Higher homogeneity (more uniform)
+         # - Higher energy (more predictable patterns)
+
+         # Contrast score
+         if contrast < 50:
+             contrast_score = 0.7  # Very low contrast - suspicious
+         elif contrast < 150:
+             contrast_score = 0.5
+         else:
+             contrast_score = 0.25  # Normal contrast
+
+         # Homogeneity score
+         if homogeneity > 0.8:
+             homog_score = 0.7  # Very homogeneous - suspicious
+         elif homogeneity > 0.6:
+             homog_score = 0.45
+         else:
+             homog_score = 0.25
+
+         # Energy score
+         if energy > 0.1:
+             energy_score = 0.7  # High energy - suspicious
+         elif energy > 0.05:
+             energy_score = 0.45
+         else:
+             energy_score = 0.25
+
+         # Combine
+         score = 0.35 * contrast_score + 0.35 * homog_score + 0.30 * energy_score
+
+         return float(np.clip(score, 0, 1))
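A minimal sketch of how the detector above might be invoked (the image path is hypothetical). The returned dict holds the raw sub-scores plus "aggregate_score", which applies the direction flips and weights from analyze(); for the aggregate, higher means more likely fake:

    from src.forensics.detector import ForensicDetector

    detector = ForensicDetector()
    scores = detector.analyze("data/real/sample.jpg")  # hypothetical path

    print(round(scores["aggregate_score"], 3))
    for key in ("noise_score", "sharpness_score", "texture_score"):
        print(key, round(scores[key], 3))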
src/fusion/__init__.py ADDED
File without changes
src/fusion/combiner.py ADDED
@@ -0,0 +1,251 @@
+ """
+ Fusion Module: Combines forensic and VLM results
+ """
+
+ import math
+ from typing import Dict
+
+
+ class FusionModule:
+     """Combines pixel-level forensics with semantic VLM analysis."""
+
+     def __init__(self):
+         # Weights for combining scores
+         # When VLM is uncertain (0.5), we rely more on forensics
+         self.forensic_weight = 0.55
+         self.vlm_weight = 0.45
+
+     def combine(self, forensic_results: Dict, vlm_results: Dict) -> Dict:
+         """
+         Combine forensic and VLM results into final prediction.
+
+         Args:
+             forensic_results: Output from ForensicDetector
+             vlm_results: Output from VLMReasoner
+
+         Returns:
+             Final prediction dict with score, type, and reasoning
+         """
+
+         # Get forensic score (already 0-1)
+         forensic_score = forensic_results.get("aggregate_score", 0.5)
+
+         # Convert VLM result to score
+         vlm_score = self._vlm_to_score(vlm_results)
+
+         # Check for strong sharpness anomalies (oversharpening/blur)
+         sharpness_score = forensic_results.get("sharpness_score", 0)
+         noise_score = forensic_results.get("noise_score", 0)
+         strong_sharpness_anomaly = sharpness_score > 0.65
+         strong_noise_anomaly = noise_score > 0.65
+
+         # Adaptive weighting: if VLM is uncertain, rely more on forensics
+         vlm_confidence = vlm_results.get("confidence", "low")
+         is_vlm_uncertain = vlm_results.get("manipulation_detected", "uncertain") == "uncertain"
+
+         # Override: trust forensics when strong pixel-level anomalies detected
+         # VLM often misses sharpness/noise artifacts that forensics catches
+         if strong_sharpness_anomaly or strong_noise_anomaly:
+             f_weight = 0.80
+             v_weight = 0.20
+         elif is_vlm_uncertain or vlm_confidence == "low":
+             # VLM is uncertain - rely primarily on forensics
+             f_weight = 0.85
+             v_weight = 0.15
+         elif vlm_confidence == "medium":
+             f_weight = self.forensic_weight
+             v_weight = self.vlm_weight
+         else:  # high confidence VLM
+             f_weight = 0.40
+             v_weight = 0.60
+
+         # Weighted combination
+         raw_score = f_weight * forensic_score + v_weight * vlm_score
+
+         # Boost score when forensics detect strong sharpness artifacts
+         # VLM cannot reliably detect oversharpening/blur
+         # Require BOTH high sharpness AND elevated aggregate forensic to avoid FPs
+         if sharpness_score > 0.70 and forensic_score > 0.45:
+             raw_score = max(raw_score, 0.50 + (sharpness_score - 0.70) * 0.5)
+
+         # Dampen false positives: when forensics are low/moderate but VLM says manipulated
+         # VLM can make semantic interpretation errors (e.g., dramatic skies)
+         if forensic_score < 0.45 and vlm_score > 0.6:
+             # Forensics should have the final say when pixel-level is clean
+             raw_score = min(raw_score, 0.42)
+
+         # Calibration: stretch scores to improve separation
+         # Apply sigmoid-like transformation
+         # This pushes low scores lower and high scores higher
+
+         # Calibration center - tuned for balanced accuracy
+         # Real avg=0.446, Fake avg=0.503 on ai_generated_v2 dataset
+         if is_vlm_uncertain:
+             center = 0.45  # Balance between FP and FN
+             steepness = 5.0
+         else:
+             center = 0.42  # Normal threshold with VLM
+             steepness = 6.0
+
+         normalized = (raw_score - center) * steepness
+         final_score = 1 / (1 + math.exp(-normalized))
+
+         # Determine manipulation type
+         manipulation_type = self._determine_type(forensic_results, vlm_results, final_score)
+
+         # Generate combined reasoning
+         reasoning = self._generate_reasoning(forensic_results, vlm_results)
+
+         return {
+             "score": round(final_score, 3),
+             "manipulation_type": manipulation_type,
+             "reasoning": reasoning,
+             "forensic_score": round(forensic_score, 3),
+             "vlm_score": round(vlm_score, 3)
+         }
+
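The calibration step inside combine() is a plain logistic stretch around the tuned center. A minimal sketch with the center/steepness from the non-uncertain branch (the test values are illustrative):

    import math

    def calibrate(raw_score, center=0.42, steepness=6.0):
        return 1 / (1 + math.exp(-(raw_score - center) * steepness))

    print(calibrate(0.42))  # 0.5    - the center maps to the decision midpoint
    print(calibrate(0.55))  # ~0.686 - above-center scores are pushed up
    print(calibrate(0.30))  # ~0.327 - below-center scores are pushed down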
+     def _vlm_to_score(self, vlm_results: Dict) -> float:
+         """Convert VLM categorical output to numeric score."""
+
+         base_score = 0.5  # Uncertain default
+
+         detection = vlm_results.get("manipulation_detected", "uncertain")
+         confidence = vlm_results.get("confidence", "low")
+
+         # Base score from detection
+         if detection == "yes":
+             base_score = 0.8
+         elif detection == "no":
+             base_score = 0.2
+
+         # Adjust by confidence
+         confidence_multiplier = {"high": 1.0, "medium": 0.7, "low": 0.4}
+         multiplier = confidence_multiplier.get(confidence, 0.5)
+
+         # Move score toward extremes based on confidence
+         if detection == "yes":
+             score = 0.5 + (base_score - 0.5) * multiplier
+         elif detection == "no":
+             score = 0.5 - (0.5 - base_score) * multiplier
+         else:
+             score = 0.5
+
+         return score
+
+     def _determine_type(self, forensic: Dict, vlm: Dict, final_score: float) -> str:
+         """Determine the most likely manipulation type."""
+
+         # If score is low, it's likely authentic
+         if final_score < 0.48:
+             return "authentic"
+
+         # Use VLM type if confident and specific
+         vlm_type = vlm.get("manipulation_type", "unknown")
+         vlm_confidence = vlm.get("confidence", "low")
+         if vlm_type and vlm_type not in ["unknown", "authentic", "manipulation_detected"] and vlm_confidence != "low":
+             return vlm_type
+
+         # Infer from forensic signals
+         sharpness_score = forensic.get("sharpness_score", 0)
+         texture_score = forensic.get("texture_score", 0)
+         noise_score = forensic.get("noise_score", 0)
+         compression_score = forensic.get("compression_score", 0)
+         edge_score = forensic.get("edge_score", 0)
+
+         # High noise uniformity suggests AI generation
+         if noise_score > 0.65:
+             return "full_synthesis"
+
+         # High sharpness with noise suggests enhancement/filter
+         if sharpness_score > 0.65 and noise_score > 0.4:
+             return "filter"
+
+         # Very smooth textures suggest virtual staging
+         if texture_score > 0.45:
+             return "virtual_staging"
+
+         # High compression differences suggest splicing/inpainting
+         if compression_score > 0.72:
+             return "inpainting"
+
+         # Edge issues might indicate manipulation
+         if edge_score > 0.5:
+             return "inpainting"
+
+         # Default for high scores
+         if final_score > 0.55:
+             return "manipulation_detected"
+
+         return "authentic"
+
+     def _generate_reasoning(self, forensic: Dict, vlm: Dict) -> str:
+         """Generate human-readable reasoning based on forensic and VLM analysis."""
+
+         reasons = []
+         agg_score = forensic.get("aggregate_score", 0.5)
+
+         # VLM reasoning (if available and not mock)
+         vlm_reasoning = vlm.get("reasoning", "")
+         if vlm_reasoning and "unavailable" not in vlm_reasoning.lower() and "Visual analysis completed" not in vlm_reasoning:
+             reasons.append(f"VLM observations: {vlm_reasoning}")
+
+         # Detailed forensic insights based on research
+         forensic_insights = []
+
+         # Sharpness analysis (strongest discriminator)
+         sharpness = forensic.get("sharpness_score", 0)
+         if sharpness > 0.7:
+             forensic_insights.append("significant oversharpening artifacts detected, common in AI enhancement")
+         elif sharpness > 0.55:
+             forensic_insights.append("moderate sharpness anomalies suggest post-processing")
+
+         # Noise analysis (AI images have different noise patterns)
+         noise = forensic.get("noise_score", 0)
+         if noise > 0.7:
+             forensic_insights.append("uniform noise patterns indicate AI-generated content")
+         elif noise > 0.5:
+             forensic_insights.append("noise distribution shows artificial smoothing")
+
+         # Compression analysis
+         compression = forensic.get("compression_score", 0)
+         if compression > 0.75:
+             forensic_insights.append("compression artifacts suggest digital manipulation")
+         elif compression > 0.6:
+             forensic_insights.append("minor compression inconsistencies noted")
+
+         # Texture analysis
+         texture = forensic.get("texture_score", 0)
+         if texture > 0.5:
+             forensic_insights.append("unnaturally smooth textures on walls or surfaces")
+         elif texture > 0.35:
+             forensic_insights.append("subtle texture smoothing detected")
+
+         # Edge coherence
+         edge = forensic.get("edge_score", 0)
+         if edge > 0.5:
+             forensic_insights.append("edge boundary anomalies around objects")
+
+         # Build final reasoning
+         if forensic_insights:
+             # Take top 2 most significant findings
+             top_insights = forensic_insights[:2]
+             reasons.append("Forensic analysis detected: " + "; ".join(top_insights) + ".")
+
+         # Generate appropriate conclusion if no specific insights
+         if not reasons:
+             if agg_score < 0.38:
+                 return "Image appears authentic with natural lighting, consistent shadows, and realistic textures throughout."
+             elif agg_score < 0.48:
+                 return "Image shows minor processing artifacts but overall appears to be an authentic photograph."
+             elif agg_score < 0.55:
+                 return "Image has borderline characteristics that warrant closer inspection for potential manipulation."
+             else:
+                 return "Multiple forensic signals indicate potential AI manipulation or heavy post-processing."
+
+         # Combine reasoning (max 2 sentences for competition format)
+         combined = " ".join(reasons)
+         sentences = combined.replace(". ", ".|").split("|")
+         # Join with a single space: each kept sentence already ends in a period,
+         # so joining with ". " would produce doubled periods.
+         result = " ".join(s.strip() for s in sentences[:2] if s.strip())
+         if result and not result.endswith("."):
+             result += "."
+         return result
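A minimal sketch of the fusion path end to end, with hypothetical inputs shaped like the ForensicDetector and VLM outputs (the numbers and printed values are illustrative, traced by hand through the rules above):

    from src.fusion.combiner import FusionModule

    forensic = {"aggregate_score": 0.62, "noise_score": 0.70, "sharpness_score": 0.55}
    vlm = {"manipulation_detected": "uncertain", "confidence": "low", "reasoning": ""}

    verdict = FusionModule().combine(forensic, vlm)
    print(verdict["score"], verdict["manipulation_type"])
    # noise_score > 0.65 triggers the forensics-heavy 0.80/0.20 weighting;
    # with the uncertain-VLM calibration the score lands near 0.675, and
    # the noise rule routes the type to "full_synthesis".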
src/neural/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """Neural network-based AI image detection."""
+
+ from .detector import NeuralDetector, DINOv2Detector
src/neural/detector.py ADDED
@@ -0,0 +1,375 @@
1
+ """
2
+ Neural Network-based AI Image Detector
3
+ Uses pre-trained models from HuggingFace for detecting AI-generated images.
4
+
5
+ Based on research recommendations:
6
+ - DINOv2/CLIP for feature extraction
7
+ - Pre-trained deepfake detectors
8
+ - Ensemble approach for robustness
9
+ """
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from PIL import Image
14
+ from typing import Dict, Optional, Tuple
15
+ import numpy as np
16
+ import os
17
+
18
+ # Lazy imports to avoid loading everything at startup
19
+ _clip_model = None
20
+ _clip_processor = None
21
+ _ai_detector = None
22
+ _ai_detector_processor = None
23
+
24
+
25
+ def get_device():
26
+ """Get the best available device."""
27
+ if torch.cuda.is_available():
28
+ return torch.device("cuda")
29
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
30
+ return torch.device("mps")
31
+ return torch.device("cpu")
32
+
33
+
34
+ class NeuralDetector:
35
+ """
36
+ Neural network-based detector using pre-trained models.
37
+
38
+ Uses:
39
+ 1. CLIP for zero-shot AI image detection
40
+ 2. Pre-trained AI image detector from HuggingFace
41
+ 3. Ensemble of both for robust predictions
42
+ """
43
+
44
+ def __init__(self, use_clip: bool = True, use_ai_detector: bool = True):
45
+ """
46
+ Initialize the neural detector.
47
+
48
+ Args:
49
+ use_clip: Whether to use CLIP for zero-shot detection
50
+ use_ai_detector: Whether to use pre-trained AI detector
51
+ """
52
+ self.device = get_device()
53
+ self.use_clip = use_clip
54
+ self.use_ai_detector = use_ai_detector
55
+
56
+ # Models loaded lazily on first use
57
+ self._clip_loaded = False
58
+ self._detector_loaded = False
59
+
60
+ def _load_clip(self):
61
+ """Load CLIP model for zero-shot classification."""
62
+ if self._clip_loaded:
63
+ return
64
+
65
+ global _clip_model, _clip_processor
66
+
67
+ if _clip_model is None:
68
+ from transformers import CLIPProcessor, CLIPModel
69
+ print("Loading CLIP model...")
70
+ _clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
71
+ _clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
72
+ _clip_model = _clip_model.to(self.device)
73
+ _clip_model.eval()
74
+ print("CLIP model loaded.")
75
+
76
+ self._clip_loaded = True
77
+
78
+ def _load_ai_detector(self):
79
+ """Load pre-trained AI image detector."""
80
+ if self._detector_loaded:
81
+ return
82
+
83
+ global _ai_detector, _ai_detector_processor
84
+
85
+ if _ai_detector is None:
86
+ from transformers import AutoModelForImageClassification, AutoImageProcessor
87
+ print("Loading AI image detector...")
88
+
89
+ # Try different models in order of preference
90
+ models_to_try = [
91
+ "umm-maybe/AI-image-detector", # General AI detector
92
+ "Organika/sdxl-detector", # SDXL specific
93
+ ]
94
+
95
+ for model_name in models_to_try:
96
+ try:
97
+ _ai_detector = AutoModelForImageClassification.from_pretrained(model_name)
98
+ _ai_detector_processor = AutoImageProcessor.from_pretrained(model_name)
99
+ _ai_detector = _ai_detector.to(self.device)
100
+ _ai_detector.eval()
101
+ print(f"Loaded AI detector: {model_name}")
102
+ break
103
+ except Exception as e:
104
+ print(f"Failed to load {model_name}: {e}")
105
+ continue
106
+
107
+ if _ai_detector is None:
108
+ print("Warning: No AI detector model available. Using CLIP only.")
109
+ self.use_ai_detector = False
110
+
111
+ self._detector_loaded = True
112
+
113
+ def analyze_with_clip(self, image: Image.Image) -> Dict:
114
+ """
115
+ Use CLIP for zero-shot AI image detection.
116
+
117
+ Research shows CLIP can detect AI images by comparing embeddings
118
+ to text descriptions like "AI generated image" vs "real photograph".
119
+ """
120
+ self._load_clip()
121
+
122
+ # Text prompts for classification
123
+ # Based on research: be specific about what we're looking for
124
+ text_prompts = [
125
+ "a real photograph taken by a camera",
126
+ "an AI generated image, synthetic, artificial, computer generated",
127
+ ]
128
+
129
+ inputs = _clip_processor(
130
+ text=text_prompts,
131
+ images=image,
132
+ return_tensors="pt",
133
+ padding=True
134
+ )
135
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
136
+
137
+ with torch.no_grad():
138
+ outputs = _clip_model(**inputs)
139
+ logits_per_image = outputs.logits_per_image
140
+ probs = F.softmax(logits_per_image, dim=1)
141
+
142
+ # prob[0] = real, prob[1] = AI
143
+ probs = probs.cpu().numpy()[0]
144
+
145
+ return {
146
+ "clip_real_prob": float(probs[0]),
147
+ "clip_fake_prob": float(probs[1]),
148
+ "clip_score": float(probs[1]), # Higher = more likely AI
149
+ }
150
+
151
+ def analyze_with_detector(self, image: Image.Image) -> Dict:
152
+ """
153
+ Use pre-trained AI image detector.
154
+ """
155
+ self._load_ai_detector()
156
+
157
+ if _ai_detector is None:
158
+ return {"detector_score": 0.5, "detector_available": False}
159
+
160
+ inputs = _ai_detector_processor(images=image, return_tensors="pt")
161
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
162
+
163
+ with torch.no_grad():
164
+ outputs = _ai_detector(**inputs)
165
+ logits = outputs.logits
166
+ probs = F.softmax(logits, dim=1)
167
+
168
+ probs = probs.cpu().numpy()[0]
169
+
170
+ # Model typically has labels like ['artificial', 'human'] or similar
171
+ # Check the label order
172
+ labels = _ai_detector.config.id2label
173
+
174
+ # Find which index corresponds to AI/fake
175
+ fake_idx = None
176
+ for idx, label in labels.items():
177
+ if any(kw in label.lower() for kw in ['artificial', 'ai', 'fake', 'synthetic', 'generated']):
178
+ fake_idx = idx
179
+ break
180
+
181
+ if fake_idx is None:
182
+ # Assume index 0 is AI (common convention)
183
+ fake_idx = 0
184
+
185
+ return {
186
+ "detector_score": float(probs[fake_idx]),
187
+ "detector_probs": {labels[i]: float(probs[i]) for i in range(len(probs))},
188
+ "detector_available": True,
189
+ }
190
+
191
+ def analyze(self, image_path: str) -> Dict:
192
+ """
193
+ Analyze an image for AI generation.
194
+
195
+ Args:
196
+ image_path: Path to the image file
197
+
198
+ Returns:
199
+ Dict with detection results and aggregate score
200
+ """
201
+ # Load image
202
+ image = Image.open(image_path).convert("RGB")
203
+
204
+ results = {}
205
+ scores = []
206
+ weights = []
207
+
208
+ # CLIP analysis
209
+ if self.use_clip:
210
+ try:
211
+ clip_results = self.analyze_with_clip(image)
212
+ results.update(clip_results)
213
+ scores.append(clip_results["clip_score"])
214
+ weights.append(0.4) # CLIP weight
215
+ except Exception as e:
216
+ results["clip_error"] = str(e)
217
+
218
+ # Pre-trained detector analysis
219
+ if self.use_ai_detector:
220
+ try:
221
+ detector_results = self.analyze_with_detector(image)
222
+ results.update(detector_results)
223
+ if detector_results.get("detector_available", False):
224
+             scores.append(detector_results["detector_score"])
+             weights.append(0.6)  # Pre-trained detector weight (higher trust)
+         except Exception as e:
+             results["detector_error"] = str(e)
+
+         # Compute aggregate score
+         if scores:
+             # Weighted average
+             total_weight = sum(weights)
+             aggregate = sum(s * w for s, w in zip(scores, weights)) / total_weight
+             results["neural_aggregate_score"] = float(aggregate)
+         else:
+             results["neural_aggregate_score"] = 0.5  # Neutral if no models worked
+
+         return results
+
+
+ class DINOv2Detector:
+     """
+     DINOv2-based detector for AI image detection.
+
+     Research suggests DINOv2 features are highly discriminative for AI vs real
+     images. Ideally DINOv2 would feed a trained classifier head; since no
+     labeled training is done here, the model runs in feature-extraction mode
+     and its feature statistics are combined with the other signals.
+     """
+
+     def __init__(self):
+         self.device = get_device()
+         self.model = None
+         self.processor = None
+
+     def _load_model(self):
+         if self.model is not None:
+             return
+
+         from transformers import AutoImageProcessor, AutoModel
+         print("Loading DINOv2 model...")
+
+         # Use the smaller variant so it runs on CPU
+         model_name = "facebook/dinov2-small"
+
+         self.processor = AutoImageProcessor.from_pretrained(model_name)
+         self.model = AutoModel.from_pretrained(model_name)
+         self.model = self.model.to(self.device)
+         self.model.eval()
+         print("DINOv2 model loaded.")
+
+     def extract_features(self, image_path: str) -> np.ndarray:
+         """Extract DINOv2 features from an image."""
+         self._load_model()
+
+         image = Image.open(image_path).convert("RGB")
+         inputs = self.processor(images=image, return_tensors="pt")
+         inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.model(**inputs)
+             # Use the CLS token as the image representation
+             features = outputs.last_hidden_state[:, 0, :]
+
+         return features.cpu().numpy()[0]
+
+     def analyze(self, image_path: str) -> Dict:
+         """
+         Analyze an image using DINOv2 features.
+
+         Since there is no trained classifier, we use statistical properties
+         of the features that research suggests differ between AI and real images.
+         """
+         features = self.extract_features(image_path)
+
+         # Research insight: AI images tend to have more uniform feature
+         # distributions; real images have more varied, scene-specific features.
+
+         feature_std = np.std(features)
+         feature_kurtosis = self._kurtosis(features)
+         feature_entropy = self._entropy(features)
+
+         # Normalize to 0-1 scores.
+         # Based on empirical observation: AI images have lower std and lower
+         # kurtosis. These thresholds would need calibration on actual data.
+         std_score = 1 - np.clip(feature_std / 1.0, 0, 1)  # Lower std = more suspicious
+         kurtosis_score = 1 - np.clip((feature_kurtosis + 2) / 6, 0, 1)  # Lower kurtosis = suspicious
+
+         # Weighted combination
+         dino_score = 0.6 * std_score + 0.4 * kurtosis_score
+
+         return {
+             "dino_feature_std": float(feature_std),
+             "dino_feature_kurtosis": float(feature_kurtosis),
+             "dino_feature_entropy": float(feature_entropy),
+             "dino_score": float(np.clip(dino_score, 0, 1)),
+         }
+
+     def _kurtosis(self, x):
+         """Compute the excess kurtosis of an array."""
+         n = len(x)
+         mean = np.mean(x)
+         std = np.std(x)
+         if std == 0:
+             return 0
+         return np.sum(((x - mean) / std) ** 4) / n - 3
+
+     def _entropy(self, x):
+         """Compute the entropy of the feature distribution."""
+         # Discretize features into bins
+         hist, _ = np.histogram(x, bins=50, density=True)
+         hist = hist[hist > 0]
+         return -np.sum(hist * np.log2(hist + 1e-10))
+
+
+ def test_neural_detector():
+     """Test the neural detector on sample images."""
+     import glob
+     import os
+
+     detector = NeuralDetector()
+
+     # Find test images
+     fake_images = glob.glob("/home/omer_aims_ac_za/digital-integrity-challenge/data/ai_generated_v2/*.png")[:5]
+     real_images = glob.glob("/home/omer_aims_ac_za/digital-integrity-challenge/data/real/*.jpg")[:5]
+
+     print("\n=== Testing on FAKE images ===")
+     fake_scores = []
+     for img_path in fake_images:
+         results = detector.analyze(img_path)
+         score = results.get("neural_aggregate_score", 0.5)
+         fake_scores.append(score)
+         print(f"{os.path.basename(img_path)}: {score:.3f}")
+
+     print("\n=== Testing on REAL images ===")
+     real_scores = []
+     for img_path in real_images:
+         results = detector.analyze(img_path)
+         score = results.get("neural_aggregate_score", 0.5)
+         real_scores.append(score)
+         print(f"{os.path.basename(img_path)}: {score:.3f}")
+
+     print("\n=== Summary ===")
+     print(f"FAKE avg: {np.mean(fake_scores):.3f}")
+     print(f"REAL avg: {np.mean(real_scores):.3f}")
+     print(f"Separation: {np.mean(fake_scores) - np.mean(real_scores):.3f}")
+
+     # A good detector should score FAKE higher than REAL
+     accuracy = (sum(1 for s in fake_scores if s >= 0.5) + sum(1 for s in real_scores if s < 0.5)) / (len(fake_scores) + len(real_scores))
+     print(f"Accuracy (threshold=0.5): {accuracy*100:.1f}%")
+
+
+ if __name__ == "__main__":
+     test_neural_detector()
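For reference, a minimal usage sketch of the DINOv2 detector above, assuming this file is importable as src.neural.detector (the sample image path is hypothetical):

# Minimal sketch, assuming src/neural/detector.py exposes DINOv2Detector.
from src.neural.detector import DINOv2Detector

detector = DINOv2Detector()
report = detector.analyze("data/real/sample.jpg")  # hypothetical file
# dino_score is in [0, 1]; higher means more AI-like under the uncalibrated
# std/kurtosis heuristic described in the class docstring.
print(f"dino_score={report['dino_score']:.3f}, "
      f"std={report['dino_feature_std']:.3f}, "
      f"kurtosis={report['dino_feature_kurtosis']:.3f}")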
src/vlm/__init__.py ADDED
File without changes
src/vlm/reasoner.py ADDED
@@ -0,0 +1,636 @@
+ """
+ Module 2: VLM Logic Reasoner
+ Semantic-level analysis using Vision-Language Models.
+
+ Local models only (no API keys required for competition).
+ TPU support via JAX for PaliGemma models.
+ Models are tried from largest to smallest, so the strongest available
+ backend wins and smaller models serve as disk/memory fallbacks.
+ """
+
+ import re
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
+ from typing import Dict, List
+
+ # VLM inference timeout in seconds
+ VLM_TIMEOUT_SECONDS = 60
+
+ # ============================================================================
+ # RESEARCH-BASED PROMPT TEMPLATES
+ # ============================================================================
+
+ # Real-estate-specific prompt (optimized for smaller models)
+ REAL_ESTATE_PROMPT = """Analyze this real estate image for AI manipulation or virtual staging.
+
+ Check these red flags:
+ 1. Do furniture shadows match light sources?
+ 2. Are wall/floor textures unnaturally smooth?
+ 3. Do reflections look consistent?
+ 4. Are furniture edges blended naturally?
+ 5. Is the scale/proportion realistic?
+
+ Respond in this format:
+ MANIPULATION_DETECTED: YES or NO or UNCERTAIN
+ CONFIDENCE: HIGH or MEDIUM or LOW
+ MANIPULATION_TYPE: authentic or virtual_staging or inpainting or full_synthesis
+ REASONING: One sentence explaining why."""
+
+ # Simple prompt for basic models
+ SIMPLE_PROMPT = """Is this real estate image real or AI-generated?
+ Check shadows, textures, and reflections.
+ Answer: REAL or FAKE, then explain briefly."""
+
+
+ class VLMReasoner:
+     """Uses local VLMs to detect semantic anomalies. TPU-optimized."""
+
+     # Model priority: largest/best first for better reasoning
+     MODEL_PRIORITY = [
+         "qwen2vl",    # Best: 72B/7B available
+         "paligemma",  # Good: 28B/10B available
+         "blip2",      # Fallback: 2.7B
+         "mock",       # Last resort
+     ]
+
+     def __init__(self, backend: str = "auto", use_tpu: bool = True):
+         """
+         Initialize VLM reasoner.
+
+         Args:
+             backend: Model to use ("auto", "blip2", "paligemma", "qwen2vl", "mock")
+             use_tpu: Whether to use TPU if available (for JAX models)
+         """
+         self.use_tpu = use_tpu
+         self.backend = self._detect_backend(backend)
+         self.model = None
+         self.processor = None
+         self.device = None
+         self._init_backend()
+
+     def _detect_backend(self, backend: str) -> str:
+         """Detect the best available backend, trying the strongest models first."""
+         if backend != "auto":
+             return backend
+
+         # Auto-detect: walk MODEL_PRIORITY (largest/best first)
+         for model in self.MODEL_PRIORITY:
+             if model == "mock":
+                 return "mock"
+             if self._check_model_available(model):
+                 return model
+
+         return "mock"
+
+     def _check_model_available(self, model: str) -> bool:
+         """Check if model dependencies are available."""
+         try:
+             if model == "blip2":
+                 from transformers import Blip2Processor
+                 return True
+             elif model == "paligemma":
+                 # Check for JAX (TPU) or PyTorch
+                 try:
+                     import jax
+                     return True
+                 except ImportError:
+                     pass
+                 try:
+                     from transformers import PaliGemmaForConditionalGeneration
+                     return True
+                 except ImportError:
+                     pass
+                 return False
+             elif model == "qwen2vl":
+                 from transformers import AutoProcessor
+                 return True
+         except ImportError:
+             return False
+         return False
+
+     def _init_backend(self):
+         """Initialize the selected backend."""
+         print(f"Initializing VLM backend: {self.backend}")
+
+         try:
+             if self.backend == "blip2":
+                 self._init_blip2()
+             elif self.backend == "paligemma":
+                 self._init_paligemma()
+             elif self.backend == "qwen2vl":
+                 self._init_qwen2vl()
+             elif self.backend == "mock":
+                 print("Using mock VLM backend (forensics only)")
+         except Exception as e:
+             print(f"Failed to initialize {self.backend}: {e}")
+             print("Falling back to next available backend...")
+             self._fallback_init()
+
+     def _fallback_init(self):
+         """Try fallback backends in order."""
+         for model in self.MODEL_PRIORITY:
+             if model == self.backend:
+                 continue
+             try:
+                 print(f"Trying fallback: {model}")
+                 self.backend = model
+                 if model == "blip2":
+                     self._init_blip2()
+                 elif model == "paligemma":
+                     self._init_paligemma()
+                 elif model == "qwen2vl":
+                     self._init_qwen2vl()
+                 elif model == "mock":
+                     return
+                 print(f"Fallback {model} initialized!")
+                 return
+             except Exception as e:
+                 print(f"Fallback {model} failed: {e}")
+                 continue
+
+         print("All backends failed. Using mock.")
+         self.backend = "mock"
+
+     def _get_device(self):
+         """Detect the best available device."""
+         import torch
+         if torch.cuda.is_available():
+             return "cuda"
+         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+             return "mps"
+         return "cpu"
+
+     def _check_tpu_available(self) -> bool:
+         """Check if a TPU is available via JAX."""
+         if not self.use_tpu:
+             return False
+         try:
+             import jax
+             devices = jax.devices()
+             return any("Tpu" in str(d) for d in devices)
+         except Exception:
+             return False
+
+     def _init_blip2(self):
+         """Initialize BLIP-2 (smallest, ~5GB)."""
+         from transformers import Blip2Processor, Blip2ForConditionalGeneration
+         import torch
+
+         model_id = "Salesforce/blip2-opt-2.7b"
+         print(f"Loading {model_id}...")
+
+         self.device = self._get_device()
+         dtype = torch.float16 if self.device == "cuda" else torch.float32
+
+         self.processor = Blip2Processor.from_pretrained(model_id)
+         self.model = Blip2ForConditionalGeneration.from_pretrained(
+             model_id,
+             torch_dtype=dtype,
+             device_map="auto" if self.device == "cuda" else None,
+             low_cpu_mem_usage=True,
+         )
+
+         if self.device != "cuda":
+             self.model = self.model.to(self.device)
+
+         self.model.eval()
+         print(f"BLIP-2 loaded on {self.device}!")
+
+     def _init_paligemma(self):
+         """Initialize PaliGemma with TPU support via JAX, or PyTorch fallback."""
+         if self._check_tpu_available():
+             self._init_paligemma_jax()
+         else:
+             self._init_paligemma_torch()
+
+     def _init_paligemma_jax(self):
+         """Initialize PaliGemma using JAX for TPU."""
+         print("Initializing PaliGemma with JAX/TPU...")
+
+         try:
+             # Imported only to verify the JAX/big_vision stack is present
+             import jax
+             import jax.numpy as jnp
+             from transformers import AutoProcessor
+             from big_vision.models.proj.paligemma import paligemma
+             from big_vision.trainers.proj.paligemma import predict_fns
+
+             # Use the smallest PaliGemma model
+             model_id = "google/paligemma-3b-pt-224"
+
+             self.processor = AutoProcessor.from_pretrained(model_id)
+             # JAX weight loading via big_vision is not implemented yet, so
+             # fall back to PyTorch instead of reporting a loaded TPU model
+             # while self.model is still None.
+             print("JAX weight loading not implemented; using PyTorch PaliGemma...")
+             self._init_paligemma_torch()
+
+         except ImportError as e:
+             print(f"JAX PaliGemma not available: {e}")
+             print("Falling back to PyTorch PaliGemma...")
+             self._init_paligemma_torch()
+
+     def _init_paligemma_torch(self):
+         """Initialize PaliGemma using PyTorch."""
+         print("Initializing PaliGemma with PyTorch...")
+
+         from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+         import torch
+
+         self.device = self._get_device()
+
+         # Prefer the larger PaliGemma models (bigger = better reasoning)
+         model_candidates = [
+             "google/paligemma2-28b-pt-896",  # ~56GB, best
+             "google/paligemma2-10b-pt-448",  # ~20GB, good balance
+             "google/paligemma-3b-pt-448",    # ~6GB, fallback
+             "google/paligemma-3b-pt-224",    # ~6GB, smallest
+         ]
+
+         dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
+
+         for model_id in model_candidates:
+             try:
+                 print(f"Trying {model_id}...")
+                 self.processor = AutoProcessor.from_pretrained(model_id)
+                 self.model = PaliGemmaForConditionalGeneration.from_pretrained(
+                     model_id,
+                     torch_dtype=dtype,
+                     device_map="auto" if self.device == "cuda" else None,
+                     low_cpu_mem_usage=True,
+                 )
+
+                 if self.device != "cuda":
+                     self.model = self.model.to(self.device)
+
+                 self.model.eval()
+                 print(f"PaliGemma loaded: {model_id} on {self.device}!")
+                 return
+             except Exception as e:
+                 print(f"{model_id} failed: {e}")
+                 continue
+
+         raise RuntimeError("Could not load any PaliGemma model")
+
+     def _init_qwen2vl(self):
+         """Initialize Qwen2-VL, preferring the largest variant that loads."""
+         import torch
+
+         try:
+             from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
+         except ImportError:
+             from transformers import AutoModelForVision2Seq, AutoProcessor
+             Qwen2VLForConditionalGeneration = AutoModelForVision2Seq
+
+         self.device = self._get_device()
+
+         # Prefer the larger Qwen2-VL models (bigger = better reasoning)
+         model_candidates = [
+             "Qwen/Qwen2-VL-72B-Instruct",  # ~140GB, best quality
+             "Qwen/Qwen2-VL-7B-Instruct",   # ~14GB, good balance
+             "Qwen/Qwen2-VL-2B-Instruct",   # ~4GB, fallback
+         ]
+
+         dtype = torch.float16 if self.device == "cuda" else torch.float32
+
+         for model_id in model_candidates:
+             try:
+                 print(f"Trying {model_id}...")
+
+                 self.processor = AutoProcessor.from_pretrained(
+                     model_id, trust_remote_code=True
+                 )
+
+                 self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+                     model_id,
+                     torch_dtype=dtype,
+                     device_map="auto" if self.device == "cuda" else None,
+                     trust_remote_code=True,
+                     low_cpu_mem_usage=True,
+                 )
+
+                 if self.device != "cuda":
+                     self.model = self.model.to(self.device)
+
+                 self.model.eval()
+                 print(f"Qwen2-VL loaded: {model_id} on {self.device}!")
+                 return
+             except Exception as e:
+                 print(f"{model_id} failed: {e}")
+                 continue
+
+         raise RuntimeError("Could not load any Qwen2-VL model")
+
+     def analyze(self, image_path: str) -> Dict:
+         """Analyze an image for manipulation, with timeout protection."""
+         if self.backend == "mock":
+             return self._analyze_mock(image_path)
+
+         def _run_analysis():
+             if self.backend == "blip2":
+                 return self._analyze_blip2(image_path)
+             elif self.backend == "paligemma":
+                 return self._analyze_paligemma(image_path)
+             elif self.backend == "qwen2vl":
+                 return self._analyze_qwen2vl(image_path)
+             else:
+                 return self._analyze_mock(image_path)
+
+         try:
+             with ThreadPoolExecutor(max_workers=1) as executor:
+                 future = executor.submit(_run_analysis)
+                 return future.result(timeout=VLM_TIMEOUT_SECONDS)
+         except FuturesTimeoutError:
+             print(f"VLM inference timed out after {VLM_TIMEOUT_SECONDS}s")
+             return self._analyze_mock(image_path)
+         except Exception as e:
+             print(f"Analysis error: {e}")
+             return self._analyze_mock(image_path)
+
+     def _analyze_blip2(self, image_path: str) -> Dict:
+         """Analyze using BLIP-2 with a multi-question approach."""
+         from PIL import Image
+         import torch
+
+         image = Image.open(image_path).convert("RGB")
+
+         # Questions for explainability - describe what the VLM sees
+         questions = [
+             ("Question: Describe the lighting and shadows in this image. Answer:", "lighting"),
+             ("Question: Describe the textures in this image. Answer:", "texture"),
+         ]
+
+         answers = []
+         reasoning_parts = []
+
+         for q, category in questions:
+             try:
+                 inputs = self.processor(image, text=q, return_tensors="pt")
+                 if self.device:
+                     inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
+                               for k, v in inputs.items()}
+
+                 with torch.no_grad():
+                     generated_ids = self.model.generate(**inputs, max_new_tokens=20)
+
+                 answer = self.processor.batch_decode(
+                     generated_ids, skip_special_tokens=True
+                 )[0].strip()
+
+                 # Extract just the answer part
+                 if "Answer:" in answer:
+                     answer = answer.split("Answer:")[-1].strip()
+
+                 answers.append((category, answer.lower()))
+
+                 # Collect reasoning
+                 if len(answer) > 5:
+                     reasoning_parts.append(f"{category}: {answer[:60]}")
+             except Exception:
+                 continue
+
+         return self._aggregate_blip2_responses(answers, reasoning_parts)
+
+     def _aggregate_blip2_responses(self, qa_pairs: List, reasoning_parts: List) -> Dict:
+         """Aggregate BLIP-2 responses - focus on explainability, not detection."""
+         # BLIP-2 is used for EXPLAINABILITY (30% of the competition score).
+         # Detection is handled by forensics - the VLM provides reasoning.
+
+         # Look for anomaly indicators in the descriptions
+         anomaly_words = ["inconsistent", "unusual", "strange", "artificial",
+                          "smooth", "unnatural", "blurry", "distorted"]
+         normal_words = ["natural", "realistic", "consistent", "detailed",
+                         "normal", "clear", "sharp"]
+
+         anomaly_score = 0
+         normal_score = 0
+
+         for category, answer in qa_pairs:
+             anomaly_score += sum(1 for w in anomaly_words if w in answer)
+             normal_score += sum(1 for w in normal_words if w in answer)
+
+         # Build descriptive reasoning from the VLM responses
+         reasoning = ". ".join(reasoning_parts[:3]) if reasoning_parts else "Visual analysis completed."
+
+         # Whatever the word balance, BLIP-2 only emits a weak "uncertain"
+         # signal so that fusion lets forensics decide; the counts above are
+         # kept for future calibration.
+         detection = "uncertain"
+         confidence = "low"
+
+         return {
+             "manipulation_detected": detection,
+             "confidence": confidence,
+             "manipulation_type": "unknown",
+             "reasoning": reasoning[:200],
+         }
+
+     def _analyze_paligemma(self, image_path: str) -> Dict:
+         """Analyze using PaliGemma."""
+         from PIL import Image
+         import torch
+
+         image = Image.open(image_path).convert("RGB")
+
+         # Multi-question approach
+         questions = [
+             ("Is this image real or AI-generated?", "main"),
+             ("Are there shadow inconsistencies?", "shadow"),
+             ("Are textures unnaturally smooth?", "texture"),
+         ]
+
+         answers = []
+         for prompt, category in questions:
+             try:
+                 inputs = self.processor(text=prompt, images=image, return_tensors="pt")
+                 inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+                 with torch.no_grad():
+                     outputs = self.model.generate(**inputs, max_new_tokens=50)
+
+                 response = self.processor.decode(outputs[0], skip_special_tokens=True)
+                 answers.append((category, response.lower()))
+             except Exception:
+                 continue
+
+         return self._aggregate_qa_responses(answers)
+
+     def _analyze_qwen2vl(self, image_path: str) -> Dict:
+         """Analyze using Qwen2-VL."""
+         from PIL import Image
+         import torch
+
+         image = Image.open(image_path).convert("RGB")
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": image},
+                     {"type": "text", "text": REAL_ESTATE_PROMPT}
+                 ]
+             }
+         ]
+
+         text = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         inputs = self.processor(
+             text=[text], images=[image], return_tensors="pt", padding=True
+         )
+
+         if self.device:
+             inputs = {k: v.to(self.device) if hasattr(v, 'to') else v
+                       for k, v in inputs.items()}
+
+         with torch.no_grad():
+             outputs = self.model.generate(**inputs, max_new_tokens=200)
+
+         response = self.processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+         if "assistant" in response.lower():
+             response = response.split("assistant")[-1].strip()
+
+         return self._parse_structured_response(response)
+
+     def _analyze_mock(self, image_path: str) -> Dict:
+         """Mock analysis when no VLM is available."""
+         return {
+             "manipulation_detected": "uncertain",
+             "confidence": "low",
+             "manipulation_type": "unknown",
+             "reasoning": "VLM backend unavailable - using forensic signals only."
+         }
+
+     def _aggregate_qa_responses(self, qa_pairs: List) -> Dict:
+         """Aggregate multi-question responses into a final result."""
+         fake_signals = ["generated", "fake", "artificial", "synthetic", "manipulated",
+                         "artifacts", "unnatural", "inconsistent", "smooth", "yes"]
+         real_signals = ["real", "natural", "authentic", "consistent", "genuine",
+                         "photograph", "no", "match", "normal"]
+
+         fake_score = 0
+         real_score = 0
+         staging_detected = False
+         reasoning_parts = []
+
+         for category, answer in qa_pairs:
+             answer_lower = answer.lower()
+
+             fake_in = sum(1 for s in fake_signals if s in answer_lower)
+             real_in = sum(1 for s in real_signals if s in answer_lower)
+
+             # Weight the main question more
+             weight = 2 if category == "main" else 1
+             fake_score += fake_in * weight
+             real_score += real_in * weight
+
+             # Only fires when a backend asks a question with category "staging"
+             if category == "staging" and fake_in > 0:
+                 staging_detected = True
+
+             if category in ["shadow", "texture"] and len(answer) > 10:
+                 reasoning_parts.append(answer[:60])
+
+         # Determine verdict
+         if fake_score > real_score + 2:
+             detection = "yes"
+             confidence = "high" if fake_score > 5 else "medium"
+         elif real_score > fake_score + 2:
+             detection = "no"
+             confidence = "high" if real_score > 5 else "medium"
+         else:
+             detection = "uncertain"
+             confidence = "low"
+
+         # Determine type
+         if staging_detected:
+             manip_type = "virtual_staging"
+         elif detection == "yes":
+             manip_type = "manipulation_detected"
+         else:
+             manip_type = "authentic"
+
+         reasoning = " ".join(reasoning_parts)[:200] or "Visual analysis completed."
+
+         return {
+             "manipulation_detected": detection,
+             "confidence": confidence,
+             "manipulation_type": manip_type,
+             "reasoning": reasoning,
+         }
+
+     def _parse_structured_response(self, response: str) -> Dict:
+         """Parse a structured VLM response."""
+         result = {
+             "manipulation_detected": "uncertain",
+             "confidence": "low",
+             "manipulation_type": "unknown",
+             "reasoning": ""
+         }
+
+         lines = response.split('\n')
+
+         # Parse MANIPULATION_DETECTED / VERDICT
+         for line in lines:
+             line_upper = line.upper()
+             if 'MANIPULATION_DETECTED:' in line_upper or 'VERDICT:' in line_upper:
+                 if 'YES' in line_upper or 'FAKE' in line_upper:
+                     result["manipulation_detected"] = "yes"
+                 elif 'NO' in line_upper or 'REAL' in line_upper:
+                     result["manipulation_detected"] = "no"
+                 break
+
+         # Fallback keyword detection
+         if result["manipulation_detected"] == "uncertain":
+             text_lower = response.lower()
+             fake_kw = ["manipulated", "fake", "generated", "synthetic", "staged"]
+             real_kw = ["authentic", "genuine", "real photograph", "not manipulated"]
+
+             if any(kw in text_lower for kw in fake_kw):
+                 result["manipulation_detected"] = "yes"
+             elif any(kw in text_lower for kw in real_kw):
+                 result["manipulation_detected"] = "no"
+
+         # Parse CONFIDENCE
+         for line in lines:
+             if 'CONFIDENCE:' in line.upper():
+                 if 'HIGH' in line.upper():
+                     result["confidence"] = "high"
+                 elif 'MEDIUM' in line.upper():
+                     result["confidence"] = "medium"
+                 break
+
+         # Parse TYPE
+         for line in lines:
+             if 'MANIPULATION_TYPE:' in line.upper() or 'TYPE:' in line.upper():
+                 type_val = line.split(':', 1)[-1].strip().lower().replace(" ", "_")
+                 if type_val in ["authentic", "virtual_staging", "inpainting", "full_synthesis"]:
+                     result["manipulation_type"] = type_val
+                 break
+
+         if result["manipulation_type"] == "unknown":
+             result["manipulation_type"] = (
+                 "manipulation_detected" if result["manipulation_detected"] == "yes"
+                 else "authentic"
+             )
+
+         # Parse REASONING
+         for line in lines:
+             if line.upper().startswith('REASONING:') or line.upper().startswith('REASON:'):
+                 result["reasoning"] = line.split(':', 1)[-1].strip()
+                 break
+
+         if not result["reasoning"]:
+             # Extract evidence sentences
+             sentences = re.split(r'[.!?]', response)
+             evidence = [s.strip() for s in sentences
+                         if any(kw in s.lower() for kw in
+                                ["shadow", "light", "texture", "reflect", "artifact"])]
+             result["reasoning"] = ". ".join(evidence[:2])[:200] or "Analysis completed."
+
+         return result
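A minimal usage sketch of VLMReasoner and the result shape every backend (including mock) returns; the sample image path is hypothetical:

# Usage sketch for VLMReasoner (the sample path is hypothetical).
from src.vlm.reasoner import VLMReasoner

reasoner = VLMReasoner(backend="auto", use_tpu=False)  # degrades to "mock" if no VLM installs
verdict = reasoner.analyze("data/real/sample.jpg")
# All backends return the same four keys:
#   manipulation_detected: "yes" | "no" | "uncertain"
#   confidence:            "high" | "medium" | "low"
#   manipulation_type:     "authentic" | "virtual_staging" | "inpainting" | ...
#   reasoning:             short free-text explanation (<= 200 chars)
print(verdict["manipulation_detected"], verdict["confidence"])
print(verdict["reasoning"])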
test_ensemble.py ADDED
@@ -0,0 +1,128 @@
+ #!/usr/bin/env python3
+ """Test an ensemble of CLIP + forensics."""
+
+ import sys
+ sys.path.insert(0, '.')
+
+ from pathlib import Path
+ from PIL import Image
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+ from src.forensics.detector import ForensicDetector
+
+ REAL_DIR = Path("data/real")
+ FAKE_DIR = Path("data/ai_generated_v2")
+
+ def load_images(directory, pattern="*"):
+     images = []
+     extensions = {'.jpg', '.jpeg', '.png', '.webp'}
+     for ext in extensions:
+         for f in directory.glob(f"{pattern}{ext}"):
+             try:
+                 img = Image.open(f).convert("RGB")
+                 images.append((f.name, f, img))
+             except Exception:
+                 pass
+     return images
+
+ def main():
+     print("Loading models...")
+
+     # CLIP
+     model = CLIPModel.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
+     processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = model.to(device)
+     model.eval()
+
+     labels = [
+         "a real photograph",
+         "an AI-generated image",
+         "a computer-generated image",
+         "a synthetic image created by artificial intelligence"
+     ]
+
+     # Forensics
+     forensic = ForensicDetector()
+
+     # Load images
+     real_estate = load_images(REAL_DIR)
+     fake_v2 = load_images(FAKE_DIR, "*_fake_*")
+     real_v2 = load_images(FAKE_DIR, "*_real_*")
+
+     all_real = real_estate + real_v2
+     all_fake = fake_v2
+
+     print(f"Testing {len(all_real)} real, {len(all_fake)} fake images")
+
+     results = []
+
+     for label, images, is_fake in [("REAL", all_real, False), ("FAKE", all_fake, True)]:
+         print(f"\n=== {label} ===")
+         for name, path, img in images:
+             # CLIP score
+             inputs = processor(text=labels, images=img, return_tensors="pt", padding=True).to(device)
+             with torch.no_grad():
+                 outputs = model(**inputs)
+                 probs = outputs.logits_per_image.softmax(dim=1).cpu().numpy()[0]
+
+             real_prob = probs[0]
+             ai_prob = max(probs[1], probs[2], probs[3])
+             clip_score = ai_prob / (real_prob + ai_prob + 1e-10)
+
+             # Forensic score
+             forensic_results = forensic.analyze(str(path))
+             forensic_score = forensic_results['aggregate_score']
+
+             # One record per image; ensemble scores are derived in the summary
+             results.append({
+                 'name': name,
+                 'is_fake': is_fake,
+                 'clip': clip_score,
+                 'forensic': forensic_score,
+             })
+
+             print(f"{name}: CLIP={clip_score:.3f}, Forensic={forensic_score:.3f}")
+
+     # Calculate accuracies
+     print("\n" + "="*60)
+     print("ACCURACY SUMMARY")
+     print("="*60)
+
+     for method in ['clip', 'forensic', 'ensemble_0.7', 'ensemble_0.8', 'ensemble_0.9']:
+         real_correct = real_total = 0
+         fake_correct = fake_total = 0
+
+         for r in results:
+             if method.startswith('ensemble'):
+                 # Convex combination: w * CLIP + (1 - w) * forensics
+                 w = float(method.split('_')[1])
+                 score = w * r['clip'] + (1 - w) * r['forensic']
+             else:
+                 score = r[method]
+
+             if r['is_fake']:
+                 fake_total += 1
+                 if score >= 0.5:
+                     fake_correct += 1
+             else:
+                 real_total += 1
+                 if score < 0.5:
+                     real_correct += 1
+
+         total = real_total + fake_total
+         overall = (real_correct + fake_correct) / total * 100 if total > 0 else 0
+         print(f"{method:20s}: Real {real_correct}/{real_total}, Fake {fake_correct}/{fake_total}, Overall {overall:.1f}%")
+
+ if __name__ == "__main__":
+     main()
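The ensemble line above is a plain convex combination of the two scores. A worked example with made-up scores (both values are hypothetical):

# Worked example of the convex ensemble above (scores are made up).
clip_score = 0.82      # CLIP zero-shot AI-likelihood
forensic_score = 0.35  # forensic aggregate score
for w_clip in (0.7, 0.8, 0.9):
    ensemble = w_clip * clip_score + (1 - w_clip) * forensic_score
    print(f"w_clip={w_clip}: ensemble={ensemble:.3f}")
# Prints 0.679, 0.726, 0.773 - all flagged fake at the 0.5 threshold, so the
# CLIP-heavy weighting dominates when the two signals disagree.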
test_forensics.py ADDED
@@ -0,0 +1,25 @@
+ #!/usr/bin/env python3
+ """Quick test of the forensic module."""
+
+ import sys
+ sys.path.insert(0, '.')
+
+ from src.forensics.detector import ForensicDetector
+
+ def test_with_image(image_path):
+     print(f"Testing with: {image_path}")
+     detector = ForensicDetector()
+     results = detector.analyze(image_path)
+
+     print("\nForensic Analysis Results:")
+     for key, value in results.items():
+         if isinstance(value, float):
+             print(f"  {key}: {value:.3f}")
+         else:
+             print(f"  {key}: {value}")
+
+     return results
+
+ if __name__ == "__main__":
+     if len(sys.argv) > 1:
+         test_with_image(sys.argv[1])
+     else:
+         print("Usage: python test_forensics.py <image_path>")
+         print("\nTo test, download a sample image first")
test_pretrained_detectors.py ADDED
@@ -0,0 +1,302 @@
+ #!/usr/bin/env python3
+ """
+ Test pre-trained AI image detectors on Flux-generated images.
+ No fine-tuning - just evaluation of existing models.
+ """
+
+ import json
+ import time
+ from pathlib import Path
+ from PIL import Image
+ import torch
+ from transformers import pipeline
+ import numpy as np
+ from tqdm import tqdm
+
+ # Paths
+ REAL_DIR = Path("data/real")
+ FAKE_DIR = Path("data/ai_generated_v2")
+
+ # Models to test
+ MODELS = [
+     # Current baseline
+     "umm-maybe/AI-image-detector",
+     # SDXL-specific detector (Swin Transformer)
+     "Organika/sdxl-detector",
+     # Fine-tuned on 2024 generators including Flux
+     "Smogy/SMOGY-Ai-images-detector",
+ ]
+
+ def load_images(directory, limit=None):
+     """Load images from a directory."""
+     images = []
+     extensions = {'.jpg', '.jpeg', '.png', '.webp'}
+     files = sorted([f for f in directory.iterdir() if f.suffix.lower() in extensions])
+     if limit:
+         files = files[:limit]
+     for f in files:
+         try:
+             img = Image.open(f).convert("RGB")
+             images.append((f.name, img))
+         except Exception as e:
+             print(f"Error loading {f}: {e}")
+     return images
+
+ def test_detector(model_name, real_images, fake_images):
+     """Test a single detector model."""
+     print(f"\n{'='*60}")
+     print(f"Testing: {model_name}")
+     print('='*60)
+
+     try:
+         # Load model
+         start = time.time()
+         classifier = pipeline(
+             "image-classification",
+             model=model_name,
+             device=0 if torch.cuda.is_available() else -1
+         )
+         load_time = time.time() - start
+         print(f"Model loaded in {load_time:.1f}s")
+
+         # Different models use different label names, so each prediction is
+         # mapped to a single AI-likelihood score below.
+         results = {"real": [], "fake": [], "model": model_name}
+
+         # Test real images
+         print(f"\nTesting {len(real_images)} real images...")
+         correct_real = 0
+         for name, img in tqdm(real_images):
+             try:
+                 pred = classifier(img)
+                 # Derive an AI-likelihood score from whichever label the model exposes
+                 score = 0.0
+                 for p in pred:
+                     label = p['label'].lower()
+                     if 'artificial' in label or 'ai' in label or 'fake' in label:
+                         score = p['score']
+                         break
+                     elif 'human' in label or 'real' in label:
+                         score = 1.0 - p['score']
+                         break
+
+                 is_correct = score < 0.5  # Real images should have a low AI score
+                 correct_real += is_correct
+                 results["real"].append({
+                     "name": name,
+                     "ai_score": score,
+                     "correct": is_correct,
+                     "raw": pred
+                 })
+             except Exception as e:
+                 print(f"Error on {name}: {e}")
+                 results["real"].append({"name": name, "error": str(e)})
+
+         # Test fake images
+         print(f"Testing {len(fake_images)} fake (AI-generated) images...")
+         correct_fake = 0
+         for name, img in tqdm(fake_images):
+             try:
+                 pred = classifier(img)
+                 # Find the "AI" or "artificial" score
+                 score = 0.0
+                 for p in pred:
+                     label = p['label'].lower()
+                     if 'artificial' in label or 'ai' in label or 'fake' in label:
+                         score = p['score']
+                         break
+                     elif 'human' in label or 'real' in label:
+                         score = 1.0 - p['score']
+                         break
+
+                 is_correct = score >= 0.5  # Fake images should have a high AI score
+                 correct_fake += is_correct
+                 results["fake"].append({
+                     "name": name,
+                     "ai_score": score,
+                     "correct": is_correct,
+                     "raw": pred
+                 })
+             except Exception as e:
+                 print(f"Error on {name}: {e}")
+                 results["fake"].append({"name": name, "error": str(e)})
+
+         # Calculate metrics
+         total_real = len([r for r in results["real"] if "error" not in r])
+         total_fake = len([r for r in results["fake"] if "error" not in r])
+
+         real_acc = correct_real / total_real * 100 if total_real > 0 else 0
+         fake_acc = correct_fake / total_fake * 100 if total_fake > 0 else 0
+         overall_acc = (correct_real + correct_fake) / (total_real + total_fake) * 100 if (total_real + total_fake) > 0 else 0
+
+         print(f"\n📊 Results for {model_name}:")
+         print(f"  Real images: {correct_real}/{total_real} ({real_acc:.1f}%)")
+         print(f"  Fake images: {correct_fake}/{total_fake} ({fake_acc:.1f}%)")
+         print(f"  Overall: {overall_acc:.1f}%")
+
+         results["metrics"] = {
+             "real_accuracy": real_acc,
+             "fake_accuracy": fake_acc,
+             "overall_accuracy": overall_acc,
+             "correct_real": correct_real,
+             "correct_fake": correct_fake,
+             "total_real": total_real,
+             "total_fake": total_fake
+         }
+
+         # Clean up
+         del classifier
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         return results
+
+     except Exception as e:
+         print(f"❌ Failed to load/run model: {e}")
+         import traceback
+         traceback.print_exc()
+         return {"model": model_name, "error": str(e)}
+
+ def test_clip_zero_shot():
+     """Test CLIP ViT-L with zero-shot classification."""
+     from transformers import CLIPProcessor, CLIPModel
+
+     print(f"\n{'='*60}")
+     print("Testing: CLIP ViT-L Zero-Shot (laion/CLIP-ViT-L-14-laion2B-s32B-b82K)")
+     print('='*60)
+
+     try:
+         model = CLIPModel.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
+         processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-L-14-laion2B-s32B-b82K")
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model = model.to(device)
+         model.eval()
+
+         # Zero-shot labels
+         labels = [
+             "a real photograph",
+             "an AI-generated image",
+             "a computer-generated image",
+             "a synthetic image created by artificial intelligence"
+         ]
+
+         real_images = load_images(REAL_DIR)
+         fake_images = load_images(FAKE_DIR)
+
+         results = {"real": [], "fake": [], "model": "CLIP-ViT-L Zero-Shot"}
+         correct_real = 0
+         correct_fake = 0
+
+         print(f"\nTesting {len(real_images)} real images...")
+         for name, img in tqdm(real_images):
+             inputs = processor(text=labels, images=img, return_tensors="pt", padding=True).to(device)
+             with torch.no_grad():
+                 outputs = model(**inputs)
+                 logits = outputs.logits_per_image
+                 probs = logits.softmax(dim=1).cpu().numpy()[0]
+
+             # The real-photo prompt is label 0; the AI prompts are labels 1, 2, 3
+             real_prob = probs[0]
+             ai_prob = max(probs[1], probs[2], probs[3])
+             is_correct = real_prob > ai_prob
+             correct_real += is_correct
+             results["real"].append({"name": name, "real_prob": float(real_prob), "ai_prob": float(ai_prob), "correct": is_correct})
+
+         print(f"Testing {len(fake_images)} fake images...")
+         for name, img in tqdm(fake_images):
+             inputs = processor(text=labels, images=img, return_tensors="pt", padding=True).to(device)
+             with torch.no_grad():
+                 outputs = model(**inputs)
+                 logits = outputs.logits_per_image
+                 probs = logits.softmax(dim=1).cpu().numpy()[0]
+
+             real_prob = probs[0]
+             ai_prob = max(probs[1], probs[2], probs[3])
+             is_correct = ai_prob > real_prob
+             correct_fake += is_correct
+             results["fake"].append({"name": name, "real_prob": float(real_prob), "ai_prob": float(ai_prob), "correct": is_correct})
+
+         total_real = len(real_images)
+         total_fake = len(fake_images)
+         real_acc = correct_real / total_real * 100 if total_real > 0 else 0
+         fake_acc = correct_fake / total_fake * 100 if total_fake > 0 else 0
+         overall_acc = (correct_real + correct_fake) / (total_real + total_fake) * 100
+
+         print(f"\n📊 Results for CLIP ViT-L Zero-Shot:")
+         print(f"  Real images: {correct_real}/{total_real} ({real_acc:.1f}%)")
+         print(f"  Fake images: {correct_fake}/{total_fake} ({fake_acc:.1f}%)")
+         print(f"  Overall: {overall_acc:.1f}%")
+
+         results["metrics"] = {
+             "real_accuracy": real_acc,
+             "fake_accuracy": fake_acc,
+             "overall_accuracy": overall_acc
+         }
+
+         del model
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         return results
+
+     except Exception as e:
+         print(f"❌ Failed: {e}")
+         import traceback
+         traceback.print_exc()
+         return {"model": "CLIP-ViT-L Zero-Shot", "error": str(e)}
+
+ def main():
+     print("🔍 Pre-trained AI Image Detector Evaluation")
+     print(f"Real images: {REAL_DIR}")
+     print(f"Fake images: {FAKE_DIR}")
+     print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
+
+     # Load all images once
+     real_images = load_images(REAL_DIR)
+     fake_images = load_images(FAKE_DIR)
+     print(f"\nLoaded {len(real_images)} real, {len(fake_images)} fake images")
+
+     all_results = []
+
+     # Test each model
+     for model_name in MODELS:
+         result = test_detector(model_name, real_images, fake_images)
+         all_results.append(result)
+
+     # Test CLIP zero-shot (loads its own copies of the images)
+     clip_result = test_clip_zero_shot()
+     all_results.append(clip_result)
+
+     # Summary
+     print("\n" + "="*60)
+     print("📊 SUMMARY - All Models")
+     print("="*60)
+     print(f"{'Model':<45} {'Real%':>8} {'Fake%':>8} {'Overall':>8}")
+     print("-"*70)
+
+     for r in all_results:
+         if "error" in r:
+             print(f"{r['model']:<45} {'ERROR':>8}")
+         else:
+             m = r.get("metrics", {})
+             print(f"{r['model']:<45} {m.get('real_accuracy', 0):>7.1f}% {m.get('fake_accuracy', 0):>7.1f}% {m.get('overall_accuracy', 0):>7.1f}%")
+
+     # Save results
+     with open("detector_comparison.json", "w") as f:
+         # Convert non-serializable items
+         def serialize(obj):
+             if isinstance(obj, (np.floating, np.integer)):
+                 return float(obj)
+             if isinstance(obj, np.ndarray):
+                 return obj.tolist()
+             return str(obj)
+         json.dump(all_results, f, indent=2, default=serialize)
+
+     print("\nResults saved to detector_comparison.json")
+
+     # Find the best model
+     best = max([r for r in all_results if "error" not in r],
+                key=lambda x: x.get("metrics", {}).get("overall_accuracy", 0))
+     print(f"\n🏆 Best model: {best['model']} ({best.get('metrics', {}).get('overall_accuracy', 0):.1f}% accuracy)")
+
+ if __name__ == "__main__":
+     main()
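The label-matching block above is duplicated between the real and fake loops. A sketch of a shared helper, assuming the same pipeline() output format; the helper name is a suggestion, not part of the script above:

# Possible refactor: one helper for the duplicated label-matching logic.
# ai_score_from_prediction is a hypothetical name, not used by the script above.
def ai_score_from_prediction(pred):
    """Map pipeline() output to a single AI-likelihood in [0, 1]."""
    for p in pred:
        label = p['label'].lower()
        # Substring matching is crude ('ai' also matches e.g. 'paint'), but it
        # covers the label sets of the three models tested above.
        if 'artificial' in label or 'ai' in label or 'fake' in label:
            return p['score']
        if 'human' in label or 'real' in label:
            return 1.0 - p['score']
    return 0.0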