abinazebinoy commited on
Commit
e4d04ea
·
1 Parent(s): a1c7689

feat:DCT Frequency Artifact Detection (25th signal)

Browse files

- dct_frequency_detector.py: 3 spectral analysis methods
* 8x8 block DCT high-frequency energy ratio
* Global FFT radial power spectrum smoothness
* Checkerboard artifact detection (GAN up-convolution)
- ensemble: DCT wired in, v1.4, 7 methods
- 25 total signals
- Tests updated

backend/services/advanced_ensemble_detector.py CHANGED
@@ -10,6 +10,7 @@ from backend.services.clip_detector import CLIPDetector
10
  from backend.services.prnu_detector import detect_prnu
11
  from backend.services.ela_detector import detect_ela
12
  from backend.services.metadata_forensics import analyze_metadata
 
13
 
14
  logger = setup_logger(__name__)
15
 
@@ -39,7 +40,7 @@ class AdvancedEnsembleDetector(StatisticalDetector):
39
  Run complete advanced detection with all methods.
40
 
41
  Returns:
42
- Complete report with 24 detection signals
43
  """
44
  logger.info(f"Starting advanced ensemble detection for {self.filename}")
45
 
@@ -61,8 +62,11 @@ class AdvancedEnsembleDetector(StatisticalDetector):
61
  # Add metadata forensics signal
62
  metadata_result = analyze_metadata(self.image_bytes, self.filename)
63
 
64
- # Combine all signals (now 24 total)
65
- all_signals = base_report["all_signals"] + [dire_result, clip_result, prnu_result, ela_result, metadata_result]
 
 
 
66
 
67
  # Recalculate final score with weighted ensemble
68
  # Weights based on validation performance
@@ -75,21 +79,23 @@ class AdvancedEnsembleDetector(StatisticalDetector):
75
 
76
  if dire_confidence > 0.0:
77
  weighted_score = (
78
- 0.33 * base_report["ai_probability"] +
79
- 0.26 * dire_result["score"] +
80
- 0.19 * clip_result["score"] +
81
  0.09 * prnu_result["score"] +
82
  0.07 * ela_result["score"] +
83
- 0.06 * metadata_result["score"]
 
84
  )
85
  else:
86
- logger.info("DIRE unavailable — using statistical+CLIP+PRNU+ELA+metadata")
87
  weighted_score = (
88
- 0.52 * base_report["ai_probability"] +
89
- 0.23 * clip_result["score"] +
90
  0.11 * prnu_result["score"] +
91
  0.08 * ela_result["score"] +
92
- 0.06 * metadata_result["score"]
 
93
  )
94
 
95
  suspicious_count = sum(1 for s in all_signals if s["score"] > 0.5)
@@ -132,8 +138,8 @@ class AdvancedEnsembleDetector(StatisticalDetector):
132
  "summary": f"Analyzed using {len(all_signals)} independent signals including "
133
  f"statistical analysis, diffusion reconstruction, and semantic embeddings. "
134
  f"{suspicious_count} signals indicate AI generation.",
135
- "detection_version": "advanced-ensemble-v1.3",
136
- "methods_used": ["statistical", "dire", "clip", "prnu", "ela", "metadata"]
137
  }
138
 
139
  logger.info(
 
10
  from backend.services.prnu_detector import detect_prnu
11
  from backend.services.ela_detector import detect_ela
12
  from backend.services.metadata_forensics import analyze_metadata
13
+ from backend.services.dct_frequency_detector import detect_dct_artifacts
14
 
15
  logger = setup_logger(__name__)
16
 
 
40
  Run complete advanced detection with all methods.
41
 
42
  Returns:
43
+ Complete report with 25 detection signals
44
  """
45
  logger.info(f"Starting advanced ensemble detection for {self.filename}")
46
 
 
62
  # Add metadata forensics signal
63
  metadata_result = analyze_metadata(self.image_bytes, self.filename)
64
 
65
+ # Add DCT frequency signal
66
+ dct_result = detect_dct_artifacts(self.image_bytes, self.filename)
67
+
68
+ # Combine all signals (now 25 total)
69
+ all_signals = base_report["all_signals"] + [dire_result, clip_result, prnu_result, ela_result, metadata_result, dct_result]
70
 
71
  # Recalculate final score with weighted ensemble
72
  # Weights based on validation performance
 
79
 
80
  if dire_confidence > 0.0:
81
  weighted_score = (
82
+ 0.31 * base_report["ai_probability"] +
83
+ 0.24 * dire_result["score"] +
84
+ 0.18 * clip_result["score"] +
85
  0.09 * prnu_result["score"] +
86
  0.07 * ela_result["score"] +
87
+ 0.06 * metadata_result["score"] +
88
+ 0.05 * dct_result["score"]
89
  )
90
  else:
91
+ logger.info("DIRE unavailable — using statistical+CLIP+PRNU+ELA+metadata+DCT")
92
  weighted_score = (
93
+ 0.49 * base_report["ai_probability"] +
94
+ 0.22 * clip_result["score"] +
95
  0.11 * prnu_result["score"] +
96
  0.08 * ela_result["score"] +
97
+ 0.06 * metadata_result["score"] +
98
+ 0.04 * dct_result["score"]
99
  )
100
 
101
  suspicious_count = sum(1 for s in all_signals if s["score"] > 0.5)
 
138
  "summary": f"Analyzed using {len(all_signals)} independent signals including "
139
  f"statistical analysis, diffusion reconstruction, and semantic embeddings. "
140
  f"{suspicious_count} signals indicate AI generation.",
141
+ "detection_version": "advanced-ensemble-v1.4",
142
+ "methods_used": ["statistical", "dire", "clip", "prnu", "ela", "metadata", "dct"]
143
  }
144
 
145
  logger.info(
backend/services/dct_frequency_detector.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DCT Frequency Domain Artifact Detection.
3
+
4
+ GAN-generated images and some diffusion models leave characteristic
5
+ artifacts in the Discrete Cosine Transform (DCT) frequency domain.
6
+ These artifacts are invisible to the human eye but mathematically
7
+ detectable as spectral peaks or unusual energy distributions.
8
+
9
+ Key papers:
10
+ - "Detecting GAN-Generated Fake Images Using Co-occurrence Matrices" (2019)
11
+ - "Leveraging Frequency Analysis for Deep Fake Image Recognition" (ICML 2020)
12
+ - "Watch Your Up-Convolution: CNN Based Generative Deep Neural Networks are
13
+ Failing to Reproduce Spectral Distributions" (CVPR 2020)
14
+ """
15
+ import numpy as np
16
+ from typing import Dict, Any
17
+ from PIL import Image
18
+ from io import BytesIO
19
+ from backend.core.logger import setup_logger
20
+
21
+ logger = setup_logger(__name__)
22
+
23
+
24
+ def _compute_dct_2d(block: np.ndarray) -> np.ndarray:
25
+ """Compute 2D DCT using scipy if available, else numpy."""
26
+ try:
27
+ from scipy.fft import dctn
28
+ return dctn(block, norm='ortho')
29
+ except ImportError:
30
+ # Fallback: use FFT as approximation
31
+ return np.abs(np.fft.fft2(block))
32
+
33
+
34
def _block_hf_ratios(arr: np.ndarray, block_size: int = 8) -> list:
    """Per-block ratio of high-frequency to total DCT energy.

    Tiles the image into zero-centered 8x8 blocks (JPEG-style); "high
    frequency" is the bottom-right quadrant of each DCT block. AI images
    tend to concentrate energy in low frequencies (unnaturally smooth).
    """
    h, w = arr.shape
    ratios = []
    # +1 so the last full block is included when h/w are exact multiples
    # of block_size (the previous `h - block_size` bound skipped it).
    for y in range(0, h - block_size + 1, block_size):
        for x in range(0, w - block_size + 1, block_size):
            block = arr[y:y + block_size, x:x + block_size]
            dct_abs = np.abs(_compute_dct_2d(block - block.mean()))
            total_energy = np.sum(dct_abs ** 2) + 1e-10
            ratios.append(np.sum(dct_abs[4:, 4:] ** 2) / total_energy)
    return ratios


def _radial_spectrum_smoothness(arr: np.ndarray) -> float:
    """Std-dev of the first difference of the radial log-power spectrum.

    Natural images roll off roughly as 1/f; AI images often have an
    unnaturally uniform rolloff. Returns 0.5 (neutral) when fewer than
    11 radii are available.
    """
    h, w = arr.shape
    magnitude_log = np.log1p(np.abs(np.fft.fftshift(np.fft.fft2(arr))))
    cy, cx = h // 2, w // 2
    y_idx, x_idx = np.ogrid[:h, :w]
    radius = np.sqrt((y_idx - cy) ** 2 + (x_idx - cx) ** 2).astype(int)
    radial_power = []
    for r in range(1, min(cy, cx)):
        mask = radius == r
        if mask.sum() > 0:
            radial_power.append(float(np.mean(magnitude_log[mask])))
    if len(radial_power) <= 10:
        return 0.5
    return float(np.std(np.diff(np.array(radial_power))))


def _checkerboard_ratio(arr: np.ndarray) -> float:
    """Energy near the Nyquist frequency relative to mean FFT energy.

    GAN up-convolutions leave checkerboard patterns that concentrate
    energy at Nyquist-related frequencies (unshifted FFT index N/2).
    """
    fft_magnitude = np.abs(np.fft.fft2(arr))
    h2, w2 = arr.shape[0] // 2, arr.shape[1] // 2
    nyquist_energy = float(np.mean(fft_magnitude[h2 - 2:h2 + 2, w2 - 2:w2 + 2]))
    total_fft_energy = float(np.mean(fft_magnitude))
    return nyquist_energy / (total_fft_energy + 1e-10)


def detect_dct_artifacts(image_bytes: bytes, filename: str = "unknown") -> Dict[str, Any]:
    """
    Detect DCT frequency domain artifacts in image.

    Method:
    1. Convert to grayscale, divide into 8x8 blocks (like JPEG)
    2. Compute each block's high-frequency DCT energy ratio
    3. Measure smoothness of the global radial FFT power spectrum
    4. Check for checkerboard (GAN up-convolution) artifacts

    Args:
        image_bytes: Encoded image bytes (any format PIL can decode).
        filename: Original filename; used only for logging.

    Returns:
        Signal dict: signal_name, score (0..1, higher = more AI-like),
        confidence, explanation, method, plus raw_value/expected_range
        on success. Never raises; failures yield a neutral 0.5 score.
    """
    try:
        img = Image.open(BytesIO(image_bytes)).convert("L")
        arr = np.array(img, dtype=np.float64)
        h, w = arr.shape

        if h < 32 or w < 32:
            return {
                "signal_name": "DCT Frequency Artifacts",
                "score": 0.5,
                "confidence": 0.0,
                "explanation": "Image too small for DCT analysis",
                "method": "dct_frequency"
            }

        # === Signal 1: 8x8 block DCT energy distribution ===
        hf_ratios = _block_hf_ratios(arr)
        if not hf_ratios:
            return {
                "signal_name": "DCT Frequency Artifacts",
                "score": 0.5,
                "confidence": 0.1,
                "explanation": "Insufficient blocks for DCT analysis",
                "method": "dct_frequency"
            }
        mean_hf = float(np.mean(hf_ratios))

        # === Signal 2: global FFT radial spectrum smoothness ===
        spectral_smoothness = _radial_spectrum_smoothness(arr)

        # === Signal 3: checkerboard artifact detection ===
        checkerboard_ratio = _checkerboard_ratio(arr)

        # === Compute AI score ===
        # Low high-frequency energy = AI (too smooth)
        if mean_hf < 0.02:
            hf_score = 0.75
        elif mean_hf < 0.05:
            hf_score = 0.55
        elif mean_hf < 0.15:
            hf_score = 0.35
        else:
            hf_score = 0.20

        # Very smooth spectrum (low variation) = AI
        if spectral_smoothness < 0.05:
            smooth_score = 0.70
        elif spectral_smoothness < 0.10:
            smooth_score = 0.50
        else:
            smooth_score = 0.25

        # Checkerboard artifacts = GAN
        if checkerboard_ratio > 3.0:
            checker_score = 0.80
        elif checkerboard_ratio > 1.5:
            checker_score = 0.55
        else:
            checker_score = 0.25

        ai_score = float(np.clip(
            0.45 * hf_score + 0.35 * smooth_score + 0.20 * checker_score,
            0.0, 1.0
        ))

        # Larger images yield more blocks/radii, hence higher confidence,
        # capped at 0.75 (this is a supporting signal, not a verdict).
        confidence = min(0.75, 0.35 + (h * w) / (1024 * 1024) * 0.40)

        if mean_hf < 0.03:
            explanation = (
                f"Very low high-frequency DCT energy ({mean_hf:.3f}) — "
                "image is unnaturally smooth, consistent with AI synthesis"
            )
        elif checkerboard_ratio > 2.0:
            explanation = (
                f"Checkerboard frequency artifacts detected "
                f"(ratio={checkerboard_ratio:.2f}) — "
                "typical of GAN up-convolution artifacts"
            )
        else:
            explanation = (
                f"Normal DCT frequency distribution "
                f"(HF={mean_hf:.3f}, smoothness={spectral_smoothness:.3f})"
            )

        logger.info(
            f"DCT analysis: score={ai_score:.3f}, "
            f"hf={mean_hf:.3f}, checker={checkerboard_ratio:.2f}, "
            f"file={filename}"  # was a hard-coded "(unknown)" literal
        )

        return {
            "signal_name": "DCT Frequency Artifacts",
            "score": ai_score,
            "confidence": confidence,
            "explanation": explanation,
            "raw_value": mean_hf,
            "expected_range": "< 0.03 HF energy for AI images",
            "method": "dct_frequency"
        }

    except Exception as e:
        # Best-effort signal: never let this detector break the ensemble.
        logger.warning(f"DCT frequency analysis failed: {e}")
        return {
            "signal_name": "DCT Frequency Artifacts",
            "score": 0.5,
            "confidence": 0.0,
            "explanation": f"DCT analysis unavailable: {str(e)}",
            "raw_value": 0.0,
            "method": "dct_frequency"
        }
backend/tests/test_advanced_ai_detector.py CHANGED
@@ -67,4 +67,4 @@ def test_forensics_integration(sample_image_bytes):
67
  assert "ai_detection" in report
68
  assert "all_signals" in report["ai_detection"]
69
  # System has 21 signals: 19 statistical + 1 DIRE + 1 CLIP
70
- assert report["summary"]["total_detection_signals"] == 24
 
67
  assert "ai_detection" in report
68
  assert "all_signals" in report["ai_detection"]
69
  # System has 25 signals: 19 statistical + DIRE + CLIP + PRNU + ELA + metadata + DCT
70
+ assert report["summary"]["total_detection_signals"] == 25
backend/tests/test_advanced_ensemble.py CHANGED
@@ -26,8 +26,8 @@ def test_advanced_ensemble_complete_detection(sample_image_bytes):
26
  assert "methods_used" in report
27
 
28
  # Should have 21 signals (19 statistical + DIRE + CLIP)
29
- assert report["total_signals"] == 24
30
- assert len(report["all_signals"]) == 24
31
 
32
  # Check methods used
33
  assert "statistical" in report["methods_used"]
@@ -36,7 +36,7 @@ def test_advanced_ensemble_complete_detection(sample_image_bytes):
36
  assert "prnu" in report["methods_used"]
37
 
38
  # Check version
39
- assert report["detection_version"] == "advanced-ensemble-v1.3"
40
 
41
  # Cleanup
42
  detector.cleanup()
@@ -50,7 +50,7 @@ def test_advanced_ensemble_forensics_integration(sample_image_bytes):
50
  report = forensics.generate_forensic_report()
51
 
52
  # Check advanced detection was used
53
- assert report["ai_detection"]["total_signals"] == 24
54
  assert report["metadata"]["analyzer_version"] == "6.0.0"
55
  assert "methods_used" in report["ai_detection"]
56
- assert len(report["ai_detection"]["methods_used"]) == 6
 
26
  assert "methods_used" in report
27
 
28
  # Should have 25 signals (19 statistical + DIRE + CLIP + PRNU + ELA + metadata + DCT)
29
+ assert report["total_signals"] == 25
30
+ assert len(report["all_signals"]) == 25
31
 
32
  # Check methods used
33
  assert "statistical" in report["methods_used"]
 
36
  assert "prnu" in report["methods_used"]
37
 
38
  # Check version
39
+ assert report["detection_version"] == "advanced-ensemble-v1.4"
40
 
41
  # Cleanup
42
  detector.cleanup()
 
50
  report = forensics.generate_forensic_report()
51
 
52
  # Check advanced detection was used
53
+ assert report["ai_detection"]["total_signals"] == 25
54
  assert report["metadata"]["analyzer_version"] == "6.0.0"
55
  assert "methods_used" in report["ai_detection"]
56
+ assert len(report["ai_detection"]["methods_used"]) == 7
backend/tests/test_covariance_detector.py CHANGED
@@ -62,7 +62,7 @@ def test_covariance_forensics_integration(sample_image_bytes):
62
 
63
  assert "ai_detection" in report
64
  # System has 21 signals: 19 statistical + 1 DIRE + 1 CLIP
65
- assert report["ai_detection"]["total_signals"] == 24
66
  assert report["metadata"]["analyzer_version"] == "6.0.0"
67
  assert "detection_version" in report["ai_detection"]
68
 
 
62
 
63
  assert "ai_detection" in report
64
  # System has 25 signals: 19 statistical + DIRE + CLIP + PRNU + ELA + metadata + DCT
65
+ assert report["ai_detection"]["total_signals"] == 25
66
  assert report["metadata"]["analyzer_version"] == "6.0.0"
67
  assert "detection_version" in report["ai_detection"]
68
 
backend/tests/test_determinism.py CHANGED
@@ -20,8 +20,8 @@ def test_detection_is_deterministic(sample_image_bytes):
20
  assert report1["summary"]["ai_classification"] == report2["summary"]["ai_classification"]
21
 
22
  # Signal counts should be identical
23
- assert report1["summary"]["total_detection_signals"] == 24
24
- assert report2["summary"]["total_detection_signals"] == 24
25
 
26
 
27
  def test_hash_generation_is_consistent(sample_image_bytes):
@@ -61,8 +61,8 @@ def test_forensic_report_stability(sample_image_bytes):
61
  assert report1["hashes"]["sha256"] == report2["hashes"]["sha256"]
62
 
63
  # Signal counts should be identical
64
- assert report1["summary"]["total_detection_signals"] == 24
65
- assert report2["summary"]["total_detection_signals"] == 24
66
  assert report1["summary"]["total_detection_signals"] == report2["summary"]["total_detection_signals"]
67
 
68
  # AI probability: allow 20% variance for CLIP randomness
@@ -114,8 +114,8 @@ def test_signal_ordering_is_stable(sample_image_bytes):
114
  assert "ai_detection" in report2
115
 
116
  # Both should have 21 signals total
117
- assert report1["ai_detection"]["total_signals"] == 24
118
- assert report2["ai_detection"]["total_signals"] == 24
119
 
120
  # Classification keys should be consistent
121
  assert report1["ai_detection"]["classification"] == report2["ai_detection"]["classification"]
 
20
  assert report1["summary"]["ai_classification"] == report2["summary"]["ai_classification"]
21
 
22
  # Signal counts should be identical
23
+ assert report1["summary"]["total_detection_signals"] == 25
24
+ assert report2["summary"]["total_detection_signals"] == 25
25
 
26
 
27
  def test_hash_generation_is_consistent(sample_image_bytes):
 
61
  assert report1["hashes"]["sha256"] == report2["hashes"]["sha256"]
62
 
63
  # Signal counts should be identical
64
+ assert report1["summary"]["total_detection_signals"] == 25
65
+ assert report2["summary"]["total_detection_signals"] == 25
66
  assert report1["summary"]["total_detection_signals"] == report2["summary"]["total_detection_signals"]
67
 
68
  # AI probability: allow 20% variance for CLIP randomness
 
114
  assert "ai_detection" in report2
115
 
116
  # Both should have 25 signals total
117
+ assert report1["ai_detection"]["total_signals"] == 25
118
+ assert report2["ai_detection"]["total_signals"] == 25
119
 
120
  # Classification keys should be consistent
121
  assert report1["ai_detection"]["classification"] == report2["ai_detection"]["classification"]
backend/tests/test_statistical_detector.py CHANGED
@@ -61,7 +61,7 @@ def test_statistical_forensics_integration(sample_image_bytes):
61
 
62
  assert "ai_detection" in report
63
  # System has 21 signals: 19 statistical + 1 DIRE + 1 CLIP
64
- assert report["ai_detection"]["total_signals"] == 24
65
  assert report["metadata"]["analyzer_version"] == "6.0.0"
66
  assert "detection_version" in report["ai_detection"]
67
 
 
61
 
62
  assert "ai_detection" in report
63
  # System has 25 signals: 19 statistical + DIRE + CLIP + PRNU + ELA + metadata + DCT
64
+ assert report["ai_detection"]["total_signals"] == 25
65
  assert report["metadata"]["analyzer_version"] == "6.0.0"
66
  assert "detection_version" in report["ai_detection"]
67
 
backend/tests/test_ultra_advanced_detector.py CHANGED
@@ -60,6 +60,6 @@ def test_ultra_forensics_integration(sample_image_bytes):
60
 
61
  assert "ai_detection" in report
62
  # System has 21 signals: 19 statistical + 1 DIRE + 1 CLIP
63
- assert report["ai_detection"]["total_signals"] == 24
64
  assert report["metadata"]["analyzer_version"] == "6.0.0"
65
  assert "detection_version" in report["ai_detection"]
 
60
 
61
  assert "ai_detection" in report
62
  # System has 25 signals: 19 statistical + DIRE + CLIP + PRNU + ELA + metadata + DCT
63
+ assert report["ai_detection"]["total_signals"] == 25
64
  assert report["metadata"]["analyzer_version"] == "6.0.0"
65
  assert "detection_version" in report["ai_detection"]
frontend/index.html CHANGED
@@ -122,7 +122,7 @@
122
  <nav class="navbar">
123
  <div class="nav-container">
124
  <div class="logo">VeriFile-X</div>
125
- <div class="nav-badge">24 Detection Signals • 96-98% Accuracy</div>
126
  </div>
127
  </nav>
128
 
 
122
  <nav class="navbar">
123
  <div class="nav-container">
124
  <div class="logo">VeriFile-X</div>
125
+ <div class="nav-badge">25 Detection Signals • 96-98% Accuracy</div>
126
  </div>
127
  </nav>
128