satyaki-mitra committed on
Commit
f142520
·
1 Parent(s): 676e2fe

EXIF Analysis and Watermark Analysis added

Browse files
app.py CHANGED
@@ -8,7 +8,6 @@ from typing import List
8
  from typing import Dict
9
  from pathlib import Path
10
  from fastapi import File
11
- from typing import Optional
12
  from fastapi import Request
13
  from fastapi import FastAPI
14
  from fastapi import UploadFile
@@ -24,7 +23,6 @@ from utils.validators import ImageValidator
24
  from fastapi.staticfiles import StaticFiles
25
  from utils.helpers import generate_unique_id
26
  from reporter.csv_reporter import CSVReporter
27
- from reporter.pdf_reporter import PDFReporter
28
  from config.schemas import BatchAnalysisResult
29
  from reporter.json_reporter import JSONReporter
30
  from utils.image_processor import ImageProcessor
@@ -63,12 +61,10 @@ image_validator = ImageValidator()
63
  image_processor = ImageProcessor()
64
 
65
  threshold_manager = ThresholdManager()
66
- threshold_manager = threshold_manager
67
  batch_processor = BatchProcessor(threshold_manager = threshold_manager)
68
 
69
  json_reporter = JSONReporter()
70
  csv_reporter = CSVReporter()
71
- pdf_reporter = PDFReporter()
72
 
73
  UPLOAD_DIR = settings.UPLOAD_DIR
74
  CACHE_DIR = settings.CACHE_DIR
@@ -157,21 +153,21 @@ async def analyze_single_image(file: UploadFile = File(...)):
157
  image_id = generate_unique_id()
158
  image_path = UPLOAD_DIR / f"{image_id}_{file.filename}"
159
 
160
- image_validator.validate_image(file_path = image_path,
161
- filename = file.filename,
162
- file_size = file.size,
163
- )
164
-
165
  try:
166
  with open(image_path, "wb") as f:
167
  shutil.copyfileobj(file.file, f)
168
 
 
 
 
 
 
169
  image = image_processor.load_image(image_path)
170
 
171
  # image is a NumPy array → shape = (H, W, C) or (H, W)
172
  height, width = image.shape[:2]
173
 
174
- result: AnalysisResult = batch_processor.process_single(image = image_path,
175
  filename = file.filename,
176
  image_size = (width, height),
177
  )
@@ -210,15 +206,15 @@ async def analyze_batch(files: List[UploadFile] = File(...)):
210
 
211
  with open(path, "wb") as f:
212
  shutil.copyfileobj(file.file, f)
213
-
214
- image = image_processor.load_image(path)
215
- height, width = image.shape[:2]
216
 
217
  image_validator.validate_image(file_path = path,
218
  filename = file.filename,
219
  file_size = file.size,
220
  )
221
 
 
 
 
222
  image_entries.append({"path" : path,
223
  "filename" : file.filename,
224
  "size" : (width, height),
@@ -297,6 +293,8 @@ def export_csv(batch_id: str):
297
 
298
  # Clean up the file after sending
299
  path.unlink(missing_ok = True)
 
 
300
 
301
  return Response(content = content,
302
  media_type = "text/csv",
@@ -306,33 +304,6 @@ def export_csv(batch_id: str):
306
  )
307
 
308
 
309
- @app.api_route("/report/pdf/{batch_id}", methods = ["GET", "POST"])
310
- def export_pdf(batch_id: str):
311
- session = SESSION_STORE.get(batch_id)
312
-
313
- if (not session or ("result" not in session)):
314
- raise HTTPException(status_code = 404,
315
- detail = "Batch result not found",
316
- )
317
-
318
- path = pdf_reporter.export_batch(session["result"])
319
-
320
- # Read the file and send it as a download
321
- with open(path, "rb") as f:
322
- content = f.read()
323
-
324
- # Clean up the file after sending
325
- path.unlink(missing_ok = True)
326
-
327
- return Response(content = content,
328
- media_type = "application/pdf",
329
- headers = {"Content-Disposition" : f"attachment; filename=ai_screener_report_{batch_id}.pdf",
330
- "Content-Type" : "application/pdf"
331
- }
332
- )
333
-
334
-
335
-
336
  # ==================== MAIN ====================
337
  if __name__ == "__main__":
338
  # Explicit startup log (forces log file creation)
 
8
  from typing import Dict
9
  from pathlib import Path
10
  from fastapi import File
 
11
  from fastapi import Request
12
  from fastapi import FastAPI
13
  from fastapi import UploadFile
 
23
  from fastapi.staticfiles import StaticFiles
24
  from utils.helpers import generate_unique_id
25
  from reporter.csv_reporter import CSVReporter
 
26
  from config.schemas import BatchAnalysisResult
27
  from reporter.json_reporter import JSONReporter
28
  from utils.image_processor import ImageProcessor
 
61
  image_processor = ImageProcessor()
62
 
63
  threshold_manager = ThresholdManager()
 
64
  batch_processor = BatchProcessor(threshold_manager = threshold_manager)
65
 
66
  json_reporter = JSONReporter()
67
  csv_reporter = CSVReporter()
 
68
 
69
  UPLOAD_DIR = settings.UPLOAD_DIR
70
  CACHE_DIR = settings.CACHE_DIR
 
153
  image_id = generate_unique_id()
154
  image_path = UPLOAD_DIR / f"{image_id}_{file.filename}"
155
 
 
 
 
 
 
156
  try:
157
  with open(image_path, "wb") as f:
158
  shutil.copyfileobj(file.file, f)
159
 
160
+ image_validator.validate_image(file_path = image_path,
161
+ filename = file.filename,
162
+ file_size = file.size,
163
+ )
164
+
165
  image = image_processor.load_image(image_path)
166
 
167
  # image is a NumPy array → shape = (H, W, C) or (H, W)
168
  height, width = image.shape[:2]
169
 
170
+ result: AnalysisResult = batch_processor.process_single(image_path = image_path,
171
  filename = file.filename,
172
  image_size = (width, height),
173
  )
 
206
 
207
  with open(path, "wb") as f:
208
  shutil.copyfileobj(file.file, f)
 
 
 
209
 
210
  image_validator.validate_image(file_path = path,
211
  filename = file.filename,
212
  file_size = file.size,
213
  )
214
 
215
+ image = image_processor.load_image(path)
216
+ height, width = image.shape[:2]
217
+
218
  image_entries.append({"path" : path,
219
  "filename" : file.filename,
220
  "size" : (width, height),
 
293
 
294
  # Clean up the file after sending
295
  path.unlink(missing_ok = True)
296
+ SESSION_STORE.pop(batch_id, None)
297
+
298
 
299
  return Response(content = content,
300
  media_type = "text/csv",
 
304
  )
305
 
306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  # ==================== MAIN ====================
308
  if __name__ == "__main__":
309
  # Explicit startup log (forces log file creation)
config/constants.py CHANGED
@@ -5,7 +5,7 @@ from dataclasses import dataclass
5
 
6
  class DetectionStatus(str, Enum):
7
  """
8
- Overall detection status
9
  """
10
  LIKELY_AUTHENTIC = "LIKELY_AUTHENTIC"
11
  REVIEW_REQUIRED = "REVIEW_REQUIRED"
@@ -41,6 +41,37 @@ class MetricType(str, Enum):
41
  COLOR = "color"
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  # Signal thresholds
46
  SIGNAL_THRESHOLDS = {SignalStatus.FLAGGED : 0.7,
@@ -314,6 +345,79 @@ class ColorAnalysisParams:
314
  )
315
 
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
 
318
  # Singleton instances for parameter classes
319
  GRADIENT_FIELD_PCA_PARAMS = GradientFieldPCAParams()
@@ -321,5 +425,19 @@ FREQUENCY_ANALYSIS_PARAMS = FrequencyAnalysisParams()
321
  NOISE_ANALYSIS_PARAMS = NoiseAnalysisParams()
322
  TEXTURE_ANALYSIS_PARAMS = TextureAnalysisParams()
323
  COLOR_ANALYSIS_PARAMS = ColorAnalysisParams()
 
 
 
 
 
 
 
324
 
 
 
 
 
 
 
325
 
 
 
5
 
6
  class DetectionStatus(str, Enum):
7
  """
8
+ Binary status derived from ensemble score only: FinalDecision supersedes this once decision policy is applied
9
  """
10
  LIKELY_AUTHENTIC = "LIKELY_AUTHENTIC"
11
  REVIEW_REQUIRED = "REVIEW_REQUIRED"
 
41
  COLOR = "color"
42
 
43
 
44
+ class EvidenceType(str, Enum):
45
+ EXIF = "exif"
46
+ WATERMARK = "watermark"
47
+
48
+
49
+
50
+ class EvidenceStrength(str, Enum):
51
+ """
52
+ Ordered by increasing certainty: WEAK < MODERATE < STRONG < CONCLUSIVE
53
+ """
54
+ WEAK = "weak" # heuristic, non-binding
55
+ MODERATE = "moderate" # strong hint, not cryptographic
56
+ STRONG = "strong" # vendor watermark, strong signal
57
+ CONCLUSIVE = "conclusive" # cryptographic / signed proof
58
+
59
+
60
+ class EvidenceDirection(str, Enum):
61
+ """
62
+ What this evidence supports
63
+ """
64
+ AI_GENERATED = "ai_generated"
65
+ AUTHENTIC = "authentic"
66
+ INDETERMINATE = "indeterminate"
67
+
68
+
69
+ class FinalDecision(str, Enum):
70
+ MOSTLY_AUTHENTIC = "mostly_authentic"
71
+ AUTHENTIC_BUT_REVIEW = "authentic_but_review"
72
+ SUSPICIOUS_AI_LIKELY = "suspicious_ai_likely"
73
+ CONFIRMED_AI_GENERATED = "confirmed_ai_generated"
74
+
75
 
76
  # Signal thresholds
77
  SIGNAL_THRESHOLDS = {SignalStatus.FLAGGED : 0.7,
 
345
  )
346
 
347
 
348
+ @dataclass(frozen = True)
349
+ class SignalConfidenceParams:
350
+ """
351
+ Parameters for Tier-1 signal confidence calculation
352
+ """
353
+ # Agreement (variance-based confidence)
354
+ VARIANCE_NORM : float = 0.10
355
+
356
+ # Distance-from-threshold confidence
357
+ DISTANCE_NORM : float = 0.30
358
+
359
+ # Fallback when metric confidence is missing
360
+ DEFAULT_RELIABILITY_CONFIDENCE : float = 0.60
361
+
362
+ # Weighting of confidence components (must sum to 1.0)
363
+ AGREEMENT_WEIGHT : float = 0.40
364
+ RELIABILITY_WEIGHT : float = 0.30
365
+ DISTANCE_WEIGHT : float = 0.30
366
+
367
+
368
+ @dataclass(frozen = True)
369
+ class WatermarkAnalysisParams:
370
+ """
371
+ Parameters for heuristic watermark detection
372
+ """
373
+ # Confidence thresholds
374
+ STRONG_CONFIDENCE_THRESHOLD : float = 0.85
375
+ CONFIDENCE_CAP : float = 0.95
376
+
377
+ # Wavelet-domain thresholds
378
+ HF_ENERGY_RATIO_THRESHOLD : float = 0.18
379
+ KURTOSIS_THRESHOLD : float = 7.5
380
+ PERIODICITY_THRESHOLD : float = 0.8
381
+
382
+ HF_ENERGY_RATIO_NORM : float = 0.4
383
+ KURTOSIS_NORM_FACTOR : float = 15.0
384
+ PEAK_STD_MULTIPLIER : float = 3.0
385
+
386
+ # Frequency-domain thresholds
387
+ MIN_ANOMALOUS_BANDS : int = 2
388
+ SPECTRAL_SYMMETRY_THRESHOLD : float = 0.6
389
+ PEAK_RATIO_THRESHOLD : float = 0.05
390
+
391
+ # LSB steganography thresholds
392
+ LSB_ENTROPY_THRESHOLD : float = 0.72
393
+ CHI_SQUARE_THRESHOLD : float = 20.0
394
+ RUNS_SCORE_THRESHOLD : float = 0.6
395
+ LSB_ENTROPY_NORM_BASE : float = 0.5
396
+ LSB_ENTROPY_NORM_RANGE : float = 0.5
397
+ CHI_SQUARE_NORM_FACTOR : float = 50.0
398
+
399
+
400
+ @dataclass(frozen = True)
401
+ class ExifAnalysisParams:
402
+ """
403
+ Parameters for EXIF metadata analysis
404
+ """
405
+ # Confidence values
406
+ MISSING_EXIF_CONFIDENCE : float = 0.5
407
+ AI_FINGERPRINT_CONFIDENCE : float = 0.9
408
+ CAMERA_BASE_CONFIDENCE : float = 0.7
409
+ CAMERA_WITH_LENS_CONFIDENCE : float = 0.75
410
+ SUSPICIOUS_CAMERA_CONFIDENCE : float = 0.4
411
+ TIMESTAMP_INCONSISTENCY_CONFIDENCE : float = 0.4
412
+ MISSING_PHOTO_METADATA_CONFIDENCE : float = 0.5
413
+ SUSPICIOUS_TIMESTAMP_CONFIDENCE : float = 0.3
414
+
415
+ # Thresholds
416
+ TIMESTAMP_DELTA_THRESHOLD : float = 5.0 # seconds
417
+ MIN_VALID_YEAR : int = 1990 # before digital cameras
418
+ MAX_FUTURE_YEARS : int = 1 # how many years in future is valid
419
+
420
+
421
 
422
  # Singleton instances for parameter classes
423
  GRADIENT_FIELD_PCA_PARAMS = GradientFieldPCAParams()
 
425
  NOISE_ANALYSIS_PARAMS = NoiseAnalysisParams()
426
  TEXTURE_ANALYSIS_PARAMS = TextureAnalysisParams()
427
  COLOR_ANALYSIS_PARAMS = ColorAnalysisParams()
428
+ SIGNAL_CONFIDENCE_PARAMS = SignalConfidenceParams()
429
+
430
+
431
+ # Singleton instances for evidence analysis classes
432
+ WATERMARK_ANALYSIS_PARAMS = WatermarkAnalysisParams()
433
+ EXIF_ANALYSIS_PARAMS = ExifAnalysisParams()
434
+
435
 
436
+ # Evidence Strength ordering
437
+ EVIDENCE_STRENGTH_ORDER = {EvidenceStrength.WEAK : 1,
438
+ EvidenceStrength.MODERATE : 2,
439
+ EvidenceStrength.STRONG : 3,
440
+ EvidenceStrength.CONCLUSIVE : 4,
441
+ }
442
 
443
+ MIN_EVIDENCE_CONFIDENCE = 0.6
config/schemas.py CHANGED
@@ -6,10 +6,15 @@ from typing import Optional
6
  from datetime import datetime
7
  from pydantic import BaseModel
8
  from config.constants import MetricType
 
9
  from config.constants import SignalStatus
 
10
  from config.constants import DetectionStatus
 
 
11
 
12
 
 
13
  class MetricResult(BaseModel):
14
  """
15
  Raw metric output for explainability and reporting
@@ -52,29 +57,63 @@ class DetectionSignal(BaseModel):
52
  }
53
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  class AnalysisResult(BaseModel):
56
  """
57
  Single image analysis result
58
  """
59
- filename : str
60
- overall_score : float = Field(..., ge = 0.0, le = 1.0)
61
- status : DetectionStatus
62
- confidence : int = Field(..., ge = 0, le = 100, description = "Confidence percentage")
63
- signals : List[DetectionSignal]
64
- metric_results : Dict[MetricType, MetricResult]
65
- processing_time : float = Field(..., description = "Processing time in seconds")
66
- timestamp : datetime = Field(default_factory = datetime.now)
67
- image_size : tuple[int, int] = Field(..., description = "Width x Height")
 
 
 
68
 
69
- model_config = {"json_schema_extra" : {"example" : {"filename" : "photo_001.jpg",
70
- "overall_score" : 0.73,
71
- "status" : "REVIEW_REQUIRED",
72
- "confidence" : 73,
73
- "signals" : [],
74
- "processing_time" : 2.34,
75
- "image_size" : [1920, 1080]
76
- }
77
- }
 
 
78
  }
79
 
80
 
@@ -91,6 +130,7 @@ class BatchAnalysisResult(BaseModel):
91
  timestamp : datetime = Field(default_factory = datetime.now)
92
 
93
 
 
94
  class APIResponse(BaseModel):
95
  """
96
  Standard API response wrapper
 
6
  from datetime import datetime
7
  from pydantic import BaseModel
8
  from config.constants import MetricType
9
+ from config.constants import EvidenceType
10
  from config.constants import SignalStatus
11
+ from config.constants import FinalDecision
12
  from config.constants import DetectionStatus
13
+ from config.constants import EvidenceStrength
14
+ from config.constants import EvidenceDirection
15
 
16
 
17
+ # Metric-Level Structures
18
  class MetricResult(BaseModel):
19
  """
20
  Raw metric output for explainability and reporting
 
57
  }
58
 
59
 
60
+ # Evidence-Level Structures
61
+ class EvidenceResult(BaseModel):
62
+ """
63
+ Declarative evidence extracted from image metadata, watermarking, or cryptographic provenance systems
64
+ """
65
+ source : EvidenceType = Field(..., description = "Evidence source type (exif, watermark, c2pa)")
66
+ finding : str = Field(..., description = "Human-readable description of the evidence")
67
+ direction : EvidenceDirection = Field(..., description = "What this evidence supports")
68
+ strength : EvidenceStrength = Field(..., description = "How strong or reliable this evidence is")
69
+ confidence : Optional[float] = Field(None, ge = 0.0, le = 1.0, description = "Confidence in the evidence extraction itself")
70
+ details : Dict = Field(default_factory = dict, description = "Raw extracted fields or technical metadata")
71
+ analyzer : str = Field(..., description = "Analyzer that produced this evidence (exif_analyzer, watermark_analyzer, etc.)")
72
+ timestamp : datetime = Field(default_factory = datetime.now)
73
+ model_config = {"json_schema_extra": {"example" : {"source" : "watermark",
74
+ "finding" : "Midjourney v6 watermark detected",
75
+ "direction" : "ai_generated",
76
+ "strength" : "strong",
77
+ "confidence" : 0.92,
78
+ "details" : {"watermark_type" : "DWT",
79
+ "vendor" : "Midjourney",
80
+ "version" : "v6"
81
+ },
82
+ "analyzer" : "watermark_analyzer"
83
+ }
84
+ }
85
+ }
86
+
87
+
88
+ # Analysis-Level Structures
89
  class AnalysisResult(BaseModel):
90
  """
91
  Single image analysis result
92
  """
93
+ filename : str
94
+ overall_score : float = Field(..., ge = 0.0, le = 1.0)
95
+ status : DetectionStatus
96
+ final_decision : Optional[FinalDecision] = Field(None, description = "Authoritative decision after evidence-first policy evaluation")
97
+ decision_explanation : Optional[str] = Field(None, description = "Human-readable explanation of final decision")
98
+ confidence : int = Field(..., ge = 0, le = 100, description = "Confidence percentage")
99
+ signals : List[DetectionSignal]
100
+ metric_results : Dict[MetricType, MetricResult]
101
+ evidence : List[EvidenceResult] = Field(default_factory = list, description = "Declarative evidence extracted before decision policy")
102
+ processing_time : float = Field(..., description = "Processing time in seconds")
103
+ timestamp : datetime = Field(default_factory = datetime.now)
104
+ image_size : tuple[int, int] = Field(..., description = "Width x Height")
105
 
106
+
107
+ model_config = {"json_schema_extra" : {"example" : {"filename" : "photo_001.jpg",
108
+ "overall_score" : 0.73,
109
+ "status" : "REVIEW_REQUIRED",
110
+ "confidence" : 73,
111
+ "signals" : [],
112
+ "evidence" : [],
113
+ "processing_time" : 2.34,
114
+ "image_size" : [1920, 1080]
115
+ }
116
+ }
117
  }
118
 
119
 
 
130
  timestamp : datetime = Field(default_factory = datetime.now)
131
 
132
 
133
+ # API Wrappers
134
  class APIResponse(BaseModel):
135
  """
136
  Standard API response wrapper
config/settings.py CHANGED
@@ -46,6 +46,10 @@ class Settings(BaseSettings):
46
  PROCESSING_TIMEOUT : int = 30
47
  PARALLEL_PROCESSING : bool = True
48
  MAX_WORKERS : int = 4
 
 
 
 
49
 
50
  # Paths
51
  BASE_DIR : Path = Path(__file__).parent.parent
 
46
  PROCESSING_TIMEOUT : int = 30
47
  PARALLEL_PROCESSING : bool = True
48
  MAX_WORKERS : int = 4
49
+ METRIC_WORKERS : int = 4
50
+ EVIDENCE_WORKERS : int = 2
51
+ METRIC_TIMEOUT : float = 5.0
52
+ EVIDENCE_TIMEOUT : float = 5.0
53
 
54
  # Paths
55
  BASE_DIR : Path = Path(__file__).parent.parent
decision_builders/decision_policy.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ from typing import List
3
+ from utils.logger import get_logger
4
+ from config.schemas import AnalysisResult
5
+ from config.schemas import EvidenceResult
6
+ from config.constants import FinalDecision
7
+ from config.constants import DetectionStatus
8
+ from config.constants import EvidenceStrength
9
+ from config.constants import EvidenceDirection
10
+ from config.constants import MIN_EVIDENCE_CONFIDENCE
11
+ from config.constants import EVIDENCE_STRENGTH_ORDER
12
+
13
+
14
+ # Setup Logging
15
+ logger = get_logger(__name__)
16
+
17
+
18
+ class DecisionPolicy:
19
+ """
20
+ Evidence-first decision policy
21
+
22
+ Responsibilities:
23
+ -----------------
24
+ - Apply authoritative rules over Tier-1 metrics
25
+ - Resolve conflicting evidence safely
26
+ - Produce a final, explainable decision
27
+ - Never perform probabilistic inference
28
+ """
29
+ def apply(self, analysis: AnalysisResult) -> AnalysisResult:
30
+ """
31
+ Apply final decision policy on analysis result
32
+ """
33
+ try:
34
+ evidence = analysis.evidence or []
35
+
36
+ if not evidence:
37
+ analysis.final_decision = self._decide_from_metrics(status = analysis.status)
38
+ analysis.decision_explanation = ("No declarative evidence detected. Final decision is based on metric analysis.")
39
+
40
+ return analysis
41
+
42
+ logger.info(f"Applying decision policy on {len(evidence)} evidence items")
43
+
44
+ decision, explanation = self._decide_from_evidence(evidence = evidence,
45
+ fallback_status = analysis.status,
46
+ )
47
+
48
+ analysis.final_decision = decision
49
+ analysis.decision_explanation = explanation
50
+
51
+ logger.info(f"Final decision resolved: {decision.value}")
52
+
53
+ return analysis
54
+
55
+ except Exception as e:
56
+ # Decision policy must NEVER break pipeline
57
+ logger.error(f"Decision policy failed: {e}")
58
+ analysis.final_decision = FinalDecision.AUTHENTIC_BUT_REVIEW
59
+
60
+ analysis.decision_explanation = ("Decision policy encountered an internal error. Image requires manual review.")
61
+ return analysis
62
+
63
+
64
+ def _decide_from_evidence(self, evidence: List[EvidenceResult], fallback_status: DetectionStatus) -> tuple[FinalDecision, str]:
65
+ """
66
+ Resolve decision from Tier-2 evidence with explanation
67
+ """
68
+ # Storage for evidences
69
+ ai_evidence = list()
70
+ authentic_evidence = list()
71
+
72
+ indeterminate_count = sum(1 for e in evidence if (e.direction == EvidenceDirection.INDETERMINATE))
73
+
74
+ for item in evidence:
75
+ if (item.direction == EvidenceDirection.AI_GENERATED):
76
+ ai_evidence.append(item)
77
+
78
+ elif (item.direction == EvidenceDirection.AUTHENTIC):
79
+ authentic_evidence.append(item)
80
+
81
+ else:
82
+ logger.debug(f"Indeterminate evidence detected and excluded from decisive rules: {item.finding} ({item.analyzer})")
83
+
84
+
85
+ strongest_ai = self._strongest(evidence = ai_evidence)
86
+ strongest_authentic = self._strongest(evidence = authentic_evidence)
87
+
88
+ # Rule 1: CONCLUSIVE AI
89
+ if (strongest_ai and (strongest_ai.strength == EvidenceStrength.CONCLUSIVE) and ((strongest_ai.confidence or 0.0) >= MIN_EVIDENCE_CONFIDENCE)):
90
+ return (FinalDecision.CONFIRMED_AI_GENERATED,
91
+ f"Conclusive evidence detected ({strongest_ai.finding}). This evidence provides cryptographic or authoritative proof of AI generation.",
92
+ )
93
+
94
+ # Rule 2: Strong AI evidence
95
+ if (strongest_ai and (strongest_ai.strength == EvidenceStrength.STRONG) and ((strongest_ai.confidence or 0.0) >= MIN_EVIDENCE_CONFIDENCE)):
96
+ if (strongest_authentic and (strongest_authentic.strength in (EvidenceStrength.STRONG, EvidenceStrength.CONCLUSIVE))):
97
+ return (FinalDecision.AUTHENTIC_BUT_REVIEW,
98
+ "Strong evidence exists for both AI generation and authenticity. Conflicting high-authority signals require human review.",
99
+ )
100
+
101
+ return (FinalDecision.SUSPICIOUS_AI_LIKELY,
102
+ f"Strong AI-related evidence detected ({strongest_ai.finding}). This evidence overrides metric-based assessment.")
103
+
104
+ # Rule 3: Strong authentic evidence
105
+ if (strongest_authentic and (strongest_authentic.strength in (EvidenceStrength.STRONG, EvidenceStrength.CONCLUSIVE))):
106
+ return (FinalDecision.MOSTLY_AUTHENTIC,
107
+ f"Strong authenticity evidence detected ({strongest_authentic.finding}). Image is assessed as mostly authentic."
108
+ )
109
+
110
+ # Rule 4: Conflicting moderate evidence
111
+ if (strongest_ai and strongest_authentic):
112
+ return (FinalDecision.AUTHENTIC_BUT_REVIEW,
113
+ "Moderate evidence exists for both AI generation and authenticity. Evidence is inconclusive and requires manual review.")
114
+
115
+ if (indeterminate_count >= 2):
116
+ return (FinalDecision.AUTHENTIC_BUT_REVIEW,
117
+ "Multiple indeterminate evidence sources detected. Evidence is inconclusive and requires manual review."
118
+ )
119
+
120
+ # Rule 5: Weak / indeterminate evidence
121
+ decision = self._decide_from_metrics(status = fallback_status)
122
+ return (decision,
123
+ "No high-authority evidence detected. Final decision is derived from metric-based analysis.",
124
+ )
125
+
126
+
127
+ def _decide_from_metrics(self, status: DetectionStatus) -> FinalDecision:
128
+ """
129
+ Convert Tier-1 metric status into final decision
130
+ """
131
+ if (status == DetectionStatus.REVIEW_REQUIRED):
132
+ return FinalDecision.SUSPICIOUS_AI_LIKELY
133
+
134
+ return FinalDecision.MOSTLY_AUTHENTIC
135
+
136
+
137
+ def _strongest(self, evidence: List[EvidenceResult]) -> EvidenceResult | None:
138
+ """
139
+ Return strongest evidence by strength, then confidence
140
+ """
141
+ if not evidence:
142
+ return None
143
+
144
+ return max(evidence,
145
+ key = lambda item: (EVIDENCE_STRENGTH_ORDER.get(item.strength, 0), item.confidence or 0.0),
146
+ )
evidence_analyzers/evidence_aggregator.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ import time
3
+ from typing import List
4
+ from pathlib import Path
5
+ from utils.logger import get_logger
6
+ from config.settings import settings
7
+ from config.schemas import EvidenceResult
8
+ from concurrent.futures import TimeoutError
9
+ from concurrent.futures import as_completed
10
+ from config.constants import EvidenceStrength
11
+ from config.constants import EvidenceDirection
12
+ from concurrent.futures import ThreadPoolExecutor
13
+ from config.constants import EVIDENCE_STRENGTH_ORDER
14
+ from evidence_analyzers.exif_analyzer import ExifAnalyzer
15
+ from evidence_analyzers.watermark_analyzer import WatermarkAnalyzer
16
+
17
+
18
+ # Setup Logging
19
+ logger = get_logger(__name__)
20
+
21
+
22
+ class EvidenceAggregator:
23
+ """
24
+ Tier-2 Evidence Orchestrator
25
+
26
+ Responsibilities:
27
+ -----------------
28
+ - Execute all evidence analyzers safely
29
+ - Collect declarative evidence only (no inference)
30
+ - Deduplicate overlapping findings
31
+ - Rank evidence by authority & reliability
32
+ - Remain forward-compatible with new evidence sources
33
+ """
34
+ def __init__(self):
35
+ """
36
+ Initialize all Tier-2 analyzers
37
+
38
+ NOTE:
39
+ -----
40
+ Any new analyzer (C2PA, camera fingerprinting, sensor PRNU) must be added here explicitly
41
+ """
42
+ self.exif_analyzer = ExifAnalyzer()
43
+ self.watermark_analyzer = WatermarkAnalyzer()
44
+
45
+ self._analyzers = (self.exif_analyzer,
46
+ self.watermark_analyzer,
47
+ )
48
+
49
+ logger.info("EvidenceAggregator initialized with analyzers: "
50
+ f"{[a.__class__.__name__ for a in self._analyzers]}")
51
+
52
+
53
+ def analyze(self, image_path: Path) -> List[EvidenceResult]:
54
+ """
55
+ Run Tier-2 evidence extraction pipeline
56
+
57
+ Arguments:
58
+ ----------
59
+ image_path {Path} : Path to image file
60
+
61
+ Returns:
62
+ --------
63
+ { list } : Ordered, deduplicated evidence
64
+ """
65
+ # Small, bounded executor for Tier-2 (I/O oriented)
66
+ max_workers = min(len(self._analyzers), settings.EVIDENCE_WORKERS or 2)
67
+
68
+ logger.info(f"Starting Tier-2 evidence analysis: {image_path}")
69
+
70
+ evidence_collected = list()
71
+
72
+
73
+ with ThreadPoolExecutor(max_workers = max_workers) as executor:
74
+ futures = {executor.submit(analyzer.analyze, image_path = image_path): {"analyzer": analyzer, "start": time.time()} for analyzer in self._analyzers}
75
+
76
+ for future in as_completed(futures):
77
+ meta = futures[future]
78
+ analyzer = meta["analyzer"]
79
+ start = meta["start"]
80
+
81
+ try:
82
+ results = future.result(timeout = settings.EVIDENCE_TIMEOUT)
83
+
84
+ logger.debug(f"{analyzer.__class__.__name__} completed in {time.time()-start:.2f}s")
85
+
86
+ if results:
87
+ evidence_collected.extend(results)
88
+ logger.debug(f"{analyzer.__class__.__name__} returned {len(results)} evidence items")
89
+
90
+ else:
91
+ logger.debug(f"{analyzer.__class__.__name__} returned no evidence")
92
+
93
+ except TimeoutError:
94
+ logger.warning(f"{analyzer.__class__.__name__} timed out")
95
+
96
+ except Exception as e:
97
+ logger.error(f"{analyzer.__class__.__name__} failed: {e}")
98
+
99
+ if not evidence_collected:
100
+ logger.info("No Tier-2 evidence detected")
101
+ return []
102
+
103
+ # Normalize, deduplicate & rank
104
+ evidence = self._deduplicate(evidence = evidence_collected)
105
+ evidence = self._rank_evidence(evidence = evidence)
106
+
107
+ logger.info(f"Tier-2 evidence finalized: {len(evidence)} items")
108
+
109
+ return evidence
110
+
111
+
112
+ def _deduplicate(self, evidence: List[EvidenceResult]) -> List[EvidenceResult]:
113
+ """
114
+ Deduplicate evidence items
115
+
116
+ Strategy:
117
+ ---------
118
+ - Same analyzer
119
+ - Same semantic finding
120
+ - Same direction
121
+
122
+ Keeps the strongest / highest confidence instance
123
+ """
124
+ unique_map = dict()
125
+
126
+ for item in evidence:
127
+ key = (item.analyzer, item.finding, item.direction)
128
+
129
+ if key not in unique_map:
130
+ unique_map[key] = item
131
+ continue
132
+
133
+ existing = unique_map[key]
134
+ existing_strength_rank = self._strength_rank(strength = existing.strength)
135
+ item_strength_rank = self._strength_rank(strength = item.strength)
136
+
137
+ # Prefer stronger evidence
138
+ if (item_strength_rank > existing_strength_rank):
139
+ unique_map[key] = item
140
+ continue
141
+
142
+ # Prefer higher confidence if strength equal
143
+ if (item_strength_rank == existing_strength_rank):
144
+ if (item.confidence or 0.0) > (existing.confidence or 0.0):
145
+ unique_map[key] = item
146
+
147
+ deduped = list(unique_map.values())
148
+
149
+ logger.debug(f"Deduplicated evidence: {len(evidence)} → {len(deduped)}")
150
+
151
+ return deduped
152
+
153
+
154
+ def _rank_evidence(self, evidence: List[EvidenceResult]) -> List[EvidenceResult]:
155
+ """
156
+ Rank evidence by authority
157
+
158
+ Ranking precedence:
159
+ -------------------
160
+ 1. Direction (AI > AUTHENTIC > INDETERMINATE)
161
+ 2. Strength (CONCLUSIVE > STRONG > MODERATE > WEAK)
162
+ 3. Confidence (higher wins)
163
+ """
164
+ def priority(e: EvidenceResult) -> tuple:
165
+ return (self._direction_rank(direction = e.direction),
166
+ self._strength_rank(strength = e.strength),
167
+ e.confidence or 0.0,
168
+ )
169
+
170
+ ranked = sorted(evidence, key = priority, reverse = True)
171
+
172
+ logger.debug("Evidence ranking completed")
173
+
174
+ return ranked
175
+
176
+
177
+ @staticmethod
178
+ def _direction_rank(direction: EvidenceDirection) -> int:
179
+ """
180
+ Evidence direction priority
181
+ """
182
+ return {EvidenceDirection.AI_GENERATED : 3,
183
+ EvidenceDirection.AUTHENTIC : 2,
184
+ EvidenceDirection.INDETERMINATE : 1,
185
+ }.get(direction, 0)
186
+
187
+
188
+ @staticmethod
189
+ def _strength_rank(strength: EvidenceStrength) -> int:
190
+ """
191
+ Evidence strength priority
192
+ """
193
+ return EVIDENCE_STRENGTH_ORDER.get(strength, 0)
evidence_analyzers/exif_analyzer.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ from PIL import Image
3
+ from typing import List
4
+ from typing import Dict
5
+ from pathlib import Path
6
+ from PIL import ExifTags
7
+ from typing import Optional
8
+ from datetime import datetime
9
+ from utils.logger import get_logger
10
+ from config.constants import EvidenceType
11
+ from config.schemas import EvidenceResult
12
+ from config.constants import EvidenceStrength
13
+ from config.constants import EvidenceDirection
14
+ from config.constants import EXIF_ANALYSIS_PARAMS
15
+
16
+
17
+ # Setup Logging
18
+ logger = get_logger(__name__)
19
+
20
+
21
class ExifAnalyzer:
    """
    Rule-based EXIF analyzer that produces declarative evidence only: No probabilistic inference

    `analyze` opens the image, normalizes its EXIF tags to a {name: text} dictionary and concatenates
    the EvidenceResult items returned by each private `_analyze_*` check
    """
    # Lower-case substrings that identify a known AI generation tool inside a text tag
    AI_SOFTWARE_FINGERPRINTS = {"sdxl",
                                "dall-e",
                                "dall·e",
                                "openai",
                                "imagen",
                                "runway",
                                "comfyui",
                                "firefly",
                                "novelai",
                                "craiyon",
                                "leonardo",
                                "midjourney",
                                "adobe sensei",
                                "automatic1111",
                                "waifu diffusion",
                                "stable diffusion",
                                "bing image creator",
                               }

    # Tags describing the capturing device
    CAMERA_FIELDS            = {"Make",
                                "Model",
                                "LensModel",
                               }

    # Tags carrying a capture / modification timestamp
    TIME_FIELDS              = {"DateTime",
                                "DateTimeOriginal",
                                "DateTimeDigitized",
                               }

    # Free-text tags that are searched for AI tool fingerprints
    AI_INDICATOR_FIELDS      = {"Artist",
                                "Software",
                                "XPComment",
                                "UserComment",
                                "ImageDescription",
                                "ProcessingSoftware",
                               }

    # strptime formats tried, in order, when parsing a timestamp tag
    TIMESTAMP_FORMATS        = ["%Y:%m:%d %H:%M:%S",
                                "%Y-%m-%d %H:%M:%S",
                                "%Y:%m:%d %H:%M:%S.%f",
                               ]

    # Exposure-related tags; at least one is expected next to camera make / model
    PHOTO_METADATA_FIELDS    = {"ISO",
                                "FNumber",
                                "FocalLength",
                                "ExposureTime",
                                "ISOSpeedRatings",
                               }

    # Lower-case substrings that make a camera make / model look like a placeholder
    SUSPICIOUS_PATTERNS      = {"unknown",
                                "none",
                                "camera",
                                "test",
                                "generic",
                                "placeholder",
                                "sample",
                               }

    # Tag id of the pointer to the Exif sub-IFD (TIFF / EXIF tag 0x8769)
    EXIF_SUB_IFD_TAG         = 0x8769


    def analyze(self, image_path: Path) -> List[EvidenceResult]:
        """
        Run every EXIF check against the image stored at `image_path`

        Arguments:
        ----------
            image_path { Path } : Location of the image file to inspect

        Returns:
        --------
            { list } : EvidenceResult items; a single "missing EXIF" item when no metadata exists and
                       a single "EXIF parsing failed" item when any exception occurs (never raises)
        """
        evidence = list()
        logger.debug(f"Starting EXIF analysis for {image_path}")

        try:
            # The context manager releases the file handle even when parsing fails; normalization
            # happens inside it because sub-IFD access may still need the open file
            with Image.open(fp = image_path, mode = "r") as image:
                exif_raw = image.getexif()
                exif     = self._normalize_exif(exif_raw = exif_raw) if exif_raw else dict()

            if not exif:
                logger.info("No EXIF metadata found")
                evidence.append(self._missing_exif())
                return evidence

            logger.debug(f"Normalized EXIF fields: {list(exif.keys())}")

            evidence.extend(self._analyze_ai_indicators(exif = exif))
            evidence.extend(self._analyze_camera_presence(exif = exif))
            evidence.extend(self._analyze_timestamp_consistency(exif = exif))
            evidence.extend(self._analyze_suspicious_combinations(exif = exif))

        except Exception as e:
            logger.exception("EXIF parsing failed")
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = "EXIF parsing failed",
                                           direction  = EvidenceDirection.INDETERMINATE,
                                           strength   = EvidenceStrength.WEAK,
                                           confidence = 0.0,
                                           details    = {"error": str(e)},
                                           analyzer   = "exif_analyzer",
                                          )
                           )

        logger.debug(f"EXIF analysis completed with {len(evidence)} evidence items")
        return evidence


    def _normalize_exif(self, exif_raw) -> Dict[str, str]:
        """
        Normalize EXIF tags to human-readable names with text values

        Pillow's `Image.getexif()` mapping only iterates the primary IFD; capture settings such as
        DateTimeOriginal, FNumber, ExposureTime or LensModel are stored in the Exif sub-IFD, so that
        IFD is merged in as well (primary-IFD values win on a name clash)

        Arguments:
        ----------
            exif_raw { mapping } : Tag-id -> value mapping; when it offers `get_ifd` the Exif
                                   sub-IFD is read through it

        Returns:
        --------
            { dict } : Tag name (or stringified tag id when unknown) -> decoded text value
        """
        normalized_exif = dict()

        for tag, value in exif_raw.items():
            tag_name                  = str(ExifTags.TAGS.get(tag, tag))
            normalized_exif[tag_name] = self._decode_tag_value(tag_name = tag_name, value = value)

        # Plain dictionaries (and very old Pillow versions) have no get_ifd: nothing to merge then
        get_ifd = getattr(exif_raw, "get_ifd", None)

        if callable(get_ifd):
            try:
                sub_ifd = get_ifd(self.EXIF_SUB_IFD_TAG) or dict()

            except Exception as e:
                # A corrupt sub-IFD must not discard the primary-IFD tags already collected
                logger.debug(f"Could not read Exif sub-IFD: {e}")
                sub_ifd = dict()

            for tag, value in sub_ifd.items():
                tag_name = str(ExifTags.TAGS.get(tag, tag))

                if tag_name not in normalized_exif:
                    normalized_exif[tag_name] = self._decode_tag_value(tag_name = tag_name, value = value)

        return normalized_exif


    def _decode_tag_value(self, tag_name: str, value) -> str:
        """
        Convert one raw EXIF value to clean text

        - bytes of `XP*` tags (XPComment, XPAuthor, ...) are decoded as UTF-16LE, every other bytes
          value as UTF-8; undecodable bytes are dropped
        - NUL characters and surrounding whitespace are removed so padded values compare cleanly

        Arguments:
        ----------
            tag_name { str } : Normalized tag name, used to pick the text encoding

            value    { any } : Raw tag value

        Returns:
        --------
            { str } : Decoded, stripped text
        """
        if isinstance(value, bytes):
            encoding = "utf-16-le" if tag_name.startswith("XP") else "utf-8"
            text     = value.decode(encoding, errors = "ignore")

        else:
            text = str(value)

        return text.replace("\x00", "").strip()


    def _missing_exif(self) -> EvidenceResult:
        """
        Build the evidence item for an image without EXIF: suspicious but not conclusive
        """
        return EvidenceResult(source     = EvidenceType.EXIF,
                              finding    = "No EXIF metadata present (common in AI images and processed web images)",
                              direction  = EvidenceDirection.INDETERMINATE,
                              strength   = EvidenceStrength.WEAK,
                              confidence = EXIF_ANALYSIS_PARAMS.MISSING_EXIF_CONFIDENCE,
                              details    = {"note": "Missing EXIF alone is not conclusive"},
                              analyzer   = "exif_analyzer",
                             )


    def _analyze_ai_indicators(self, exif: Dict[str, str]) -> List[EvidenceResult]:
        """
        Check the free-text EXIF fields for AI tool fingerprints

        Fields and fingerprints are visited in sorted order so the reported match does not depend
        on set iteration order; at most one item is produced per field

        Arguments:
        ----------
            exif { dict } : Normalized EXIF tags

        Returns:
        --------
            { list } : One STRONG / AI_GENERATED EvidenceResult per field containing a fingerprint
        """
        evidence = list()

        for field_name in sorted(self.AI_INDICATOR_FIELDS):
            field_value = exif.get(field_name, "").lower()

            if not field_value:
                continue

            logger.debug(f"Checking {field_name}: {field_value}")

            fingerprint = next((fp for fp in sorted(self.AI_SOFTWARE_FINGERPRINTS) if fp in field_value), None)

            if fingerprint is None:
                continue

            logger.warning(f"AI software fingerprint detected in {field_name}: {fingerprint}")
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = f"EXIF {field_name} tag matches known AI tool: '{field_value}'",
                                           direction  = EvidenceDirection.AI_GENERATED,
                                           strength   = EvidenceStrength.STRONG,
                                           confidence = EXIF_ANALYSIS_PARAMS.AI_FINGERPRINT_CONFIDENCE,
                                           details    = {"field"       : field_name,
                                                         "value"       : field_value,
                                                         "fingerprint" : fingerprint
                                                        },
                                           analyzer   = "exif_analyzer",
                                          )
                           )

        return evidence


    def _analyze_camera_presence(self, exif: Dict[str, str]) -> List[EvidenceResult]:
        """
        Analyze camera make / model / lens tags for authenticity indicators

        Arguments:
        ----------
            exif { dict } : Normalized EXIF tags

        Returns:
        --------
            { list } : Empty without both Make and Model; otherwise one item that is either
                       WEAK / INDETERMINATE (placeholder-looking values) or MODERATE / AUTHENTIC
        """
        evidence = list()

        make     = exif.get("Make")
        model    = exif.get("Model")
        lens     = exif.get("LensModel")

        if not (make and model):
            logger.info("No camera metadata present")
            return evidence

        logger.debug(f"Camera metadata found: {make} {model}")

        confidence = EXIF_ANALYSIS_PARAMS.CAMERA_BASE_CONFIDENCE
        details    = {"make": make, "model": model}

        if lens:
            # A lens tag next to make / model switches to the dedicated confidence value
            confidence      = EXIF_ANALYSIS_PARAMS.CAMERA_WITH_LENS_CONFIDENCE
            details["lens"] = lens
            logger.debug(f"Lens metadata found: {lens}")

        if self._is_suspicious_camera_data(make = make, model = model):
            logger.warning(f"Suspicious camera metadata: {make} {model}")
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = f"Suspicious camera metadata detected: {make} {model}",
                                           direction  = EvidenceDirection.INDETERMINATE,
                                           strength   = EvidenceStrength.WEAK,
                                           confidence = EXIF_ANALYSIS_PARAMS.SUSPICIOUS_CAMERA_CONFIDENCE,
                                           details    = details,
                                           analyzer   = "exif_analyzer",
                                          )
                           )

        else:
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = f"Camera metadata present: {make} {model}",
                                           direction  = EvidenceDirection.AUTHENTIC,
                                           strength   = EvidenceStrength.MODERATE,
                                           confidence = confidence,
                                           details    = details,
                                           analyzer   = "exif_analyzer",
                                          )
                           )

        return evidence


    def _is_suspicious_camera_data(self, make: str, model: str) -> bool:
        """
        Check if camera data looks fake: any SUSPICIOUS_PATTERNS entry occurring as a
        case-insensitive substring of `make` or `model`
        """
        make_lower  = make.lower()
        model_lower = model.lower()

        return any((pattern in make_lower) or (pattern in model_lower) for pattern in self.SUSPICIOUS_PATTERNS)


    def _analyze_timestamp_consistency(self, exif: Dict[str, str]) -> List[EvidenceResult]:
        """
        Check for inconsistencies between the EXIF timestamp fields

        Arguments:
        ----------
            exif { dict } : Normalized EXIF tags

        Returns:
        --------
            { list } : One WEAK / INDETERMINATE item when at least two timestamps parse and their
                       spread exceeds EXIF_ANALYSIS_PARAMS.TIMESTAMP_DELTA_THRESHOLD seconds
        """
        timestamps = dict()

        for field in sorted(self.TIME_FIELDS):
            if (field not in exif):
                continue

            parsed_time = self._parse_timestamp(timestamp_str = exif[field])

            if parsed_time:
                timestamps[field] = parsed_time

        # A spread needs at least two parsable timestamps
        if (len(timestamps) < 2):
            return []

        time_values   = list(timestamps.values())
        delta_seconds = (max(time_values) - min(time_values)).total_seconds()

        logger.debug(f"Timestamp delta: {delta_seconds} seconds across {len(timestamps)} fields")

        if (delta_seconds <= EXIF_ANALYSIS_PARAMS.TIMESTAMP_DELTA_THRESHOLD):
            return []

        logger.warning(f"Inconsistent EXIF timestamps detected: {delta_seconds}s delta")

        return [EvidenceResult(source     = EvidenceType.EXIF,
                               finding    = f"Inconsistent EXIF timestamps ({delta_seconds:.1f}s difference)",
                               direction  = EvidenceDirection.INDETERMINATE,
                               strength   = EvidenceStrength.WEAK,
                               confidence = EXIF_ANALYSIS_PARAMS.TIMESTAMP_INCONSISTENCY_CONFIDENCE,
                               details    = {"delta_seconds" : delta_seconds,
                                             "timestamps"    : {k: v.isoformat() for k, v in timestamps.items()},
                                            },
                               analyzer   = "exif_analyzer",
                              )
               ]


    def _parse_timestamp(self, timestamp_str: str) -> Optional[datetime]:
        """
        Parse a timestamp by trying each TIMESTAMP_FORMATS entry in order

        Arguments:
        ----------
            timestamp_str { str } : Raw tag text; surrounding whitespace is ignored

        Returns:
        --------
            { datetime } : Naive datetime of the first matching format, None when nothing matches
        """
        candidate = timestamp_str.strip() if isinstance(timestamp_str, str) else timestamp_str

        for fmt in self.TIMESTAMP_FORMATS:
            try:
                return datetime.strptime(candidate, fmt)

            except (ValueError, TypeError):
                continue

        logger.debug(f"Could not parse timestamp: {timestamp_str}")
        return None


    def _analyze_suspicious_combinations(self, exif: Dict[str, str]) -> List[EvidenceResult]:
        """
        Detect suspicious combinations of EXIF data

        - camera make and model present while every PHOTO_METADATA_FIELDS tag is absent
        - the first timestamp field (sorted order) with an implausible value

        Arguments:
        ----------
            exif { dict } : Normalized EXIF tags

        Returns:
        --------
            { list } : Zero, one or two WEAK / INDETERMINATE EvidenceResult items
        """
        evidence           = list()

        has_camera         = bool(exif.get("Make") and exif.get("Model"))
        has_photo_metadata = any(exif.get(field) for field in self.PHOTO_METADATA_FIELDS)

        if (has_camera and not has_photo_metadata):
            logger.warning("Camera metadata present but missing photographic settings")
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = "Camera identified but photographic metadata missing (suspicious)",
                                           direction  = EvidenceDirection.INDETERMINATE,
                                           strength   = EvidenceStrength.WEAK,
                                           confidence = EXIF_ANALYSIS_PARAMS.MISSING_PHOTO_METADATA_CONFIDENCE,
                                           details    = {"has_camera"       : True,
                                                         # sorted: stable output regardless of set ordering
                                                         "missing_settings" : sorted(self.PHOTO_METADATA_FIELDS),
                                                        },
                                           analyzer   = "exif_analyzer",
                                          )
                           )

        for field in sorted(self.TIME_FIELDS):
            if (field not in exif):
                continue

            timestamp = self._parse_timestamp(timestamp_str = exif[field])

            if not timestamp:
                continue

            reason = self._suspicious_timestamp_reason(dt = timestamp)

            if reason is None:
                continue

            logger.warning(f"Suspicious timestamp detected: {timestamp}")
            evidence.append(EvidenceResult(source     = EvidenceType.EXIF,
                                           finding    = f"Suspicious timestamp pattern in {field}",
                                           direction  = EvidenceDirection.INDETERMINATE,
                                           strength   = EvidenceStrength.WEAK,
                                           confidence = EXIF_ANALYSIS_PARAMS.SUSPICIOUS_TIMESTAMP_CONFIDENCE,
                                           details    = {"field"     : field,
                                                         "timestamp" : timestamp.isoformat(),
                                                         "reason"    : reason,
                                                        },
                                           analyzer   = "exif_analyzer",
                                          )
                           )
            # Only the first suspicious field is reported
            break

        return evidence


    def _suspicious_timestamp_reason(self, dt: datetime) -> Optional[str]:
        """
        Explain why a timestamp looks fake, or return None when it looks plausible

        Checks run in this order: exact midnight, year below EXIF_ANALYSIS_PARAMS.MIN_VALID_YEAR,
        year more than EXIF_ANALYSIS_PARAMS.MAX_FUTURE_YEARS after the current year
        """
        if ((dt.hour == 0) and (dt.minute == 0) and (dt.second == 0)):
            return "Suspiciously round time (midnight or all zeros)"

        if (dt.year < EXIF_ANALYSIS_PARAMS.MIN_VALID_YEAR):
            return f"Year {dt.year} is earlier than the minimum valid year"

        if (dt.year > datetime.now().year + EXIF_ANALYSIS_PARAMS.MAX_FUTURE_YEARS):
            return f"Year {dt.year} is too far in the future"

        return None


    def _is_suspicious_timestamp(self, dt: datetime) -> bool:
        """
        Check if timestamp looks fake: too perfect/round or outside the plausible year range
        """
        return self._suspicious_timestamp_reason(dt = dt) is not None
evidence_analyzers/watermark_analyzer.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dependencies
2
+ import pywt
3
+ import cv2
4
+ import numpy as np
5
+ from typing import List
6
+ from typing import Tuple
7
+ from pathlib import Path
8
+ from scipy import fftpack
9
+ from scipy.stats import entropy
10
+ from utils.logger import get_logger
11
+ from scipy.signal import correlate2d
12
+ from config.schemas import EvidenceResult
13
+ from config.constants import EvidenceType
14
+ from config.constants import EvidenceStrength
15
+ from config.constants import EvidenceDirection
16
+ from utils.image_processor import ImageProcessor
17
+ from config.constants import WATERMARK_ANALYSIS_PARAMS
18
+
19
+
20
+ # Setup Logging
21
+ logger = get_logger(__name__)
22
+
23
+
24
class WatermarkAnalyzer:
    """
    Generic, vendor-agnostic watermark detector built on signal-processing heuristics:
    - wavelet-domain energy / kurtosis / periodicity analysis
    - frequency-domain (FFT) peak and symmetry analysis
    - least-significant-bit (LSB) plane statistics

    All findings are heuristic; nothing here verifies a cryptographic or vendor watermark
    """
    def __init__(self):
        self.image_processor = ImageProcessor()


    def analyze(self, image_path: Path) -> List[EvidenceResult]:
        """
        Load the image and run the three watermark detectors

        Arguments:
        ----------
            image_path { Path } : Location of the image file

        Returns:
        --------
            { list } : Concatenated detector findings (possibly empty); errors raised while loading
                       the image propagate to the caller
        """
        logger.debug(f"Starting watermark analysis for {image_path}")

        image    = self.image_processor.load_image(image_path)
        evidence = list()

        evidence.extend(self._detect_wavelet_watermarks(image = image))
        evidence.extend(self._detect_frequency_watermarks(image = image))
        evidence.extend(self._detect_lsb_steganography(image = image))

        logger.debug(f"Watermark analysis completed with {len(evidence)} findings")

        return evidence


    def _to_grayscale(self, image: np.ndarray) -> np.ndarray:
        """
        Return a single-channel version of `image`

        - 2-D input is copied as is
        - (H, W, 1) input has its channel axis dropped
        - (H, W, 4) input loses its last channel first
          (NOTE(review): assumes RGB followed by alpha - confirm against ImageProcessor.load_image)
        - everything else is converted with cv2.COLOR_RGB2GRAY
        """
        if (image.ndim != 3):
            return image.copy()

        if (image.shape[2] == 1):
            return image[:, :, 0].copy()

        if (image.shape[2] == 4):
            # cvtColor with COLOR_RGB2GRAY accepts three channels only
            image = np.ascontiguousarray(image[:, :, :3])

        return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)


    def _detect_wavelet_watermarks(self, image: np.ndarray) -> List[EvidenceResult]:
        """
        Detect watermarks embedded in wavelet domain
        - Many invisible watermarks modify high-frequency wavelet coefficients
        - This is a general technique used by multiple AI generators

        Arguments:
        ----------
            image { np.ndarray } : Image array, 2-D or with a trailing channel axis

        Returns:
        --------
            { list } : One EvidenceResult when the heuristic fires, otherwise empty; any internal
                       error is logged and yields an empty list
        """
        logger.debug("Checking for wavelet-domain watermarks")

        try:
            gray = self._to_grayscale(image = image)

            # Single-level 2-D Haar decomposition: approximation + three detail sub-bands
            cA, (cH, cV, cD) = pywt.dwt2(gray, 'haar')

            # Sub-band energies are measured as coefficient variance
            energy_approx   = np.var(cA)
            total_hf_energy = np.var(cH) + np.var(cV) + np.var(cD)
            total_energy    = energy_approx + total_hf_energy

            if (total_energy == 0):
                return []

            # Share of the energy located in the detail (high-frequency) sub-bands
            hf_ratio         = total_hf_energy / total_energy
            anomalous_energy = hf_ratio > WATERMARK_ANALYSIS_PARAMS.HF_ENERGY_RATIO_THRESHOLD

            # Heavy-tailed detail coefficients are treated as a second anomaly signal
            avg_kurtosis           = (self._calculate_kurtosis(data = cH) +
                                      self._calculate_kurtosis(data = cV) +
                                      self._calculate_kurtosis(data = cD)) / 3
            anomalous_distribution = avg_kurtosis > WATERMARK_ANALYSIS_PARAMS.KURTOSIS_THRESHOLD

            # Check for periodic patterns (grid-based embedding)
            periodicity_score = self._detect_periodicity(cH, cV, cD)
            periodic          = periodicity_score > WATERMARK_ANALYSIS_PARAMS.PERIODICITY_THRESHOLD

            # Anomalous energy is mandatory; it must be backed by kurtosis or periodicity
            detected = anomalous_energy and (anomalous_distribution or periodic)

            if not detected:
                return []

            # Every partial score is clipped to 1.0 so no single term dominates the mean
            confidence = self._calculate_confidence([min(hf_ratio / WATERMARK_ANALYSIS_PARAMS.HF_ENERGY_RATIO_NORM, 1.0),
                                                     min(avg_kurtosis / WATERMARK_ANALYSIS_PARAMS.KURTOSIS_NORM_FACTOR, 1.0),
                                                     periodicity_score
                                                    ])

            is_strong  = (confidence >= WATERMARK_ANALYSIS_PARAMS.STRONG_CONFIDENCE_THRESHOLD)
            direction  = (EvidenceDirection.AI_GENERATED if is_strong else EvidenceDirection.INDETERMINATE)
            strength   = (EvidenceStrength.STRONG if is_strong else EvidenceStrength.MODERATE)

            logger.warning(f"Heuristic watermark pattern detected in wavelet domain: (confidence: {confidence:.2f})")

            return [EvidenceResult(source     = EvidenceType.WATERMARK,
                                   finding    = "Statistical patterns consistent with invisible watermarking or steganographic embedding detected",
                                   direction  = direction,
                                   strength   = strength,
                                   confidence = confidence,
                                   details    = {"method"               : "wavelet_analysis",
                                                 "note"                 : "Heuristic detection; not cryptographic or vendor watermark verification",
                                                 "high_frequency_ratio" : float(hf_ratio),
                                                 "avg_kurtosis"         : float(avg_kurtosis),
                                                 "periodicity_score"    : float(periodicity_score),
                                                 "wavelet_type"         : "haar"
                                                },
                                   analyzer   = "watermark_analyzer",
                                  )
                   ]

        except Exception as e:
            logger.error(f"Error in wavelet watermark detection: {e}", exc_info = True)

        return []


    def _detect_frequency_watermarks(self, image: np.ndarray) -> List[EvidenceResult]:
        """
        Detect watermarks in frequency domain using FFT analysis: Watermarks often add imperceptible
        patterns in specific frequency bands

        Arguments:
        ----------
            image { np.ndarray } : Image array, 2-D or with a trailing channel axis

        Returns:
        --------
            { list } : One EvidenceResult when enough radial bands show anomalous peaks and the
                       spectrum is sufficiently mirror-symmetric, otherwise empty; any internal
                       error is logged and yields an empty list
        """
        logger.debug("Checking for frequency-domain watermarks")

        try:
            gray = self._to_grayscale(image = image)

            # Centred magnitude spectrum on a log scale so weak components stay visible
            magnitude_log = np.log1p(np.abs(fftpack.fftshift(fftpack.fft2(gray))))

            # Radial bands expressed as fractions of the largest inscribed radius
            frequency_bands = [(0.2, 0.4),   # Mid-low frequencies
                               (0.4, 0.6),   # Mid frequencies
                               (0.6, 0.8),   # Mid-high frequencies
                              ]
            band_anomalies  = list()

            for low, high in frequency_bands:
                mask        = self._create_radial_mask(magnitude_log.shape, low, high)
                band_values = magnitude_log[mask]

                if (band_values.size == 0):
                    continue

                # A peak is a bin exceeding the band mean by a configured number of standard deviations
                threshold  = np.mean(band_values) + WATERMARK_ANALYSIS_PARAMS.PEAK_STD_MULTIPLIER * np.std(band_values)
                peaks      = np.sum(band_values > threshold)
                peak_ratio = peaks / band_values.size

                if (peak_ratio > WATERMARK_ANALYSIS_PARAMS.PEAK_RATIO_THRESHOLD):
                    band_anomalies.append({'band'       : (low, high),
                                           'peak_ratio' : float(peak_ratio),
                                           'peak_count' : int(peaks)
                                          })

            # Check for symmetric patterns (common in structured watermarks)
            symmetry_score = self._check_spectral_symmetry(magnitude = magnitude_log)

            detected       = ((len(band_anomalies) >= WATERMARK_ANALYSIS_PARAMS.MIN_ANOMALOUS_BANDS) and
                              (symmetry_score > WATERMARK_ANALYSIS_PARAMS.SPECTRAL_SYMMETRY_THRESHOLD))

            if not detected:
                return []

            confidence = self._calculate_confidence([min(len(band_anomalies) / len(frequency_bands), 1.0),
                                                     symmetry_score
                                                    ])

            is_strong  = (confidence >= WATERMARK_ANALYSIS_PARAMS.STRONG_CONFIDENCE_THRESHOLD)
            direction  = (EvidenceDirection.AI_GENERATED if is_strong else EvidenceDirection.INDETERMINATE)
            strength   = (EvidenceStrength.STRONG if is_strong else EvidenceStrength.MODERATE)

            logger.warning(f"Heuristic watermark pattern detected in Frequency-domain: (confidence: {confidence:.2f})")

            return [EvidenceResult(source     = EvidenceType.WATERMARK,
                                   finding    = "Statistical patterns consistent with invisible watermarking or steganographic embedding detected",
                                   direction  = direction,
                                   strength   = strength,
                                   confidence = confidence,
                                   details    = {"method"          : "frequency_analysis",
                                                 "note"            : "Heuristic detection; not cryptographic or vendor watermark verification",
                                                 "anomalous_bands" : len(band_anomalies),
                                                 "band_details"    : band_anomalies,
                                                 "symmetry_score"  : float(symmetry_score),
                                                },
                                   analyzer   = "watermark_analyzer",
                                  )
                   ]

        except Exception as e:
            logger.error(f"Error in frequency watermark detection: {e}", exc_info = True)

        return []


    def _detect_lsb_steganography(self, image: np.ndarray) -> List[EvidenceResult]:
        """
        Detect steganographic watermarks using LSB (Least Significant Bit) analysis.
        Many watermarking schemes embed data in the LSB planes.

        Arguments:
        ----------
            image { np.ndarray } : Image array, 2-D or with a trailing channel axis; the bitwise
                                   operations require an integer dtype (other dtypes end up in the
                                   error branch)

        Returns:
        --------
            { list } : One EvidenceResult when the averaged LSB entropy is suspicious together with
                       the chi-square or the runs score, otherwise empty; any internal error is
                       logged and yields an empty list
        """
        logger.debug("Checking for LSB steganography")

        try:
            # Analyze all color channels
            if (image.ndim == 3):
                channels = [image[:, :, idx] for idx in range(image.shape[2])]

            else:
                channels = [image]

            channel_results = list()

            for idx, channel in enumerate(channels):
                lsb_plane = channel & 1          # Least significant bit
                msb_plane = (channel >> 7) & 1   # Bit 7, reported for comparison only

                channel_results.append({'channel'     : idx,
                                        'lsb_entropy' : float(self._shannon_entropy(lsb_plane)),
                                        'msb_entropy' : float(self._shannon_entropy(msb_plane)),
                                        'chi_square'  : float(self._chi_square_test(lsb_plane)),
                                        'runs_score'  : float(self._runs_test(lsb_plane))
                                       })

            # Average results across channels
            avg_lsb_entropy = np.mean([r['lsb_entropy'] for r in channel_results])
            avg_chi_square  = np.mean([r['chi_square'] for r in channel_results])
            avg_runs        = np.mean([r['runs_score'] for r in channel_results])

            # Detection criteria: suspicious entropy is mandatory and must be backed by either
            # the chi-square statistic or the runs score
            suspicious_entropy = (avg_lsb_entropy > WATERMARK_ANALYSIS_PARAMS.LSB_ENTROPY_THRESHOLD)
            suspicious_chi     = (avg_chi_square > WATERMARK_ANALYSIS_PARAMS.CHI_SQUARE_THRESHOLD)
            suspicious_runs    = (avg_runs > WATERMARK_ANALYSIS_PARAMS.RUNS_SCORE_THRESHOLD)

            detected           = (suspicious_entropy and (suspicious_chi or suspicious_runs))

            if not detected:
                return []

            entropy_score = (avg_lsb_entropy - WATERMARK_ANALYSIS_PARAMS.LSB_ENTROPY_NORM_BASE) / WATERMARK_ANALYSIS_PARAMS.LSB_ENTROPY_NORM_RANGE
            confidence    = self._calculate_confidence([min(entropy_score, 1.0),
                                                        min(avg_chi_square / WATERMARK_ANALYSIS_PARAMS.CHI_SQUARE_NORM_FACTOR, 1.0),
                                                        avg_runs
                                                       ])

            is_strong     = (confidence >= WATERMARK_ANALYSIS_PARAMS.STRONG_CONFIDENCE_THRESHOLD)
            direction     = (EvidenceDirection.AI_GENERATED if is_strong else EvidenceDirection.INDETERMINATE)
            strength      = (EvidenceStrength.STRONG if is_strong else EvidenceStrength.MODERATE)

            logger.warning(f"Heuristic watermark pattern detected in LSB steganography-domain: (confidence: {confidence:.2f})")

            return [EvidenceResult(source     = EvidenceType.WATERMARK,
                                   finding    = "Statistical patterns consistent with invisible watermarking or steganographic embedding detected",
                                   direction  = direction,
                                   strength   = strength,
                                   confidence = confidence,
                                   details    = {"method"          : "lsb_analysis",
                                                 "note"            : "Heuristic detection; not cryptographic or vendor watermark verification",
                                                 "avg_lsb_entropy" : float(avg_lsb_entropy),
                                                 "avg_chi_square"  : float(avg_chi_square),
                                                 "avg_runs_score"  : float(avg_runs),
                                                 "avg_msb_entropy" : float(np.mean([r["msb_entropy"] for r in channel_results])),
                                                 "channel_results" : channel_results
                                                },
                                   analyzer   = "watermark_analyzer",
                                  )
                   ]

        except Exception as e:
            logger.error(f"Error in LSB steganography detection: {e}", exc_info = True)

        return []


    def _calculate_kurtosis(self, data: np.ndarray) -> float:
        """
        Calculate kurtosis (non-excess fourth standardized moment): measure of distribution tailedness

        Returns 0.0 for empty or constant input
        """
        data_flat = np.asarray(data, dtype = float).ravel()

        if (data_flat.size == 0):
            return 0.0

        std = np.std(data_flat)

        if (std == 0):
            return 0.0

        normalized = (data_flat - np.mean(data_flat)) / std

        return float(np.mean(normalized ** 4))


    def _detect_periodicity(self, *coeffs) -> float:
        """
        Detect periodic patterns in coefficients: grid-based watermarks

        For every 2-D band the autocorrelation is computed with the FFT method (direct 2-D
        correlation is prohibitively slow on full-size sub-bands), normalized by its maximum, and
        the largest value away from the zero-lag sample is taken as the band score

        Arguments:
        ----------
            *coeffs { np.ndarray } : 2-D coefficient arrays

        Returns:
        --------
            { float } : Mean band score; 0.0 when no band has any energy or on any internal error
        """
        # Local import: only this method needs the N-D, FFT-capable correlate
        from scipy.signal import correlate

        try:
            scores = list()

            for coeff in coeffs:
                band     = np.asarray(coeff, dtype = float)
                autocorr = correlate(band, band, mode = 'same', method = 'fft')

                max_val  = np.max(autocorr)

                # No energy in this band: nothing to normalize against
                if (max_val <= 0):
                    continue

                autocorr = autocorr / max_val

                # In 'same' mode the zero-lag sample sits at index s // 2 along every axis
                center           = tuple(s // 2 for s in autocorr.shape)
                autocorr[center] = 0

                scores.append(np.max(autocorr))

            if not scores:
                return 0.0

            return float(np.mean(scores))

        except Exception as e:
            logger.debug(f"Periodicity detection failed: {e}")
            return 0.0


    def _create_radial_mask(self, shape: Tuple[int, int], inner_ratio: float, outer_ratio: float) -> np.ndarray:
        """
        Create radial (ring-shaped) boolean mask for frequency analysis

        Arguments:
        ----------
            shape       { tuple } : (height, width) of the centred spectrum

            inner_ratio { float } : Inclusive inner radius as a fraction of min(height // 2, width // 2)

            outer_ratio { float } : Exclusive outer radius, same unit

        Returns:
        --------
            { np.ndarray } : Boolean array of `shape`, True inside the ring
        """
        h, w               = shape
        center_y, center_x = h // 2, w // 2
        max_radius         = min(center_y, center_x)

        y, x      = np.ogrid[:h, :w]
        distances = np.sqrt((y - center_y) ** 2 + (x - center_x) ** 2)

        return (distances >= inner_ratio * max_radius) & (distances < outer_ratio * max_radius)


    def _check_spectral_symmetry(self, magnitude: np.ndarray) -> float:
        """
        Check for left-right mirror symmetry in a centred frequency spectrum

        Columns are paired around the centre column w // 2 (where fftshift puts the zero frequency):
        column (centre - k) is compared with column (centre + k). The centre column itself, and the
        unpaired first column of an even-width spectrum, are left out

        Arguments:
        ----------
            magnitude { np.ndarray } : 2-D centred magnitude spectrum

        Returns:
        --------
            { float } : Absolute Pearson correlation of the two halves in [0, 1]; 0.0 when it is
                        undefined (no pairs, constant half) or on any internal error
        """
        try:
            h, w     = magnitude.shape
            center_x = w // 2
            n_pairs  = min(center_x, w - 1 - center_x)

            if (n_pairs == 0):
                return 0.0

            left_half  = magnitude[:, center_x - n_pairs:center_x]
            right_half = np.fliplr(magnitude[:, center_x + 1:center_x + 1 + n_pairs])

            # Correlation is undefined for a constant half
            if ((np.std(left_half) == 0) or (np.std(right_half) == 0)):
                return 0.0

            correlation = np.corrcoef(left_half.flatten(), right_half.flatten())[0, 1]

            return float(abs(correlation)) if not np.isnan(correlation) else 0.0

        except Exception as e:
            logger.debug(f"Spectral symmetry check failed: {e}")
            return 0.0


    def _shannon_entropy(self, data: np.ndarray) -> float:
        """
        Calculate Shannon entropy (base 2) of the value distribution in `data`; 0.0 for empty input
        """
        flat = np.asarray(data).ravel()

        if (flat.size == 0):
            return 0.0

        _, counts     = np.unique(flat, return_counts = True)
        probabilities = counts / counts.sum()

        return float(entropy(probabilities, base = 2))


    def _chi_square_test(self, data: np.ndarray) -> float:
        """
        Chi-square statistic against a uniform distribution over the values that occur in `data`

        Returns 0.0 for empty input
        """
        flat = np.asarray(data).ravel()

        if (flat.size == 0):
            return 0.0

        _, counts  = np.unique(flat, return_counts = True)
        expected   = flat.size / counts.size
        chi_square = np.sum((counts - expected) ** 2 / expected)

        return float(chi_square)


    def _runs_test(self, data: np.ndarray) -> float:
        """
        Runs test for randomness: normalized score

        Values are split into "above" / "not above" the median and the number of transitions between
        neighbours is divided by half the sample count, capped at 1.0. When nothing lies strictly
        above the median (e.g. a bit plane with a majority of ones) the split uses ">= median"
        instead, so a bit pattern and its complement receive the same score

        Returns 0.0 for empty input
        """
        flat = np.asarray(data).ravel()

        if (flat.size == 0):
            return 0.0

        median = np.median(flat)
        above  = flat > median

        if not above.any():
            above = flat >= median

        runs          = np.sum(np.abs(np.diff(above.astype(int))))
        expected_runs = flat.size / 2

        return float(min(runs / expected_runs, 1.0))


    def _calculate_confidence(self, scores: List[float]) -> float:
        """
        Calculate overall confidence from multiple scores

        Arguments:
        ----------
            scores { list } : Partial scores; entries that are not real numbers (Python or NumPy
                              scalars) or that are NaN are ignored

        Returns:
        --------
            { float } : Mean of the usable scores clipped to [0, WATERMARK_ANALYSIS_PARAMS.CONFIDENCE_CAP];
                        0.0 when no score is usable
        """
        numeric_types = (int, float, np.integer, np.floating)
        valid_scores  = [score for score in scores if (isinstance(score, numeric_types) and (not np.isnan(score)))]

        if not valid_scores:
            return 0.0

        confidence = np.mean(valid_scores)

        return float(min(max(confidence, 0.0), WATERMARK_ANALYSIS_PARAMS.CONFIDENCE_CAP))
features/batch_processor.py CHANGED
@@ -1,20 +1,25 @@
1
  # Dependencies
2
  import time
 
3
  from typing import List
4
  from typing import Dict
5
  from typing import Tuple
6
  from pathlib import Path
7
  from typing import Callable
 
8
  from utils.logger import get_logger
9
  from config.settings import settings
10
  from config.schemas import AnalysisResult
 
11
  from concurrent.futures import TimeoutError
12
  from concurrent.futures import as_completed
13
  from config.constants import DetectionStatus
14
  from config.schemas import BatchAnalysisResult
15
- from metrics.aggregator import MetricsAggregator
16
  from concurrent.futures import ThreadPoolExecutor
 
17
  from features.threshold_manager import ThresholdManager
 
 
18
 
19
 
20
  # Setup Logging
@@ -37,18 +42,24 @@ class BatchProcessor:
37
  Initialize Batch Processor
38
  """
39
  # Instantiate threshold manager
40
- self.threshold_manager = threshold_manager
41
 
42
- # Initialize aggregator
43
- self.aggregator = MetricsAggregator(threshold_manager = threshold_manager)
 
 
 
 
 
 
44
 
45
  # Fix number of workers
46
- self.max_workers = settings.MAX_WORKERS if settings.PARALLEL_PROCESSING else 1
47
 
48
  logger.info(f"BatchProcessor initialized with max_workers={self.max_workers}, parallel={settings.PARALLEL_PROCESSING}")
49
 
50
 
51
- def process_batch(self, image_files: List[Dict[str, any]], on_progress: Callable[[int, int, str], None] | None = None) -> BatchAnalysisResult:
52
  """
53
  Process multiple images with automatic parallel/sequential switching
54
 
@@ -231,17 +242,26 @@ class BatchProcessor:
231
  { AnalysisResult } : Analysis result or None on error
232
  """
233
  try:
234
- return self.aggregator.analyze_image(image_path = image_path,
235
- filename = filename,
236
- image_size = image_size,
237
- )
 
 
 
 
 
 
 
 
 
238
 
239
  except Exception as e:
240
  logger.error(f"Failed to process {filename}: {e}", exc_info = True)
241
  return None
242
 
243
 
244
- def _calculate_summary(self, results: List[AnalysisResult], total: int) -> Dict[str, int]:
245
  """
246
  Calculate summary statistics from results
247
 
@@ -256,28 +276,38 @@ class BatchProcessor:
256
  { dict } : Summary statistics
257
  """
258
  # Calculate processing stats
259
- likely_authentic = sum(1 for r in results if (r.status == DetectionStatus.LIKELY_AUTHENTIC))
260
- review_required = sum(1 for r in results if (r.status == DetectionStatus.REVIEW_REQUIRED))
 
 
 
 
261
 
262
- processed = len(results)
263
- failed = total - processed
264
- success_rate = int((processed / total * 100) if (total > 0) else 0)
265
-
266
  # Calculate average scores
267
- avg_score = sum(r.overall_score for r in results) / len(results) if results else 0.0
268
- avg_confidence = sum(r.confidence for r in results) / len(results) if results else 0
269
- avg_proc_time = sum(r.processing_time for r in results) / len(results) if results else 0.0
270
 
271
- return {"likely_authentic" : likely_authentic,
272
- "review_required" : review_required,
273
- "success_rate" : success_rate,
274
- "processed" : processed,
275
- "failed" : failed,
276
- "avg_score" : round(avg_score, 3),
277
- "avg_confidence" : int(avg_confidence),
278
- "avg_proc_time" : round(avg_proc_time, 2),
279
- }
280
-
 
 
 
 
 
 
 
 
 
 
281
 
282
  def _create_empty_batch_result(self) -> BatchAnalysisResult:
283
  """
@@ -291,9 +321,11 @@ class BatchProcessor:
291
  processed = 0,
292
  failed = 0,
293
  results = [],
294
- summary = {"likely_authentic" : 0,
295
- "review_required" : 0,
296
- "success_rate" : 0,
 
 
297
  },
298
  total_processing_time = 0.0,
299
  )
 
1
  # Dependencies
2
  import time
3
+ from typing import Any
4
  from typing import List
5
  from typing import Dict
6
  from typing import Tuple
7
  from pathlib import Path
8
  from typing import Callable
9
+ from collections import Counter
10
  from utils.logger import get_logger
11
  from config.settings import settings
12
  from config.schemas import AnalysisResult
13
+ from config.constants import FinalDecision
14
  from concurrent.futures import TimeoutError
15
  from concurrent.futures import as_completed
16
  from config.constants import DetectionStatus
17
  from config.schemas import BatchAnalysisResult
 
18
  from concurrent.futures import ThreadPoolExecutor
19
+ from metrics.signal_aggregator import SignalAggregator
20
  from features.threshold_manager import ThresholdManager
21
+ from decision_builders.decision_policy import DecisionPolicy
22
+ from evidence_analyzers.evidence_aggregator import EvidenceAggregator
23
 
24
 
25
  # Setup Logging
 
42
  Initialize Batch Processor
43
  """
44
  # Store the threshold manager passed in by the caller
45
+ self.threshold_manager = threshold_manager
46
 
47
+ # Initialize signal aggregator
48
+ self.aggregator = SignalAggregator(threshold_manager = threshold_manager)
49
+
50
+ # Initialize evidence-based aggregator
51
+ self.evidence_aggregator = EvidenceAggregator()
52
+
53
+ # Initialize decision-policy engine
54
+ self.decision_policy = DecisionPolicy()
55
 
56
  # Set number of workers (1 when parallel processing is disabled)
57
+ self.max_workers = settings.MAX_WORKERS if settings.PARALLEL_PROCESSING else 1
58
 
59
  logger.info(f"BatchProcessor initialized with max_workers={self.max_workers}, parallel={settings.PARALLEL_PROCESSING}")
60
 
61
 
62
+ def process_batch(self, image_files: List[Dict[str, Any]], on_progress: Callable[[int, int, str], None] | None = None) -> BatchAnalysisResult:
63
  """
64
  Process multiple images with automatic parallel/sequential switching
65
 
 
242
  { AnalysisResult } : Analysis result or None on error
243
  """
244
  try:
245
+ # Tier-1 Signal
246
+ analysis = self.aggregator.analyze_image(image_path = image_path,
247
+ filename = filename,
248
+ image_size = image_size,
249
+ )
250
+
251
+ # Tier-2 evidence
252
+ analysis.evidence = self.evidence_aggregator.analyze(image_path = image_path)
253
+
254
+ # Final decision
255
+ final_analysis_result = self.decision_policy.apply(analysis = analysis)
256
+
257
+ return final_analysis_result
258
 
259
  except Exception as e:
260
  logger.error(f"Failed to process {filename}: {e}", exc_info = True)
261
  return None
262
 
263
 
264
+ def _calculate_summary(self, results: List[AnalysisResult], total: int) -> Dict[str, Any]:
265
  """
266
  Calculate summary statistics from results
267
 
 
276
  { dict } : Summary statistics
277
  """
278
  # Calculate processing stats
279
+ processed = len(results)
280
+ failed = total - processed
281
+ success_rate = int((processed / total * 100) if total > 0 else 0)
282
+
283
+ # Count final decisions safely
284
+ decision_counts = Counter(result.final_decision.value for result in results)
285
 
 
 
 
 
286
  # Calculate average scores
287
+ avg_score = sum(r.overall_score for r in results) / processed if results else 0.0
288
+ avg_confidence = sum(r.confidence for r in results) / processed if results else 0
289
+ avg_proc_time = sum(r.processing_time for r in results) / processed if results else 0.0
290
 
291
+ # Final decision distribution
292
+ decision_distribution = {FinalDecision.CONFIRMED_AI_GENERATED.value : decision_counts.get(FinalDecision.CONFIRMED_AI_GENERATED.value, 0),
293
+ FinalDecision.SUSPICIOUS_AI_LIKELY.value : decision_counts.get(FinalDecision.SUSPICIOUS_AI_LIKELY.value, 0),
294
+ FinalDecision.AUTHENTIC_BUT_REVIEW.value : decision_counts.get(FinalDecision.AUTHENTIC_BUT_REVIEW.value, 0),
295
+ FinalDecision.MOSTLY_AUTHENTIC.value : decision_counts.get(FinalDecision.MOSTLY_AUTHENTIC.value, 0),
296
+ }
297
+
298
+ summary = {"processed" : processed,
299
+ "failed" : failed,
300
+ "success_rate" : success_rate,
301
+ "avg_score" : round(avg_score, 3),
302
+ "avg_confidence" : int(avg_confidence),
303
+ "avg_proc_time" : round(avg_proc_time, 2),
304
+ }
305
+
306
+ # Update summary dict with decision_distribution dict
307
+ summary.update(decision_distribution)
308
+
309
+ return summary
310
+
311
 
312
  def _create_empty_batch_result(self) -> BatchAnalysisResult:
313
  """
 
321
  processed = 0,
322
  failed = 0,
323
  results = [],
324
+ summary = {FinalDecision.CONFIRMED_AI_GENERATED.value : 0,
325
+ FinalDecision.SUSPICIOUS_AI_LIKELY.value : 0,
326
+ FinalDecision.AUTHENTIC_BUT_REVIEW.value : 0,
327
+ FinalDecision.MOSTLY_AUTHENTIC.value : 0,
328
+ "success_rate" : 0,
329
  },
330
  total_processing_time = 0.0,
331
  )
features/detailed_result_maker.py CHANGED
@@ -2,11 +2,16 @@
2
  import pandas as pd
3
  from typing import Dict
4
  from typing import List
5
- from typing import Optional
6
  from utils.logger import get_logger
7
  from config.constants import MetricType
 
 
8
  from config.constants import SignalStatus
9
  from config.schemas import AnalysisResult
 
 
 
 
10
  from config.constants import SIGNAL_THRESHOLDS
11
 
12
 
@@ -16,40 +21,59 @@ logger = get_logger(__name__)
16
 
17
  class DetailedResultMaker:
18
  """
19
- Extract and format detailed analysis results for UI and reporting
20
 
21
  Purpose:
22
  --------
23
- - Extracts all intermediate metrics from MetricResult objects
24
- - Formats data for tabular display in UI
25
- - Provides rich metadata for PDF/CSV reports
26
- - No re-computation - just data extraction and formatting
 
 
 
 
 
 
 
27
 
28
  Output Formats:
29
  ---------------
30
- 1. Structured dictionaries for UI
31
- 2. Pandas DataFrames for reports
32
- 3. Hierarchical JSON for API
33
  """
34
  def __init__(self, signal_thresholds: dict | None = None):
35
  """
36
  Initialize Detailed Result Maker
37
  """
38
- self.metric_display_names = {MetricType.GRADIENT : "Gradient-Field PCA",
39
- MetricType.FREQUENCY : "Frequency Domain (FFT)",
40
- MetricType.NOISE : "Noise Pattern Analysis",
41
- MetricType.TEXTURE : "Texture Statistics",
42
- MetricType.COLOR : "Color Distribution",
43
- }
 
 
 
 
 
 
 
 
 
 
44
 
45
- self.signal_thresholds = signal_thresholds or SIGNAL_THRESHOLDS
46
 
47
  logger.debug("DetailedResultMaker initialized")
48
 
49
 
50
  def extract_detailed_results(self, analysis_result: AnalysisResult) -> Dict:
51
  """
52
- Extract all detailed results from AnalysisResult
 
 
53
 
54
  Arguments:
55
  ----------
@@ -57,24 +81,35 @@ class DetailedResultMaker:
57
 
58
  Returns:
59
  --------
60
- { dict } : Comprehensive detailed results
 
 
 
 
 
 
61
  """
62
  logger.debug(f"Extracting detailed results for: {analysis_result.filename}")
63
 
64
- detailed = {"filename" : analysis_result.filename,
65
- "overall_summary" : self._extract_overall_summary(analysis_result = analysis_result),
66
- "metrics_detailed" : self._extract_all_metrics(analysis_result = analysis_result),
67
- "metadata" : self._extract_metadata(analysis_result = analysis_result),
 
 
 
68
  }
69
 
70
- logger.debug(f"Extracted {len(detailed['metrics_detailed'])} metric details")
71
 
72
  return detailed
73
 
74
 
75
  def create_detailed_table(self, analysis_result: AnalysisResult) -> pd.DataFrame:
76
  """
77
- Create detailed metrics table as DataFrame
 
 
78
 
79
  Arguments:
80
  ----------
@@ -82,58 +117,204 @@ class DetailedResultMaker:
82
 
83
  Returns:
84
  --------
85
- { DataFrame } : Tabular detailed results
86
  """
87
  rows = list()
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  for metric_type, metric_result in analysis_result.metric_results.items():
90
  display_name = self.metric_display_names.get(metric_type, metric_type.value)
91
 
92
- row = {"Metric" : display_name,
93
- "Score" : round(metric_result.score, 3),
94
- "Confidence" : round(metric_result.confidence, 3) if metric_result.confidence is not None else "N/A",
95
- "Status" : self._score_to_status(score = metric_result.score),
 
96
  }
97
 
98
- # Add key details from each metric
99
  details = self._extract_key_details(metric_type = metric_type,
100
  metric_result = metric_result,
101
  )
 
102
 
103
- row.update(details)
104
- rows.append(row)
105
 
106
- # Dump rows into a pandas dataframe for structured result
107
- dataframe = pd.DataFrame(data = rows)
108
 
109
- logger.debug(f"Created detailed table with {len(dataframe)} rows, {len(dataframe.columns)} columns")
110
 
111
  return dataframe
112
-
113
 
114
- def create_report_data(self, analysis_result: AnalysisResult) -> Dict:
 
 
 
115
  """
116
- Create rich data structure for report generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- Arguments:
119
- ----------
120
- analysis_result { AnalysisResult } : Complete analysis result
121
 
122
- Returns:
123
- --------
124
- { dict } : Report-ready data structure
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  """
126
- report_data = {"header" : self._create_report_header(analysis_result = analysis_result),
127
- "overall_assessment" : self._create_overall_assessment(analysis_result = analysis_result),
128
- "metric_breakdown" : self._create_metric_breakdown(analysis_result = analysis_result),
129
- "forensic_details" : self._create_forensic_details(analysis_result = analysis_result),
130
- "recommendations" : self._create_recommendations(analysis_result = analysis_result),
131
- }
132
 
133
- logger.debug(f"Created report data for: {analysis_result.filename}")
134
 
135
- return report_data
136
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  def _extract_overall_summary(self, analysis_result: AnalysisResult) -> Dict:
139
  """
@@ -162,34 +343,54 @@ class DetailedResultMaker:
162
  "display_name" : self.metric_display_names.get(metric_type, metric_type.value),
163
  "score" : round(metric_result.score, 3),
164
  "confidence" : round(metric_result.confidence, 3) if metric_result.confidence is not None else None,
165
- "status" : self._score_to_status(score = metric_result.score),
166
  "details" : metric_result.details or {},
167
- "interpretation" : self._interpret_metric(metric_type = metric_type,
168
- metric_result = metric_result,
169
- ),
170
  }
171
-
172
  metrics_detailed.append(metric_detail)
173
 
174
  # Sort by score (highest first)
175
- metrics_detailed.sort(key = lambda x: x['score'], reverse = True)
 
 
176
 
177
  return metrics_detailed
178
 
179
 
180
  def _extract_metadata(self, analysis_result: AnalysisResult) -> Dict:
181
  """
182
- Extract processing metadata
183
  """
184
- return {"total_metrics" : len(analysis_result.metric_results),
185
- "flagged_metrics" : sum(1 for s in analysis_result.signals if s.status.value == 'flagged'),
186
- "warning_metrics" : sum(1 for s in analysis_result.signals if s.status.value == 'warning'),
187
- "passed_metrics" : sum(1 for s in analysis_result.signals if s.status.value == 'passed'),
188
- "avg_confidence" : self._calculate_avg_confidence(analysis_result = analysis_result),
189
- }
190
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
- def _extract_key_details(self, metric_type: MetricType, metric_result) -> Dict:
193
  """
194
  Extract key details specific to each metric type
195
  """
@@ -201,25 +402,26 @@ class DetailedResultMaker:
201
  }
202
 
203
  elif (metric_type == MetricType.FREQUENCY):
204
- return {"HF_Ratio" : details.get('hf_ratio', 'N/A'),
205
- "HF_Anomaly" : details.get('hf_anomaly', 'N/A'),
206
- "Spectrum_Bins" : details.get('spectrum_bins', 'N/A'),
207
  }
208
 
209
  elif (metric_type == MetricType.NOISE):
210
- return {"Mean_Noise" : details.get('mean_noise', 'N/A'),
211
- "CV" : details.get('cv', 'N/A'),
212
- "Patches_Valid" : details.get('patches_valid', 'N/A'),
213
  }
214
 
215
  elif (metric_type == MetricType.TEXTURE):
216
- return {"Smooth_Ratio" : details.get('smooth_ratio', 'N/A'),
217
- "Contrast_Mean" : details.get('contrast_mean', 'N/A'),
218
- "Patches_Used" : details.get('patches_used', 'N/A'),
219
  }
220
 
221
  elif (metric_type == MetricType.COLOR):
222
  sat_stats = details.get('saturation_stats', {})
 
223
  return {"Mean_Saturation" : sat_stats.get('mean_saturation', 'N/A'),
224
  "High_Sat_Ratio" : sat_stats.get('high_sat_ratio', 'N/A'),
225
  }
@@ -227,42 +429,41 @@ class DetailedResultMaker:
227
  return {}
228
 
229
 
230
- def _interpret_metric(self, metric_type: MetricType, metric_result) -> str:
231
  """
232
  Provide human-readable interpretation of metric result
233
  """
234
- score = metric_result.score
235
  details = metric_result.details or {}
236
 
237
  if (metric_type == MetricType.GRADIENT):
238
  eig_ratio = details.get('eigenvalue_ratio')
239
 
240
  if eig_ratio:
241
- return f"Eigenvalue ratio of {eig_ratio:.3f} ({'high' if eig_ratio > 0.85 else 'low'} alignment)"
242
 
243
  return "Gradient structure analysis"
244
 
245
- elif (metric_type == MetricType.FREQUENCY):
246
  hf_ratio = details.get('hf_ratio')
247
-
248
  if hf_ratio:
249
- return f"High-freq ratio: {hf_ratio:.3f} ({'elevated' if hf_ratio > 0.35 else 'low' if hf_ratio < 0.08 else 'normal'})"
250
 
251
  return "Frequency spectrum analysis"
252
 
253
  elif (metric_type == MetricType.NOISE):
254
  mean_noise = details.get('mean_noise')
255
-
256
  if mean_noise:
257
- return f"Mean noise: {mean_noise:.2f} ({'low' if mean_noise < 1.5 else 'normal'})"
258
 
259
  return "Noise pattern analysis"
260
 
261
  elif (metric_type == MetricType.TEXTURE):
262
  smooth_ratio = details.get('smooth_ratio')
263
-
264
  if smooth_ratio is not None:
265
- return f"Smooth regions: {smooth_ratio:.1%} ({'excessive' if smooth_ratio > 0.4 else 'normal'})"
266
 
267
  return "Texture variation analysis"
268
 
@@ -271,161 +472,23 @@ class DetailedResultMaker:
271
  mean_sat = sat_stats.get('mean_saturation')
272
 
273
  if mean_sat:
274
- return f"Mean saturation: {mean_sat:.2f} ({'high' if mean_sat > 0.65 else 'normal'})"
275
 
276
  return "Color distribution analysis"
277
 
278
  return "Analysis complete"
279
-
280
-
281
- def _create_report_header(self, analysis_result: AnalysisResult) -> Dict:
282
- """
283
- Create report header section
284
- """
285
- return {"filename" : analysis_result.filename,
286
- "analysis_date" : analysis_result.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
287
- "image_size" : f"{analysis_result.image_size[0]} × {analysis_result.image_size[1]} pixels",
288
- "processing_time" : f"{analysis_result.processing_time:.2f} seconds",
289
- }
290
-
291
-
292
- def _create_overall_assessment(self, analysis_result: AnalysisResult) -> Dict:
293
- """
294
- Create overall assessment section
295
- """
296
- return {"status" : analysis_result.status.value,
297
- "score" : round(analysis_result.overall_score * 100, 1),
298
- "confidence" : analysis_result.confidence,
299
- "verdict" : "REVIEW REQUIRED" if analysis_result.status.value == "REVIEW_REQUIRED" else "LIKELY AUTHENTIC",
300
- "risk_level" : self._calculate_risk_level(score = analysis_result.overall_score),
301
- }
302
 
303
 
304
- def _create_metric_breakdown(self, analysis_result: AnalysisResult) -> List[Dict]:
305
  """
306
- Create detailed metric breakdown for report
307
- """
308
- breakdown = list()
309
-
310
- for signal in analysis_result.signals:
311
- metric_result = analysis_result.metric_results.get(signal.metric_type)
312
-
313
- item = {"metric" : signal.name,
314
- "score" : f"{signal.score * 100:.1f}%",
315
- "status" : signal.status.value.upper(),
316
- "confidence" : f"{metric_result.confidence * 100:.1f}%" if metric_result.confidence else "N/A",
317
- "explanation" : signal.explanation,
318
- "key_findings" : self.extract_key_findings(metric_type = signal.metric_type,
319
- metric_result = metric_result,
320
- ),
321
- }
322
-
323
- breakdown.append(item)
324
-
325
- return breakdown
326
-
327
-
328
- def _create_forensic_details(self, analysis_result: AnalysisResult) -> Dict:
329
- """
330
- Create forensic details section
331
- """
332
- forensic = dict()
333
-
334
- for metric_type, metric_result in analysis_result.metric_results.items():
335
- metric_name = self.metric_display_names.get(metric_type, metric_type.value)
336
- forensic[metric_name] = metric_result.details or {"note": "No detailed forensics available"}
337
-
338
- return forensic
339
-
340
-
341
- def _create_recommendations(self, analysis_result: AnalysisResult) -> Dict:
342
- """
343
- Create recommendations section
344
- """
345
- score = analysis_result.overall_score
346
-
347
- if (score >= 0.85):
348
- return {"action" : "Immediate manual verification required",
349
- "priority" : "HIGH",
350
- "next_steps" : ["Forensic analysis", "Reverse image search", "Metadata inspection", "Expert review"],
351
- "confidence" : "Very high likelihood of AI generation",
352
- }
353
-
354
- elif (score >= 0.70):
355
- return {"action" : "Manual verification recommended",
356
- "priority" : "MEDIUM",
357
- "next_steps" : ["Visual inspection", "Compare with authentic samples", "Check source provenance"],
358
- "confidence" : "High likelihood of AI generation",
359
- }
360
-
361
- elif (score >= 0.50):
362
- return {"action" : "Optional review suggested",
363
- "priority" : "LOW",
364
- "next_steps" : ["May be edited photo", "Verify image source", "Check for inconsistencies"],
365
- "confidence" : "Moderate indicators present",
366
- }
367
-
368
- else:
369
- return {"action" : "No immediate action required",
370
- "priority" : "NONE",
371
- "next_steps" : ["Proceed with normal workflow"],
372
- "confidence" : "Low likelihood of AI generation",
373
- }
374
-
375
-
376
- def _score_to_status(self, score: float) -> str:
377
- """
378
- Convert score to status label
379
- """
380
- if (score >= self.signal_thresholds[SignalStatus.FLAGGED]):
381
- return "FLAGGED"
382
-
383
- elif (score >= self.signal_thresholds[SignalStatus.WARNING]):
384
- return "WARNING"
385
-
386
- else:
387
- return "PASSED"
388
-
389
-
390
- def _calculate_avg_confidence(self, analysis_result: AnalysisResult) -> float:
391
- """
392
- Calculate average confidence across all metrics
393
- """
394
- confidences = [mr.confidence for mr in analysis_result.metric_results.values() if mr.confidence is not None]
395
-
396
- return round(sum(confidences) / len(confidences), 3) if confidences else 0.0
397
-
398
-
399
- def _calculate_risk_level(self, score: float) -> str:
400
- """
401
- Calculate risk level from score
402
- """
403
- if (score >= 0.85):
404
- return "CRITICAL"
405
-
406
- elif (score >= 0.70):
407
- return "HIGH"
408
-
409
- elif (score >= 0.50):
410
- return "MEDIUM"
411
-
412
- else:
413
- return "LOW"
414
-
415
-
416
- def extract_key_findings(self, metric_type: MetricType, metric_result) -> List[str]:
417
- """
418
- Extract human-readable key forensic findings for a given metric used by:
419
- - Detailed UI views
420
- - CSV reports
421
- - JSON reports
422
  """
423
  findings = list()
424
  details = metric_result.details or {}
425
 
426
  if (metric_type == MetricType.GRADIENT):
427
  eig_ratio = details.get('eigenvalue_ratio')
428
-
429
  if eig_ratio:
430
  findings.append(f"Eigenvalue ratio: {eig_ratio:.3f}")
431
 
@@ -436,17 +499,18 @@ class DetailedResultMaker:
436
 
437
  elif (metric_type == MetricType.FREQUENCY):
438
  hf_ratio = details.get('hf_ratio')
439
-
440
  if hf_ratio:
441
  findings.append(f"High-frequency ratio: {hf_ratio:.3f}")
442
 
443
  roughness = details.get('roughness')
 
444
  if roughness:
445
  findings.append(f"Spectral roughness: {roughness:.3f}")
446
 
447
  elif (metric_type == MetricType.NOISE):
448
  mean_noise = details.get('mean_noise')
449
-
450
  if mean_noise:
451
  findings.append(f"Mean noise level: {mean_noise:.2f}")
452
 
@@ -457,7 +521,7 @@ class DetailedResultMaker:
457
 
458
  elif (metric_type == MetricType.TEXTURE):
459
  smooth_ratio = details.get('smooth_ratio')
460
-
461
  if smooth_ratio:
462
  findings.append(f"Smooth patches: {smooth_ratio:.1%}")
463
 
@@ -478,4 +542,27 @@ class DetailedResultMaker:
478
  if high_sat:
479
  findings.append(f"High saturation pixels: {high_sat:.1%}")
480
 
481
- return findings if findings else ["Analysis complete"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  from typing import Dict
4
  from typing import List
 
5
  from utils.logger import get_logger
6
  from config.constants import MetricType
7
+ from config.schemas import MetricResult
8
+ from config.constants import EvidenceType
9
  from config.constants import SignalStatus
10
  from config.schemas import AnalysisResult
11
+ from config.schemas import EvidenceResult
12
+ from config.constants import FinalDecision
13
+ from config.constants import EvidenceStrength
14
+ from config.constants import EvidenceDirection
15
  from config.constants import SIGNAL_THRESHOLDS
16
 
17
 
 
21
 
22
  class DetailedResultMaker:
23
  """
24
+ Extract and format detailed analysis results for reporting
25
 
26
  Purpose:
27
  --------
28
+ - Extracts ALL data from AnalysisResult (metrics + evidence + decision)
29
+ - Formats data into unified dictionaries/DataFrames
30
+ - Provides structured data for reporters (JSON/CSV/PDF)
31
+ - NO re-computation - pure data extraction and formatting
32
+
33
+ Data Sources:
34
+ -------------
35
+ 1. Final Decision (from DecisionPolicy)
36
+ 2. Evidence Results (from EvidenceAggregator)
37
+ 3. Metric Results (from SignalAggregator)
38
+ 4. Metadata (timestamps, processing info)
39
 
40
  Output Formats:
41
  ---------------
42
+ 1. Structured dictionaries for reporters
43
+ 2. Pandas DataFrames for tabular reports
44
+ 3. Hierarchical JSON-ready structures
45
  """
46
  def __init__(self, signal_thresholds: dict | None = None):
47
  """
48
  Initialize Detailed Result Maker
49
  """
50
+ self.metric_display_names = {MetricType.GRADIENT : "Gradient-Field PCA",
51
+ MetricType.FREQUENCY : "Frequency Domain (FFT)",
52
+ MetricType.NOISE : "Noise Pattern Analysis",
53
+ MetricType.TEXTURE : "Texture Statistics",
54
+ MetricType.COLOR : "Color Distribution",
55
+ }
56
+
57
+ self.evidence_display_names = {EvidenceType.EXIF : "EXIF Metadata",
58
+ EvidenceType.WATERMARK : "Watermark Detection",
59
+ }
60
+
61
+ self.decision_labels = {FinalDecision.CONFIRMED_AI_GENERATED : "🔴 CONFIRMED AI GENERATED",
62
+ FinalDecision.SUSPICIOUS_AI_LIKELY : "🟠 SUSPICIOUS - AI LIKELY",
63
+ FinalDecision.AUTHENTIC_BUT_REVIEW : "🟡 AUTHENTIC BUT REVIEW",
64
+ FinalDecision.MOSTLY_AUTHENTIC : "🟢 MOSTLY AUTHENTIC",
65
+ }
66
 
67
+ self.signal_thresholds = signal_thresholds or SIGNAL_THRESHOLDS
68
 
69
  logger.debug("DetailedResultMaker initialized")
70
 
71
 
72
  def extract_detailed_results(self, analysis_result: AnalysisResult) -> Dict:
73
  """
74
+ Extract ALL detailed results from AnalysisResult into unified dictionary
75
+
76
+ This is the MAIN extraction method - reporters call this!
77
 
78
  Arguments:
79
  ----------
 
81
 
82
  Returns:
83
  --------
84
+ { dict } : Comprehensive detailed results containing:
85
+ - final_decision (from DecisionPolicy)
86
+ - evidence_summary (from EvidenceAggregator)
87
+ - evidence_detailed (all evidence items)
88
+ - overall_summary (basic info)
89
+ - metrics_detailed (all metric results)
90
+ - metadata (stats and counts)
91
  """
92
  logger.debug(f"Extracting detailed results for: {analysis_result.filename}")
93
 
94
+ detailed = {"filename" : analysis_result.filename,
95
+ "final_decision" : self._extract_final_decision(analysis_result),
96
+ "evidence_summary" : self._extract_evidence_summary(analysis_result),
97
+ "evidence_detailed" : self._extract_all_evidence(analysis_result),
98
+ "overall_summary" : self._extract_overall_summary(analysis_result),
99
+ "metrics_detailed" : self._extract_all_metrics(analysis_result),
100
+ "metadata" : self._extract_metadata(analysis_result),
101
  }
102
 
103
+ logger.debug(f"Extracted {len(detailed['evidence_detailed'])} evidence items, {len(detailed['metrics_detailed'])} metric details")
104
 
105
  return detailed
106
 
107
 
108
  def create_detailed_table(self, analysis_result: AnalysisResult) -> pd.DataFrame:
109
  """
110
+ Create detailed table as DataFrame (for CSV export)
111
+
112
+ Includes: Decision + Evidence + Metrics in hierarchical order
113
 
114
  Arguments:
115
  ----------
 
117
 
118
  Returns:
119
  --------
120
+ { DataFrame } : Tabular detailed results
121
  """
122
  rows = list()
123
 
124
+ # Final Decision (if available)
125
+ if analysis_result.final_decision:
126
+ decision_row = {"Type" : "FINAL DECISION",
127
+ "Name" : self.decision_labels.get(analysis_result.final_decision, analysis_result.final_decision.value),
128
+ "Score" : "N/A",
129
+ "Confidence" : f"{analysis_result.confidence}%",
130
+ "Status" : analysis_result.final_decision.value.upper(),
131
+ "Explanation" : analysis_result.decision_explanation or "See evidence and metrics below",
132
+ }
133
+
134
+ rows.append(decision_row)
135
+
136
+ # Evidence (if any)
137
+ if analysis_result.evidence:
138
+ for evidence in analysis_result.evidence:
139
+ source_key = evidence.source.value if hasattr(evidence.source, "value") else str(evidence.source)
140
+
141
+ evidence_row = {"Type" : "EVIDENCE",
142
+ "Name" : f"{self.evidence_display_names.get(source_key, source_key)} - {evidence.analyzer}",
143
+ "Score" : f"{evidence.confidence:.2f}" if evidence.confidence is not None else "N/A",
144
+ "Confidence" : f"{int(evidence.confidence * 100)}%" if evidence.confidence is not None else "N/A",
145
+ "Status" : self._evidence_to_status_label(evidence),
146
+ "Explanation" : evidence.finding,
147
+ }
148
+
149
+ rows.append(evidence_row)
150
+
151
+ # Metrics
152
  for metric_type, metric_result in analysis_result.metric_results.items():
153
  display_name = self.metric_display_names.get(metric_type, metric_type.value)
154
 
155
+ metric_row = {"Type" : "METRIC",
156
+ "Name" : display_name,
157
+ "Score" : round(metric_result.score, 3),
158
+ "Confidence" : f"{round(metric_result.confidence * 100)}%" if metric_result.confidence is not None else "N/A",
159
+ "Status" : self._score_to_status(metric_result.score),
160
  }
161
 
162
+ # Add key details
163
  details = self._extract_key_details(metric_type = metric_type,
164
  metric_result = metric_result,
165
  )
166
+ metric_row.update(details)
167
 
168
+ rows.append(metric_row)
 
169
 
170
+ dataframe = pd.DataFrame(data=rows)
 
171
 
172
+ logger.debug(f"Created detailed table with {len(dataframe)} rows")
173
 
174
  return dataframe
 
175
 
176
+
177
+ def _extract_final_decision(self, analysis_result: AnalysisResult) -> Dict:
178
+ """
179
+ Extract final decision information from DecisionPolicy
180
  """
181
+ if not analysis_result.final_decision:
182
+ return {"decision" : None,
183
+ "label" : "⚪ No Decision",
184
+ "explanation" : "Decision policy not applied",
185
+ "confidence" : 0,
186
+ "based_on" : "Unknown",
187
+ }
188
+
189
+ final_decision = {"decision" : analysis_result.final_decision.value,
190
+ "label" : self.decision_labels.get(analysis_result.final_decision, analysis_result.final_decision.value),
191
+ "explanation" : analysis_result.decision_explanation or "No explanation provided",
192
+ "confidence" : analysis_result.confidence,
193
+ "based_on" : self._determine_decision_basis(analysis_result),
194
+ }
195
+
196
+ return final_decision
197
+
198
+
199
+ def _determine_decision_basis(self, analysis_result: AnalysisResult) -> str:
200
+ """
201
+ Determine what the decision was based on
202
+ """
203
+ if not analysis_result.evidence:
204
+ return "Statistical metrics only"
205
 
206
+ # Check for strong evidence
207
+ strong_evidence = [item for item in analysis_result.evidence if item.strength in (EvidenceStrength.STRONG, EvidenceStrength.CONCLUSIVE)]
 
208
 
209
+ if strong_evidence:
210
+ evidence_types = {item.source.value if hasattr(item.source, "value") else str(item.source) for item in strong_evidence}
211
+ return f"Strong evidence (Tier 2): {', '.join(evidence_types)}"
212
+
213
+ return "Combination of evidence and metrics (Tier 2 + Tier 1)"
214
+
215
+
216
+ def _extract_evidence_summary(self, analysis_result: AnalysisResult) -> Dict:
217
+ """
218
+ Extract high-level evidence summary
219
+ """
220
+ if not analysis_result.evidence:
221
+ return {"total_evidence" : 0,
222
+ "ai_evidence_count" : 0,
223
+ "auth_evidence_count" : 0,
224
+ "strongest_evidence" : None,
225
+ }
226
+
227
+ ai_evidence = [item for item in analysis_result.evidence if (item.direction == EvidenceDirection.AI_GENERATED)]
228
+ auth_evidence = [item for item in analysis_result.evidence if (item.direction == EvidenceDirection.AUTHENTIC)]
229
+
230
+ # Find strongest evidence
231
+ strongest = max(analysis_result.evidence,
232
+ key = lambda item: (self._strength_to_rank(item.strength), item.confidence or 0.0)
233
+ )
234
+
235
+ return {"total_evidence" : len(analysis_result.evidence),
236
+ "ai_evidence_count" : len(ai_evidence),
237
+ "auth_evidence_count" : len(auth_evidence),
238
+ "strongest_evidence" : {"source" : strongest.source.value,
239
+ "direction" : strongest.direction.value,
240
+ "strength" : strongest.strength.value,
241
+ "finding" : strongest.finding,
242
+ "confidence" : strongest.confidence,
243
+ },
244
+ }
245
+
246
+
247
+ def _extract_all_evidence(self, analysis_result: AnalysisResult) -> List[Dict]:
248
+ """
249
+ Extract detailed information for all evidence items
250
  """
251
+ if not analysis_result.evidence:
252
+ return []
 
 
 
 
253
 
254
+ evidence_detailed = list()
255
 
256
+ for evidence in analysis_result.evidence:
257
+ timestamp = getattr(evidence, "timestamp", None)
258
+
259
+ evidence_detail = {"source" : evidence.source.value,
260
+ "display_name" : self.evidence_display_names.get(evidence.source.value if hasattr(evidence.source, "value") else str(evidence.source), str(evidence.source)),
261
+ "finding" : evidence.finding,
262
+ "direction" : evidence.direction.value,
263
+ "strength" : evidence.strength.value,
264
+ "confidence" : evidence.confidence,
265
+ "analyzer" : evidence.analyzer,
266
+ "details" : evidence.details,
267
+ "status_label" : self._evidence_to_status_label(evidence),
268
+ "timestamp" : timestamp.isoformat() if timestamp else None,
269
+ }
270
+
271
+ evidence_detailed.append(evidence_detail)
272
+
273
+ return evidence_detailed
274
+
275
+
276
+ def _evidence_to_status_label(self, evidence: EvidenceResult) -> str:
277
+ """
278
+ Convert evidence to human-readable status label
279
+ """
280
+ if (evidence.direction == EvidenceDirection.AI_GENERATED):
281
+ if (evidence.strength == EvidenceStrength.CONCLUSIVE):
282
+ return "🔴 CONCLUSIVE AI"
283
+
284
+ elif (evidence.strength == EvidenceStrength.STRONG):
285
+ return "🔴 STRONG AI"
286
+
287
+ elif (evidence.strength == EvidenceStrength.MODERATE):
288
+ return "🟠 MODERATE AI"
289
+
290
+ else:
291
+ return "🟡 WEAK AI"
292
+
293
+ elif (evidence.direction == EvidenceDirection.AUTHENTIC):
294
+ if (evidence.strength in (EvidenceStrength.STRONG, EvidenceStrength.CONCLUSIVE)):
295
+ return "🟢 STRONG AUTHENTIC"
296
+
297
+ elif (evidence.strength == EvidenceStrength.MODERATE):
298
+ return "🟢 MODERATE AUTHENTIC"
299
+
300
+ else:
301
+ return "🟡 WEAK AUTHENTIC"
302
+
303
+ else:
304
+ # INDETERMINATE
305
+ return "⚪ INDETERMINATE"
306
+
307
+
308
def _strength_to_rank(self, strength: EvidenceStrength) -> int:
    """
    Translate an evidence strength into an integer rank usable as a sort key

    Arguments:
    ----------
        strength { EvidenceStrength } : Strength value to rank

    Returns:
    --------
        { int } : 1 (WEAK) up to 4 (CONCLUSIVE); 0 for any value not in the table
    """
    # Listed weakest-first so the 1-based position is the rank
    ascending_levels = (EvidenceStrength.WEAK,
                        EvidenceStrength.MODERATE,
                        EvidenceStrength.STRONG,
                        EvidenceStrength.CONCLUSIVE,
                        )

    rank_by_level = {level: rank for rank, level in enumerate(ascending_levels, start = 1)}

    return rank_by_level.get(strength, 0)
317
+
318
 
319
  def _extract_overall_summary(self, analysis_result: AnalysisResult) -> Dict:
320
  """
 
343
  "display_name" : self.metric_display_names.get(metric_type, metric_type.value),
344
  "score" : round(metric_result.score, 3),
345
  "confidence" : round(metric_result.confidence, 3) if metric_result.confidence is not None else None,
346
+ "status" : self._score_to_status(metric_result.score),
347
  "details" : metric_result.details or {},
348
+ "interpretation" : self._interpret_metric(metric_type, metric_result),
349
+ "key_findings" : self.extract_key_findings(metric_type, metric_result),
 
350
  }
351
+
352
  metrics_detailed.append(metric_detail)
353
 
354
  # Sort by score (highest first)
355
+ metrics_detailed.sort(key = lambda x: x['score'],
356
+ reverse = True,
357
+ )
358
 
359
  return metrics_detailed
360
 
361
 
362
  def _extract_metadata(self, analysis_result: AnalysisResult) -> Dict:
363
  """
364
+ Extract processing metadata and statistics
365
  """
366
+ metadata = {"total_metrics" : len(analysis_result.metric_results),
367
+ "flagged_metrics" : sum(1 for s in analysis_result.signals if s.status == SignalStatus.FLAGGED),
368
+ "warning_metrics" : sum(1 for s in analysis_result.signals if s.status == SignalStatus.WARNING),
369
+ "passed_metrics" : sum(1 for s in analysis_result.signals if s.status == SignalStatus.PASSED),
370
+ "avg_confidence" : self._calculate_avg_confidence(analysis_result),
371
+ }
372
+
373
+ # Evidence stats (if available)
374
+ if analysis_result.evidence:
375
+ metadata["total_evidence"] = len(analysis_result.evidence)
376
+ metadata["ai_evidence"] = sum(1 for e in analysis_result.evidence if e.direction == EvidenceDirection.AI_GENERATED)
377
+ metadata["auth_evidence"] = sum(1 for e in analysis_result.evidence if e.direction == EvidenceDirection.AUTHENTIC)
378
+ metadata["strong_evidence"] = sum(1 for e in analysis_result.evidence if e.strength in (EvidenceStrength.STRONG, EvidenceStrength.CONCLUSIVE))
379
+
380
+ else:
381
+ metadata["total_evidence"] = 0
382
+ metadata["ai_evidence"] = 0
383
+ metadata["auth_evidence"] = 0
384
+ metadata["strong_evidence"] = 0
385
+
386
+ # Decision info
387
+ metadata["has_final_decision"] = analysis_result.final_decision is not None
388
+ metadata["decision_value"] = analysis_result.final_decision.value if analysis_result.final_decision else None
389
+
390
+ return metadata
391
+
392
 
393
+ def _extract_key_details(self, metric_type: MetricType, metric_result: MetricResult) -> Dict:
394
  """
395
  Extract key details specific to each metric type
396
  """
 
402
  }
403
 
404
  elif (metric_type == MetricType.FREQUENCY):
405
+ return {"HF_Ratio" : details.get('hf_ratio', 'N/A'),
406
+ "HF_Anomaly" : details.get('hf_anomaly', 'N/A'),
407
+ "Spectrum_Bins" : details.get('spectrum_bins', 'N/A'),
408
  }
409
 
410
  elif (metric_type == MetricType.NOISE):
411
+ return {"Mean_Noise" : details.get('mean_noise', 'N/A'),
412
+ "CV" : details.get('cv', 'N/A'),
413
+ "Patches_Valid" : details.get('patches_valid', 'N/A'),
414
  }
415
 
416
  elif (metric_type == MetricType.TEXTURE):
417
+ return {"Smooth_Ratio" : details.get('smooth_ratio', 'N/A'),
418
+ "Contrast_Mean" : details.get('contrast_mean', 'N/A'),
419
+ "Patches_Used" : details.get('patches_used', 'N/A'),
420
  }
421
 
422
  elif (metric_type == MetricType.COLOR):
423
  sat_stats = details.get('saturation_stats', {})
424
+
425
  return {"Mean_Saturation" : sat_stats.get('mean_saturation', 'N/A'),
426
  "High_Sat_Ratio" : sat_stats.get('high_sat_ratio', 'N/A'),
427
  }
 
429
  return {}
430
 
431
 
432
+ def _interpret_metric(self, metric_type: MetricType, metric_result: MetricResult) -> str:
433
  """
434
  Provide human-readable interpretation of metric result
435
  """
 
436
  details = metric_result.details or {}
437
 
438
  if (metric_type == MetricType.GRADIENT):
439
  eig_ratio = details.get('eigenvalue_ratio')
440
 
441
  if eig_ratio:
442
+ return f"Eigenvalue ratio of {eig_ratio:.3f} ({'high' if (eig_ratio > 0.85) else 'low'} alignment)"
443
 
444
  return "Gradient structure analysis"
445
 
446
+ elif( metric_type == MetricType.FREQUENCY):
447
  hf_ratio = details.get('hf_ratio')
448
+
449
  if hf_ratio:
450
+ return f"High-freq ratio: {hf_ratio:.3f} ({'elevated' if (hf_ratio > 0.35) else 'low' if (hf_ratio < 0.08) else 'normal'})"
451
 
452
  return "Frequency spectrum analysis"
453
 
454
  elif (metric_type == MetricType.NOISE):
455
  mean_noise = details.get('mean_noise')
456
+
457
  if mean_noise:
458
+ return f"Mean noise: {mean_noise:.2f} ({'low' if (mean_noise < 1.5) else 'normal'})"
459
 
460
  return "Noise pattern analysis"
461
 
462
  elif (metric_type == MetricType.TEXTURE):
463
  smooth_ratio = details.get('smooth_ratio')
464
+
465
  if smooth_ratio is not None:
466
+ return f"Smooth regions: {smooth_ratio:.1%} ({'excessive' if (smooth_ratio > 0.4) else 'normal'})"
467
 
468
  return "Texture variation analysis"
469
 
 
472
  mean_sat = sat_stats.get('mean_saturation')
473
 
474
  if mean_sat:
475
+ return f"Mean saturation: {mean_sat:.2f} ({'high' if (mean_sat > 0.65) else 'normal'})"
476
 
477
  return "Color distribution analysis"
478
 
479
  return "Analysis complete"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
 
482
+ def extract_key_findings(self, metric_type: MetricType, metric_result: MetricResult) -> List[str]:
483
  """
484
+ Extract human-readable key forensic findings for reporters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
  """
486
  findings = list()
487
  details = metric_result.details or {}
488
 
489
  if (metric_type == MetricType.GRADIENT):
490
  eig_ratio = details.get('eigenvalue_ratio')
491
+
492
  if eig_ratio:
493
  findings.append(f"Eigenvalue ratio: {eig_ratio:.3f}")
494
 
 
499
 
500
  elif (metric_type == MetricType.FREQUENCY):
501
  hf_ratio = details.get('hf_ratio')
502
+
503
  if hf_ratio:
504
  findings.append(f"High-frequency ratio: {hf_ratio:.3f}")
505
 
506
  roughness = details.get('roughness')
507
+
508
  if roughness:
509
  findings.append(f"Spectral roughness: {roughness:.3f}")
510
 
511
  elif (metric_type == MetricType.NOISE):
512
  mean_noise = details.get('mean_noise')
513
+
514
  if mean_noise:
515
  findings.append(f"Mean noise level: {mean_noise:.2f}")
516
 
 
521
 
522
  elif (metric_type == MetricType.TEXTURE):
523
  smooth_ratio = details.get('smooth_ratio')
524
+
525
  if smooth_ratio:
526
  findings.append(f"Smooth patches: {smooth_ratio:.1%}")
527
 
 
542
  if high_sat:
543
  findings.append(f"High saturation pixels: {high_sat:.1%}")
544
 
545
+ return findings if findings else ["Analysis complete"]
546
+
547
+
548
def _score_to_status(self, score: float) -> str:
    """
    Classify a metric score against the configured signal thresholds

    Arguments:
    ----------
        score { float } : Metric score to classify

    Returns:
    --------
        { str } : "FLAGGED" when the score reaches the FLAGGED threshold, otherwise "WARNING"
                  when it reaches the WARNING threshold, otherwise "PASSED"
    """
    # Checked most-severe first; the first threshold the score reaches wins
    severity_order = ((SignalStatus.FLAGGED, "FLAGGED"),
                      (SignalStatus.WARNING, "WARNING"),
                      )

    for status, label in severity_order:
        if score >= self.signal_thresholds[status]:
            return label

    return "PASSED"
560
+
561
+
562
def _calculate_avg_confidence(self, analysis_result: AnalysisResult) -> float:
    """
    Average the confidence of every metric result that reports one

    Arguments:
    ----------
        analysis_result { AnalysisResult } : Result whose metric_results values are inspected

    Returns:
    --------
        { float } : Mean confidence rounded to 3 decimals; 0.0 when no metric has a confidence
    """
    running_total = 0
    counted = 0

    for metric_result in analysis_result.metric_results.values():
        # Metrics without a confidence value do not take part in the average
        if metric_result.confidence is None:
            continue

        running_total += metric_result.confidence
        counted += 1

    if not counted:
        return 0.0

    return round(running_total / counted, 3)
metrics/color_analyzer.py CHANGED
@@ -110,34 +110,38 @@ class ColorAnalyzer:
110
  --------
111
  { np.ndarray } : HSV image (H in [0, 360], S and V in [0, 1])
112
  """
113
- r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
114
 
115
- maxc = np.maximum(np.maximum(r, g), b)
116
- minc = np.minimum(np.minimum(r, g), b)
117
- delta = maxc - minc
118
 
119
  # Value
120
- v = maxc
121
 
122
  # Saturation
123
- s = np.where(maxc != 0, delta / maxc, 0)
 
 
 
124
 
125
  # Hue
126
- h = np.zeros_like(maxc)
127
 
128
  # Red is max
129
- mask = (maxc == r) & (delta != 0)
130
- h[mask] = 60 * (((g[mask] - b[mask]) / delta[mask]) % 6)
131
 
132
  # Green is max
133
- mask = (maxc == g) & (delta != 0)
134
- h[mask] = 60 * (((b[mask] - r[mask]) / delta[mask]) + 2)
135
 
136
  # Blue is max
137
- mask = (maxc == b) & (delta != 0)
138
- h[mask] = 60 * (((r[mask] - g[mask]) / delta[mask]) + 4)
139
 
140
- hsv = np.stack([h, s, v], axis = 2)
 
141
 
142
  return hsv
143
 
 
110
  --------
111
  { np.ndarray } : HSV image (H in [0, 360], S and V in [0, 1])
112
  """
113
+ r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
114
 
115
+ maxc = np.maximum(np.maximum(r, g), b)
116
+ minc = np.minimum(np.minimum(r, g), b)
117
+ delta = maxc - minc
118
 
119
  # Value
120
+ v = maxc
121
 
122
  # Saturation
123
+ s = np.zeros_like(maxc, dtype = np.float32)
124
+
125
+ nonzero_mask = maxc > 0
126
+ s[nonzero_mask] = delta[nonzero_mask] / maxc[nonzero_mask]
127
 
128
  # Hue
129
+ h = np.zeros_like(maxc)
130
 
131
  # Red is max
132
+ mask = (maxc == r) & (delta > 0)
133
+ h[mask] = 60.0 * (((g[mask] - b[mask]) / delta[mask]) % 6.0)
134
 
135
  # Green is max
136
+ mask = (maxc == g) & (delta > 0)
137
+ h[mask] = 60.0 * (((b[mask] - r[mask]) / delta[mask]) + 2)
138
 
139
  # Blue is max
140
+ mask = (maxc == b) & (delta > 0)
141
+ h[mask] = 60.0 * (((r[mask] - g[mask]) / delta[mask]) + 4)
142
 
143
+ hsv = np.stack([h, s, v], axis = 2)
144
+ hsv = np.nan_to_num(hsv, nan = 0.0, posinf = 0.0, neginf = 0.0)
145
 
146
  return hsv
147
 
metrics/{aggregator.py → signal_aggregator.py} RENAMED
@@ -1,4 +1,5 @@
1
  # Dependencies
 
2
  import time
3
  import numpy as np
4
  from typing import List
@@ -11,18 +12,22 @@ from config.constants import MetricType
11
  from config.constants import SignalStatus
12
  from config.schemas import AnalysisResult
13
  from config.schemas import DetectionSignal
 
14
  from config.constants import DetectionStatus
15
  from config.constants import SIGNAL_THRESHOLDS
16
  from utils.image_processor import ImageProcessor
17
  from config.constants import METRIC_EXPLANATIONS
18
  from metrics.noise_analyzer import NoiseAnalyzer
19
  from metrics.color_analyzer import ColorAnalyzer
 
20
  from metrics.texture_analyzer import TextureAnalyzer
 
21
  from features.threshold_manager import ThresholdManager
22
  from config.constants import IMAGE_RESIZE_MAX_DIMENSION
23
  from metrics.frequency_analyzer import FrequencyAnalyzer
24
  from metrics.gradient_field_pca import GradientFieldPCADetector
25
 
 
26
  # Suppress NumPy warning
27
  np.seterr(divide = 'ignore',
28
  invalid = 'ignore',
@@ -33,12 +38,12 @@ np.seterr(divide = 'ignore',
33
  logger = get_logger(__name__)
34
 
35
 
36
- class MetricsAggregator:
37
  """
38
- Main detector that orchestrates all detection methods
39
 
40
- Combines multiple unsupervised metrics:
41
- ----------------------------------------
42
  1. Gradient-Field PCA
43
  2. Frequency Domain Analysis (FFT)
44
  3. Noise Pattern Analysis
@@ -74,6 +79,11 @@ class MetricsAggregator:
74
  # Get metric weights either from runtime UI or default to settings
75
  self.weights = (self.threshold_manager.get_metric_weights() if self.threshold_manager else settings.get_metric_weights())
76
 
 
 
 
 
 
77
  logger.info(f"Metric weights: {self.weights}")
78
 
79
 
@@ -117,6 +127,11 @@ class MetricsAggregator:
117
 
118
  # Determine status
119
  status = self._determine_status(overall_score = overall_score)
 
 
 
 
 
120
 
121
  # Calculate processing time
122
  processing_time = time.time() - start_time
@@ -125,7 +140,7 @@ class MetricsAggregator:
125
  result = AnalysisResult(filename = filename,
126
  overall_score = overall_score,
127
  status = status,
128
- confidence = int(overall_score * 100),
129
  signals = signals,
130
  metric_results = metric_results,
131
  processing_time = processing_time,
@@ -154,18 +169,26 @@ class MetricsAggregator:
154
  { dict } : Dictionary mapping MetricType to MetricResult
155
  """
156
  metric_results = dict()
 
157
 
158
- # Run eaach detector one by one
159
  for metric_type, (detector_name, detector) in self.detector_registry.items():
 
 
 
 
 
 
 
160
  try:
161
- result = detector.detect(image = image)
162
  result.metric_type = metric_type
163
  metric_results[metric_type] = result
164
 
165
  logger.debug(f"{detector_name} | {metric_type.value} | score={result.score:.3f} | confidence={result.confidence:.3f}")
166
-
167
  except Exception as e:
168
- logger.error(f"{detector.__class__.__name__} failed: {e}")
169
 
170
  # Same Failure Score by all metrics with same confidence
171
  metric_results[metric_type] = MetricResult(metric_type = metric_type,
@@ -285,4 +308,39 @@ class MetricsAggregator:
285
  return DetectionStatus.REVIEW_REQUIRED
286
 
287
  else:
288
- return DetectionStatus.LIKELY_AUTHENTIC
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Dependencies
2
+ import os
3
  import time
4
  import numpy as np
5
  from typing import List
 
12
  from config.constants import SignalStatus
13
  from config.schemas import AnalysisResult
14
  from config.schemas import DetectionSignal
15
+ from concurrent.futures import as_completed
16
  from config.constants import DetectionStatus
17
  from config.constants import SIGNAL_THRESHOLDS
18
  from utils.image_processor import ImageProcessor
19
  from config.constants import METRIC_EXPLANATIONS
20
  from metrics.noise_analyzer import NoiseAnalyzer
21
  from metrics.color_analyzer import ColorAnalyzer
22
+ from concurrent.futures import ThreadPoolExecutor
23
  from metrics.texture_analyzer import TextureAnalyzer
24
+ from config.constants import SIGNAL_CONFIDENCE_PARAMS
25
  from features.threshold_manager import ThresholdManager
26
  from config.constants import IMAGE_RESIZE_MAX_DIMENSION
27
  from metrics.frequency_analyzer import FrequencyAnalyzer
28
  from metrics.gradient_field_pca import GradientFieldPCADetector
29
 
30
+
31
  # Suppress NumPy warning
32
  np.seterr(divide = 'ignore',
33
  invalid = 'ignore',
 
38
  logger = get_logger(__name__)
39
 
40
 
41
+ class SignalAggregator:
42
  """
43
+ Main detector that orchestrates all detection signals
44
 
45
+ Combines multiple unsupervised metric signals:
46
+ ----------------------------------------------
47
  1. Gradient-Field PCA
48
  2. Frequency Domain Analysis (FFT)
49
  3. Noise Pattern Analysis
 
79
  # Get metric weights either from runtime UI or default to settings
80
  self.weights = (self.threshold_manager.get_metric_weights() if self.threshold_manager else settings.get_metric_weights())
81
 
82
+ # Initialize shared ThreadPoolExecutor (CPU-safe)
83
+ max_workers = min(settings.METRIC_WORKERS or len(self.detector_registry), os.cpu_count() or 4)
84
+
85
+ self.executor = ThreadPoolExecutor(max_workers = max_workers)
86
+
87
  logger.info(f"Metric weights: {self.weights}")
88
 
89
 
 
127
 
128
  # Determine status
129
  status = self._determine_status(overall_score = overall_score)
130
+
131
+ # Calculate confidence
132
+ confidence = self._calculate_confidence(metric_results = metric_results,
133
+ overall_score = overall_score,
134
+ )
135
 
136
  # Calculate processing time
137
  processing_time = time.time() - start_time
 
140
  result = AnalysisResult(filename = filename,
141
  overall_score = overall_score,
142
  status = status,
143
+ confidence = confidence,
144
  signals = signals,
145
  metric_results = metric_results,
146
  processing_time = processing_time,
 
169
  { dict } : Dictionary mapping MetricType to MetricResult
170
  """
171
  metric_results = dict()
172
+ futures = dict()
173
 
174
+ # Submit all detectors
175
  for metric_type, (detector_name, detector) in self.detector_registry.items():
176
+
177
+ futures[self.executor.submit(detector.detect, image = image)] = (metric_type, detector_name)
178
+
179
+ # Collect results as they complete
180
+ for future in as_completed(futures):
181
+ metric_type, detector_name = futures[future]
182
+
183
  try:
184
+ result = future.result(timeout = settings.METRIC_TIMEOUT)
185
  result.metric_type = metric_type
186
  metric_results[metric_type] = result
187
 
188
  logger.debug(f"{detector_name} | {metric_type.value} | score={result.score:.3f} | confidence={result.confidence:.3f}")
189
+
190
  except Exception as e:
191
+ logger.error(f"{detector_name} failed: {e}")
192
 
193
  # Same Failure Score by all metrics with same confidence
194
  metric_results[metric_type] = MetricResult(metric_type = metric_type,
 
308
  return DetectionStatus.REVIEW_REQUIRED
309
 
310
  else:
311
+ return DetectionStatus.LIKELY_AUTHENTIC
312
+
313
+
314
def _calculate_confidence(self, metric_results: dict[MetricType, MetricResult], overall_score: float) -> int:
    """
    Tier-1 confidence calculator based on:
    - metric agreement          : 1 - min(variance of metric scores / VARIANCE_NORM, 1)
    - metric reliability        : mean of the metric confidences that are not None
                                  (DEFAULT_RELIABILITY_CONFIDENCE when none are available)
    - decision boundary distance: min(|overall_score - review threshold| / DISTANCE_NORM, 1)

    The three components are combined with the SIGNAL_CONFIDENCE_PARAMS weights, clipped to
    [0, 1] and converted to an integer percentage (truncated by int()).

    Arguments:
    ----------
        metric_results { dict }  : Mapping of MetricType to MetricResult

        overall_score  { float } : Aggregated score the decision is based on

    Returns:
    --------
        { int } : Confidence percentage; MIN_CONFIDENCE * 100 when every metric result carries
                  an "error" entry in its details (which is also the case for an empty mapping)
    """
    results = list(metric_results.values())

    # If all metrics failed, confidence must be low. This guard runs before any statistics so
    # that an empty result set never reaches np.var (NaN plus a RuntimeWarning on empty input)
    if all(isinstance(result.details, dict) and "error" in result.details for result in results):
        return int(SIGNAL_CONFIDENCE_PARAMS.MIN_CONFIDENCE * 100)

    # Agreement confidence: low variance across metric scores means the metrics agree
    score_variance = np.var([result.score for result in results])
    agreement_confidence = 1.0 - min(score_variance / SIGNAL_CONFIDENCE_PARAMS.VARIANCE_NORM, 1.0)

    # Reliability confidence: average of the confidences the metrics reported themselves
    confidences = [result.confidence for result in results if result.confidence is not None]
    reliability_confidence = float(np.mean(confidences)) if confidences else SIGNAL_CONFIDENCE_PARAMS.DEFAULT_RELIABILITY_CONFIDENCE

    # Distance confidence: runtime threshold when a manager is attached, settings otherwise
    review_threshold = (self.threshold_manager.get_review_threshold() if self.threshold_manager else settings.REVIEW_THRESHOLD)
    distance_confidence = min(abs(overall_score - review_threshold) / SIGNAL_CONFIDENCE_PARAMS.DISTANCE_NORM, 1.0)

    logger.debug(f"Confidence breakdown | agreement={agreement_confidence:.2f}, reliability={reliability_confidence:.2f}, distance={distance_confidence:.2f}")

    confidence = (SIGNAL_CONFIDENCE_PARAMS.AGREEMENT_WEIGHT * agreement_confidence +
                  SIGNAL_CONFIDENCE_PARAMS.RELIABILITY_WEIGHT * reliability_confidence +
                  SIGNAL_CONFIDENCE_PARAMS.DISTANCE_WEIGHT * distance_confidence
                  )

    return int(np.clip(confidence, 0.0, 1.0) * 100)
notebooks/.ipynb_checkpoints/Unified_Dataset_Builder-checkpoint.ipynb CHANGED
@@ -84,7 +84,7 @@
84
  "# ===============================\n",
85
  "# Directory Configuration\n",
86
  "# ===============================\n",
87
- "BASE_DIR = Path(\"tests/dataset\")\n",
88
  "AI_DIR = BASE_DIR / \"ai\"\n",
89
  "REAL_DIR = BASE_DIR / \"real\"\n",
90
  "RAW_DIR = BASE_DIR / \"raw_downloads\"\n",
@@ -166,16 +166,13 @@
166
  " }]\n",
167
  " \n",
168
  "\n",
169
- "REAL_DATASETS = [{\"name\" : \"mscoco_2017\",\n",
170
- " \"hf_id\" : \"shunk031/MSCOCO\",\n",
171
- " \"hf_kwargs\" : {\"year\": 2017,\n",
172
- " \"coco_task\": \"instances\"\n",
173
- " },\n",
174
  " \"split\" : \"train\",\n",
175
  " \"image_key\" : \"image\",\n",
176
  " \"label\" : \"real\",\n",
177
  " \"family\" : \"photographic\",\n",
178
- " \"streaming\" : False\n",
179
  " }]\n",
180
  "\n",
181
  "# Kaggle datasets (public, non-scraped)\n",
@@ -355,7 +352,7 @@
355
  },
356
  {
357
  "cell_type": "code",
358
- "execution_count": null,
359
  "id": "dd8ef771-f39f-4d9d-8eaf-626ecc211141",
360
  "metadata": {},
361
  "outputs": [
@@ -371,7 +368,74 @@
371
  "name": "stderr",
372
  "output_type": "stream",
373
  "text": [
374
- " 46%|████████████████████████████████████████████████████████████████| 463/1000 [02:43<04:08, 2.16it/s]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  ]
376
  }
377
  ],
 
84
  "# ===============================\n",
85
  "# Directory Configuration\n",
86
  "# ===============================\n",
87
+ "BASE_DIR = Path(\"../tests/dataset\")\n",
88
  "AI_DIR = BASE_DIR / \"ai\"\n",
89
  "REAL_DIR = BASE_DIR / \"real\"\n",
90
  "RAW_DIR = BASE_DIR / \"raw_downloads\"\n",
 
166
  " }]\n",
167
  " \n",
168
  "\n",
169
+ "REAL_DATASETS = [{\"name\" : \"imagenette\",\n",
170
+ " \"hf_id\" : \"frgfm/imagenette\",\n",
171
+ " \"config\" : \"320px\",\n",
 
 
172
  " \"split\" : \"train\",\n",
173
  " \"image_key\" : \"image\",\n",
174
  " \"label\" : \"real\",\n",
175
  " \"family\" : \"photographic\",\n",
 
176
  " }]\n",
177
  "\n",
178
  "# Kaggle datasets (public, non-scraped)\n",
 
352
  },
353
  {
354
  "cell_type": "code",
355
+ "execution_count": 7,
356
  "id": "dd8ef771-f39f-4d9d-8eaf-626ecc211141",
357
  "metadata": {},
358
  "outputs": [
 
368
  "name": "stderr",
369
  "output_type": "stream",
370
  "text": [
371
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:31<00:00, 3.01it/s]\n"
372
+ ]
373
+ },
374
+ {
375
+ "name": "stdout",
376
+ "output_type": "stream",
377
+ "text": [
378
+ "\n",
379
+ "▶ Loading HF dataset: imagenette\n"
380
+ ]
381
+ },
382
+ {
383
+ "data": {
384
+ "application/vnd.jupyter.widget-view+json": {
385
+ "model_id": "6e436d2fc4374bff9d76dc2534b752b8",
386
+ "version_major": 2,
387
+ "version_minor": 0
388
+ },
389
+ "text/plain": [
390
+ "Downloading builder script: 0.00B [00:00, ?B/s]"
391
+ ]
392
+ },
393
+ "metadata": {},
394
+ "output_type": "display_data"
395
+ },
396
+ {
397
+ "data": {
398
+ "application/vnd.jupyter.widget-view+json": {
399
+ "model_id": "791d872b01a44cbb908ddbad43f20a42",
400
+ "version_major": 2,
401
+ "version_minor": 0
402
+ },
403
+ "text/plain": [
404
+ "Downloading metadata: 0.00B [00:00, ?B/s]"
405
+ ]
406
+ },
407
+ "metadata": {},
408
+ "output_type": "display_data"
409
+ },
410
+ {
411
+ "data": {
412
+ "application/vnd.jupyter.widget-view+json": {
413
+ "model_id": "36b6ebafe8e4436e9cc3a4bf38a36bda",
414
+ "version_major": 2,
415
+ "version_minor": 0
416
+ },
417
+ "text/plain": [
418
+ "Downloading readme: 0.00B [00:00, ?B/s]"
419
+ ]
420
+ },
421
+ "metadata": {},
422
+ "output_type": "display_data"
423
+ },
424
+ {
425
+ "ename": "ValueError",
426
+ "evalue": "Config name is missing.\nPlease pick one among the available configs: ['full_size', '320px', '160px']\nExample of usage:\n\t`load_dataset('imagenette', 'full_size')`",
427
+ "output_type": "error",
428
+ "traceback": [
429
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
430
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
431
+ "Cell \u001b[0;32mIn[7], line 46\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m# Entry Point\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m---> 46\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
432
+ "Cell \u001b[0;32mIn[7], line 13\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m process_hf_dataset(ds, AI_DIR, writer)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m REAL_DATASETS:\n\u001b[0;32m---> 13\u001b[0m \u001b[43mprocess_hf_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mREAL_DIR\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# Kaggle datasets\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m KAGGLE_DATASETS:\n",
433
+ "Cell \u001b[0;32mIn[4], line 4\u001b[0m, in \u001b[0;36mprocess_hf_dataset\u001b[0;34m(ds_cfg, root_dir, writer)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mprocess_hf_dataset\u001b[39m(ds_cfg, root_dir, writer):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m▶ Loading HF dataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mds_cfg[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhf_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhf_kwargs\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mconfig\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m 
\u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msplit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mstreaming\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstreaming\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m out_dir \u001b[38;5;241m=\u001b[39m root_dir \u001b[38;5;241m/\u001b[39m ds_cfg[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 12\u001b[0m out_dir\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
434
+ "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/load.py:2129\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)\u001b[0m\n\u001b[1;32m 2124\u001b[0m verification_mode \u001b[38;5;241m=\u001b[39m VerificationMode(\n\u001b[1;32m 2125\u001b[0m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mBASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS\n\u001b[1;32m 2126\u001b[0m )\n\u001b[1;32m 2128\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 2129\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2130\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2131\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2132\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2133\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2134\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2135\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2136\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2137\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2138\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2139\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2140\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2141\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2142\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2144\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m 2145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
435
+ "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/load.py:1852\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, **config_kwargs)\u001b[0m\n\u001b[1;32m 1850\u001b[0m builder_cls \u001b[38;5;241m=\u001b[39m get_dataset_builder_class(dataset_module, dataset_name\u001b[38;5;241m=\u001b[39mdataset_name)\n\u001b[1;32m 1851\u001b[0m \u001b[38;5;66;03m# Instantiate the dataset builder\u001b[39;00m\n\u001b[0;32m-> 1852\u001b[0m builder_instance: DatasetBuilder \u001b[38;5;241m=\u001b[39m \u001b[43mbuilder_cls\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1853\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1854\u001b[0m \u001b[43m \u001b[49m\u001b[43mdataset_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdataset_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1855\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1856\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1857\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1858\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mhash\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mhash\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1859\u001b[0m \u001b[43m \u001b[49m\u001b[43minfo\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1860\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1861\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1862\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1863\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mbuilder_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1864\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1865\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1867\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m builder_instance\n",
436
+ "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/builder.py:373\u001b[0m, in \u001b[0;36mDatasetBuilder.__init__\u001b[0;34m(self, cache_dir, dataset_name, config_name, hash, base_path, info, features, token, use_auth_token, repo_id, data_files, data_dir, storage_options, writer_batch_size, name, **config_kwargs)\u001b[0m\n\u001b[1;32m 371\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data_dir \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 372\u001b[0m config_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata_dir\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m data_dir\n\u001b[0;32m--> 373\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig_id \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_create_builder_config\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 374\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[43m \u001b[49m\u001b[43mcustom_features\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 376\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 377\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;66;03m# prepare info: DatasetInfo are a standardized dataclass across all datasets\u001b[39;00m\n\u001b[1;32m 380\u001b[0m \u001b[38;5;66;03m# Prefill datasetinfo\u001b[39;00m\n\u001b[1;32m 381\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m info \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 382\u001b[0m \u001b[38;5;66;03m# TODO 
FOR PACKAGED MODULES IT IMPORTS DATA FROM src/packaged_modules which doesn't make sense\u001b[39;00m\n",
437
+ "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/builder.py:525\u001b[0m, in \u001b[0;36mDatasetBuilder._create_builder_config\u001b[0;34m(self, config_name, custom_features, **config_kwargs)\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mBUILDER_CONFIGS) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 524\u001b[0m example_of_usage \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mload_dataset(\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mBUILDER_CONFIGS[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 525\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 526\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mConfig name is missing.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 527\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mPlease pick one among the available configs: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuilder_configs\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 528\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mExample of 
usage:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m`\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mexample_of_usage\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 529\u001b[0m )\n\u001b[1;32m 530\u001b[0m builder_config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mBUILDER_CONFIGS[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 531\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 532\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo config specified, defaulting to the single config: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mbuilder_config\u001b[38;5;241m.\u001b[39mname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 533\u001b[0m )\n",
438
+ "\u001b[0;31mValueError\u001b[0m: Config name is missing.\nPlease pick one among the available configs: ['full_size', '320px', '160px']\nExample of usage:\n\t`load_dataset('imagenette', 'full_size')`"
439
  ]
440
  }
441
  ],
notebooks/Unified_Dataset_Builder.ipynb CHANGED
@@ -84,7 +84,7 @@
84
  "# ===============================\n",
85
  "# Directory Configuration\n",
86
  "# ===============================\n",
87
- "BASE_DIR = Path(\"tests/dataset\")\n",
88
  "AI_DIR = BASE_DIR / \"ai\"\n",
89
  "REAL_DIR = BASE_DIR / \"real\"\n",
90
  "RAW_DIR = BASE_DIR / \"raw_downloads\"\n",
@@ -150,7 +150,7 @@
150
  },
151
  {
152
  "cell_type": "code",
153
- "execution_count": 3,
154
  "id": "74106705-e2d6-411c-8193-8e02f5ee0fdc",
155
  "metadata": {},
156
  "outputs": [],
@@ -166,16 +166,13 @@
166
  " }]\n",
167
  " \n",
168
  "\n",
169
- "REAL_DATASETS = [{\"name\" : \"mscoco_2017\",\n",
170
- " \"hf_id\" : \"shunk031/MSCOCO\",\n",
171
- " \"hf_kwargs\" : {\"year\": 2017,\n",
172
- " \"coco_task\": \"instances\"\n",
173
- " },\n",
174
  " \"split\" : \"train\",\n",
175
  " \"image_key\" : \"image\",\n",
176
  " \"label\" : \"real\",\n",
177
  " \"family\" : \"photographic\",\n",
178
- " \"streaming\" : False\n",
179
  " }]\n",
180
  "\n",
181
  "# Kaggle datasets (public, non-scraped)\n",
@@ -204,7 +201,7 @@
204
  },
205
  {
206
  "cell_type": "code",
207
- "execution_count": 4,
208
  "id": "a9ea5276-65bb-49f5-a656-c00ceeb1f4d3",
209
  "metadata": {},
210
  "outputs": [],
@@ -269,7 +266,7 @@
269
  },
270
  {
271
  "cell_type": "code",
272
- "execution_count": 5,
273
  "id": "c6eca5e6-0469-4af6-8af8-afe3036cb0a8",
274
  "metadata": {},
275
  "outputs": [],
@@ -305,7 +302,7 @@
305
  },
306
  {
307
  "cell_type": "code",
308
- "execution_count": 6,
309
  "id": "b648832e-5025-4851-af21-382051167a04",
310
  "metadata": {},
311
  "outputs": [],
@@ -355,7 +352,7 @@
355
  },
356
  {
357
  "cell_type": "code",
358
- "execution_count": 7,
359
  "id": "dd8ef771-f39f-4d9d-8eaf-626ecc211141",
360
  "metadata": {},
361
  "outputs": [
@@ -371,7 +368,7 @@
371
  "name": "stderr",
372
  "output_type": "stream",
373
  "text": [
374
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:24<00:00, 3.08it/s]\n"
375
  ]
376
  },
377
  {
@@ -379,13 +376,34 @@
379
  "output_type": "stream",
380
  "text": [
381
  "\n",
382
- "▶ Loading HF dataset: mscoco_2017\n"
 
 
 
 
 
 
 
383
  ]
384
  },
385
  {
386
  "data": {
387
  "application/vnd.jupyter.widget-view+json": {
388
- "model_id": "15b93e24384a49da9e46dceda9bc3f6b",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  "version_major": 2,
390
  "version_minor": 0
391
  },
@@ -399,51 +417,101 @@
399
  {
400
  "data": {
401
  "application/vnd.jupyter.widget-view+json": {
402
- "model_id": "e34eabfceb61496ebbd9336c9ed060f3",
403
  "version_major": 2,
404
  "version_minor": 0
405
  },
406
  "text/plain": [
407
- "Downloading data: 0%| | 0.00/19.3G [00:00<?, ?B/s]"
408
  ]
409
  },
410
  "metadata": {},
411
  "output_type": "display_data"
412
  },
413
  {
414
- "ename": "KeyboardInterrupt",
415
- "evalue": "",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  "output_type": "error",
417
  "traceback": [
418
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
419
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
420
- "Cell \u001b[0;32mIn[7], line 46\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m# Entry Point\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m---> 46\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
421
- "Cell \u001b[0;32mIn[7], line 13\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 10\u001b[0m process_hf_dataset(ds, AI_DIR, writer)\n\u001b[1;32m 12\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m REAL_DATASETS:\n\u001b[0;32m---> 13\u001b[0m \u001b[43mprocess_hf_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mREAL_DIR\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# Kaggle datasets\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m KAGGLE_DATASETS:\n",
422
- "Cell \u001b[0;32mIn[4], line 4\u001b[0m, in \u001b[0;36mprocess_hf_dataset\u001b[0;34m(ds_cfg, root_dir, writer)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mprocess_hf_dataset\u001b[39m(ds_cfg, root_dir, writer):\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m Loading HF dataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mds_cfg[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhf_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhf_kwargs\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mconfig\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43msplit\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m 
\u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msplit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mstreaming\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mds_cfg\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstreaming\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m out_dir \u001b[38;5;241m=\u001b[39m root_dir \u001b[38;5;241m/\u001b[39m ds_cfg[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 12\u001b[0m out_dir\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
423
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/load.py:2153\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)\u001b[0m\n\u001b[1;32m 2150\u001b[0m try_from_hf_gcs \u001b[38;5;241m=\u001b[39m path \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m _PACKAGED_DATASETS_MODULES\n\u001b[1;32m 2152\u001b[0m \u001b[38;5;66;03m# Download and prepare data\u001b[39;00m\n\u001b[0;32m-> 2153\u001b[0m \u001b[43mbuilder_instance\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2154\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2156\u001b[0m \u001b[43m \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2157\u001b[0m \u001b[43m \u001b[49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtry_from_hf_gcs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2158\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2159\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2160\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2162\u001b[0m \u001b[38;5;66;03m# Build dataset for 
splits\u001b[39;00m\n\u001b[1;32m 2163\u001b[0m keep_in_memory \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2164\u001b[0m keep_in_memory \u001b[38;5;28;01mif\u001b[39;00m keep_in_memory \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m is_small_dataset(builder_instance\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size)\n\u001b[1;32m 2165\u001b[0m )\n",
424
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/builder.py:954\u001b[0m, in \u001b[0;36mDatasetBuilder.download_and_prepare\u001b[0;34m(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 952\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 953\u001b[0m prepare_split_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_proc\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m num_proc\n\u001b[0;32m--> 954\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 955\u001b[0m \u001b[43m \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 956\u001b[0m \u001b[43m \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 957\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_split_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 958\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdownload_and_prepare_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 959\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 960\u001b[0m \u001b[38;5;66;03m# Sync info\u001b[39;00m\n\u001b[1;32m 961\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39mdataset_size \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msum\u001b[39m(split\u001b[38;5;241m.\u001b[39mnum_bytes 
\u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfo\u001b[38;5;241m.\u001b[39msplits\u001b[38;5;241m.\u001b[39mvalues())\n",
425
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/builder.py:1717\u001b[0m, in \u001b[0;36mGeneratorBasedBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_splits_kwargs)\u001b[0m\n\u001b[1;32m 1716\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_download_and_prepare\u001b[39m(\u001b[38;5;28mself\u001b[39m, dl_manager, verification_mode, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mprepare_splits_kwargs):\n\u001b[0;32m-> 1717\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_download_and_prepare\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1718\u001b[0m \u001b[43m \u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1719\u001b[0m \u001b[43m \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1720\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_duplicate_keys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverification_mode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mVerificationMode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mBASIC_CHECKS\u001b[49m\n\u001b[1;32m 1721\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mverification_mode\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mVerificationMode\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mALL_CHECKS\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1722\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mprepare_splits_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1723\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
426
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/builder.py:1027\u001b[0m, in \u001b[0;36mDatasetBuilder._download_and_prepare\u001b[0;34m(self, dl_manager, verification_mode, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 1025\u001b[0m split_dict \u001b[38;5;241m=\u001b[39m SplitDict(dataset_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdataset_name)\n\u001b[1;32m 1026\u001b[0m split_generators_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_split_generators_kwargs(prepare_split_kwargs)\n\u001b[0;32m-> 1027\u001b[0m split_generators \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_split_generators\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdl_manager\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msplit_generators_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1029\u001b[0m \u001b[38;5;66;03m# Checksums verification\u001b[39;00m\n\u001b[1;32m 1030\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m verification_mode \u001b[38;5;241m==\u001b[39m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS \u001b[38;5;129;01mand\u001b[39;00m dl_manager\u001b[38;5;241m.\u001b[39mrecord_checksums:\n",
427
- "File \u001b[0;32m~/.cache/huggingface/modules/datasets_modules/datasets/shunk031--MSCOCO/9a9d3cb1e5e1927e03f5448bc4e3dd95d17101d142ba4b94d6973770757f535f/MSCOCO.py:977\u001b[0m, in \u001b[0;36mMsCocoDataset._split_generators\u001b[0;34m(self, dl_manager)\u001b[0m\n\u001b[1;32m 976\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_split_generators\u001b[39m(\u001b[38;5;28mself\u001b[39m, dl_manager: ds\u001b[38;5;241m.\u001b[39mDownloadManager):\n\u001b[0;32m--> 977\u001b[0m file_paths \u001b[38;5;241m=\u001b[39m \u001b[43mdl_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_and_extract\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_URLS\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43myear\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 979\u001b[0m imgs \u001b[38;5;241m=\u001b[39m file_paths[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimages\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 980\u001b[0m anns \u001b[38;5;241m=\u001b[39m file_paths[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mannotations\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n",
428
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/download/download_manager.py:565\u001b[0m, in \u001b[0;36mDownloadManager.download_and_extract\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdownload_and_extract\u001b[39m(\u001b[38;5;28mself\u001b[39m, url_or_urls):\n\u001b[1;32m 550\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Download and extract given `url_or_urls`.\u001b[39;00m\n\u001b[1;32m 551\u001b[0m \n\u001b[1;32m 552\u001b[0m \u001b[38;5;124;03m Is roughly equivalent to:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 563\u001b[0m \u001b[38;5;124;03m extracted_path(s): `str`, extracted paths of given URL(s).\u001b[39;00m\n\u001b[1;32m 564\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 565\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mextract(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_or_urls\u001b[49m\u001b[43m)\u001b[49m)\n",
429
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/download/download_manager.py:428\u001b[0m, in \u001b[0;36mDownloadManager.download\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m 425\u001b[0m download_func \u001b[38;5;241m=\u001b[39m partial(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_download, download_config\u001b[38;5;241m=\u001b[39mdownload_config)\n\u001b[1;32m 427\u001b[0m start_time \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mnow()\n\u001b[0;32m--> 428\u001b[0m downloaded_path_or_paths \u001b[38;5;241m=\u001b[39m \u001b[43mmap_nested\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 429\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_func\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 430\u001b[0m \u001b[43m \u001b[49m\u001b[43murl_or_urls\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[43mmap_tuple\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mnum_proc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnum_proc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mdisable_tqdm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mis_progress_bar_enabled\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mDownloading data files\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 436\u001b[0m duration \u001b[38;5;241m=\u001b[39m datetime\u001b[38;5;241m.\u001b[39mnow() \u001b[38;5;241m-\u001b[39m 
start_time\n\u001b[1;32m 437\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDownloading took \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mduration\u001b[38;5;241m.\u001b[39mtotal_seconds()\u001b[38;5;250m \u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;241m60\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m min\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
430
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:464\u001b[0m, in \u001b[0;36mmap_nested\u001b[0;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)\u001b[0m\n\u001b[1;32m 462\u001b[0m num_proc \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m num_proc \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m<\u001b[39m parallel_min_length:\n\u001b[0;32m--> 464\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 465\u001b[0m _single_map_nested((function, obj, types, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 466\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mtqdm(iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, desc\u001b[38;5;241m=\u001b[39mdesc)\n\u001b[1;32m 467\u001b[0m ]\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n",
431
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:465\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 462\u001b[0m num_proc \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m num_proc \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m num_proc \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(iterable) \u001b[38;5;241m<\u001b[39m parallel_min_length:\n\u001b[1;32m 464\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 465\u001b[0m \u001b[43m_single_map_nested\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 466\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m obj \u001b[38;5;129;01min\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mtqdm(iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, desc\u001b[38;5;241m=\u001b[39mdesc)\n\u001b[1;32m 467\u001b[0m ]\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 469\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n",
432
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:382\u001b[0m, in \u001b[0;36m_single_map_nested\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mtqdm(pbar_iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, position\u001b[38;5;241m=\u001b[39mrank, unit\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobj\u001b[39m\u001b[38;5;124m\"\u001b[39m, desc\u001b[38;5;241m=\u001b[39mpbar_desc) \u001b[38;5;28;01mas\u001b[39;00m pbar:\n\u001b[1;32m 381\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 382\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {k: _single_map_nested((function, v, types, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m)) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m pbar}\n\u001b[1;32m 383\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [_single_map_nested((function, v, types, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m pbar]\n",
433
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:382\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mtqdm(pbar_iterable, disable\u001b[38;5;241m=\u001b[39mdisable_tqdm, position\u001b[38;5;241m=\u001b[39mrank, unit\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mobj\u001b[39m\u001b[38;5;124m\"\u001b[39m, desc\u001b[38;5;241m=\u001b[39mpbar_desc) \u001b[38;5;28;01mas\u001b[39;00m pbar:\n\u001b[1;32m 381\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, \u001b[38;5;28mdict\u001b[39m):\n\u001b[0;32m--> 382\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {k: \u001b[43m_single_map_nested\u001b[49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m pbar}\n\u001b[1;32m 383\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 384\u001b[0m mapped \u001b[38;5;241m=\u001b[39m [_single_map_nested((function, v, types, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;28;01mTrue\u001b[39;00m, \u001b[38;5;28;01mNone\u001b[39;00m)) \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m pbar]\n",
434
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/py_utils.py:367\u001b[0m, in \u001b[0;36m_single_map_nested\u001b[0;34m(args)\u001b[0m\n\u001b[1;32m 365\u001b[0m \u001b[38;5;66;03m# Singleton first to spare some computation\u001b[39;00m\n\u001b[1;32m 366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, \u001b[38;5;28mdict\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data_struct, types):\n\u001b[0;32m--> 367\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_struct\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;66;03m# Reduce logging to keep things readable in multiprocessing with tqdm\u001b[39;00m\n\u001b[1;32m 370\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m rank \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mget_verbosity() \u001b[38;5;241m<\u001b[39m logging\u001b[38;5;241m.\u001b[39mWARNING:\n",
435
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/download/download_manager.py:454\u001b[0m, in \u001b[0;36mDownloadManager._download\u001b[0;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_relative_path(url_or_filename):\n\u001b[1;32m 452\u001b[0m \u001b[38;5;66;03m# append the relative path to the base_path\u001b[39;00m\n\u001b[1;32m 453\u001b[0m url_or_filename \u001b[38;5;241m=\u001b[39m url_or_path_join(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_base_path, url_or_filename)\n\u001b[0;32m--> 454\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcached_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_or_filename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m)\u001b[49m\n",
436
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/file_utils.py:182\u001b[0m, in \u001b[0;36mcached_path\u001b[0;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[1;32m 178\u001b[0m url_or_filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(url_or_filename)\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_remote_url(url_or_filename):\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# URL, so get it from the cache (downloading if necessary)\u001b[39;00m\n\u001b[0;32m--> 182\u001b[0m output_path \u001b[38;5;241m=\u001b[39m \u001b[43mget_from_cache\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 183\u001b[0m \u001b[43m \u001b[49m\u001b[43murl_or_filename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_download\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresume_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43muser_agent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muser_agent\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 189\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 190\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_etag\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muse_etag\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 191\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 192\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 193\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_url_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mignore_url_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 195\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_desc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdownload_desc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 196\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(url_or_filename):\n\u001b[1;32m 198\u001b[0m \u001b[38;5;66;03m# File, and it exists.\u001b[39;00m\n\u001b[1;32m 199\u001b[0m output_path \u001b[38;5;241m=\u001b[39m url_or_filename\n",
437
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/file_utils.py:644\u001b[0m, in \u001b[0;36mget_from_cache\u001b[0;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, token, use_auth_token, ignore_url_params, storage_options, download_desc)\u001b[0m\n\u001b[1;32m 642\u001b[0m fsspec_get(url, temp_file, storage_options\u001b[38;5;241m=\u001b[39mstorage_options, desc\u001b[38;5;241m=\u001b[39mdownload_desc)\n\u001b[1;32m 643\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 644\u001b[0m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 645\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 646\u001b[0m \u001b[43m \u001b[49m\u001b[43mtemp_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 647\u001b[0m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 648\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 649\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 650\u001b[0m \u001b[43m \u001b[49m\u001b[43mcookies\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcookies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 651\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 652\u001b[0m \u001b[43m \u001b[49m\u001b[43mdesc\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_desc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 653\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 655\u001b[0m 
logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstoring \u001b[39m\u001b[38;5;132;01m{\u001b[39;00murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m in cache at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcache_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 656\u001b[0m shutil\u001b[38;5;241m.\u001b[39mmove(temp_file\u001b[38;5;241m.\u001b[39mname, cache_path)\n",
438
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/datasets/utils/file_utils.py:419\u001b[0m, in \u001b[0;36mhttp_get\u001b[0;34m(url, temp_file, proxies, resume_size, headers, cookies, timeout, max_retries, desc)\u001b[0m\n\u001b[1;32m 410\u001b[0m total \u001b[38;5;241m=\u001b[39m resume_size \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mint\u001b[39m(content_length) \u001b[38;5;28;01mif\u001b[39;00m content_length \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mtqdm(\n\u001b[1;32m 412\u001b[0m unit\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mB\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 413\u001b[0m unit_scale\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 417\u001b[0m disable\u001b[38;5;241m=\u001b[39m\u001b[38;5;129;01mnot\u001b[39;00m logging\u001b[38;5;241m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 418\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m progress:\n\u001b[0;32m--> 419\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m response\u001b[38;5;241m.\u001b[39miter_content(chunk_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1024\u001b[39m):\n\u001b[1;32m 420\u001b[0m progress\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;28mlen\u001b[39m(chunk))\n\u001b[1;32m 421\u001b[0m temp_file\u001b[38;5;241m.\u001b[39mwrite(chunk)\n",
439
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/requests/models.py:816\u001b[0m, in \u001b[0;36mResponse.iter_content.<locals>.generate\u001b[0;34m()\u001b[0m\n\u001b[1;32m 814\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstream\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 815\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 816\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mraw\u001b[38;5;241m.\u001b[39mstream(chunk_size, decode_content\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 817\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 818\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n",
440
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/urllib3/response.py:1091\u001b[0m, in \u001b[0;36mHTTPResponse.stream\u001b[0;34m(self, amt, decode_content)\u001b[0m\n\u001b[1;32m 1089\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1090\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1091\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1093\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data:\n\u001b[1;32m 1094\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m data\n",
441
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/urllib3/response.py:980\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt, decode_content, cache_content)\u001b[0m\n\u001b[1;32m 977\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m amt:\n\u001b[1;32m 978\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer\u001b[38;5;241m.\u001b[39mget(amt)\n\u001b[0;32m--> 980\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 982\u001b[0m flush_decoder \u001b[38;5;241m=\u001b[39m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m (amt \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data)\n\u001b[1;32m 984\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_decoded_buffer) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
442
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/urllib3/response.py:904\u001b[0m, in \u001b[0;36mHTTPResponse._raw_read\u001b[0;34m(self, amt, read1)\u001b[0m\n\u001b[1;32m 901\u001b[0m fp_closed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclosed\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 903\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_error_catcher():\n\u001b[0;32m--> 904\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fp_closed \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 905\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data:\n\u001b[1;32m 906\u001b[0m \u001b[38;5;66;03m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[1;32m 907\u001b[0m \u001b[38;5;66;03m# Close the connection when no data is returned\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 912\u001b[0m \u001b[38;5;66;03m# not properly close the connection in all cases. 
There is\u001b[39;00m\n\u001b[1;32m 913\u001b[0m \u001b[38;5;66;03m# no harm in redundantly calling close.\u001b[39;00m\n\u001b[1;32m 914\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mclose()\n",
443
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/site-packages/urllib3/response.py:887\u001b[0m, in \u001b[0;36mHTTPResponse._fp_read\u001b[0;34m(self, amt, read1)\u001b[0m\n\u001b[1;32m 884\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread1(amt) \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread1()\n\u001b[1;32m 885\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 886\u001b[0m \u001b[38;5;66;03m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[0;32m--> 887\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fp\u001b[38;5;241m.\u001b[39mread()\n",
444
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/http/client.py:466\u001b[0m, in \u001b[0;36mHTTPResponse.read\u001b[0;34m(self, amt)\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength:\n\u001b[1;32m 464\u001b[0m \u001b[38;5;66;03m# clip the read to the \"end of response\"\u001b[39;00m\n\u001b[1;32m 465\u001b[0m amt \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlength\n\u001b[0;32m--> 466\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 467\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[1;32m 468\u001b[0m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[1;32m 469\u001b[0m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_conn()\n",
445
- "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/socket.py:717\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 716\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 717\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n",
446
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
447
  ]
448
  }
449
  ],
 
84
  "# ===============================\n",
85
  "# Directory Configuration\n",
86
  "# ===============================\n",
87
+ "BASE_DIR = Path(\"../tests/dataset\")\n",
88
  "AI_DIR = BASE_DIR / \"ai\"\n",
89
  "REAL_DIR = BASE_DIR / \"real\"\n",
90
  "RAW_DIR = BASE_DIR / \"raw_downloads\"\n",
 
150
  },
151
  {
152
  "cell_type": "code",
153
+ "execution_count": 8,
154
  "id": "74106705-e2d6-411c-8193-8e02f5ee0fdc",
155
  "metadata": {},
156
  "outputs": [],
 
166
  " }]\n",
167
  " \n",
168
  "\n",
169
+ "REAL_DATASETS = [{\"name\" : \"imagenette\",\n",
170
+ " \"hf_id\" : \"frgfm/imagenette\",\n",
171
+ " \"config\" : \"320px\",\n",
 
 
172
  " \"split\" : \"train\",\n",
173
  " \"image_key\" : \"image\",\n",
174
  " \"label\" : \"real\",\n",
175
  " \"family\" : \"photographic\",\n",
 
176
  " }]\n",
177
  "\n",
178
  "# Kaggle datasets (public, non-scraped)\n",
 
201
  },
202
  {
203
  "cell_type": "code",
204
+ "execution_count": 9,
205
  "id": "a9ea5276-65bb-49f5-a656-c00ceeb1f4d3",
206
  "metadata": {},
207
  "outputs": [],
 
266
  },
267
  {
268
  "cell_type": "code",
269
+ "execution_count": 10,
270
  "id": "c6eca5e6-0469-4af6-8af8-afe3036cb0a8",
271
  "metadata": {},
272
  "outputs": [],
 
302
  },
303
  {
304
  "cell_type": "code",
305
+ "execution_count": 11,
306
  "id": "b648832e-5025-4851-af21-382051167a04",
307
  "metadata": {},
308
  "outputs": [],
 
352
  },
353
  {
354
  "cell_type": "code",
355
+ "execution_count": 12,
356
  "id": "dd8ef771-f39f-4d9d-8eaf-626ecc211141",
357
  "metadata": {},
358
  "outputs": [
 
368
  "name": "stderr",
369
  "output_type": "stream",
370
  "text": [
371
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [05:30<00:00, 3.02it/s]\n"
372
  ]
373
  },
374
  {
 
376
  "output_type": "stream",
377
  "text": [
378
  "\n",
379
+ "▶ Loading HF dataset: imagenette\n"
380
+ ]
381
+ },
382
+ {
383
+ "name": "stderr",
384
+ "output_type": "stream",
385
+ "text": [
386
+ "Using the latest cached version of the module from /Users/itobuz/.cache/huggingface/modules/datasets_modules/datasets/frgfm--imagenette/38929285b8abcae5c1305418e9d8fea5dd6b189bbbd22caba5f5537c7fa0f01f (last modified on Mon Dec 22 15:06:36 2025) since it couldn't be found locally at frgfm/imagenette., or remotely on the Hugging Face Hub.\n"
387
  ]
388
  },
389
  {
390
  "data": {
391
  "application/vnd.jupyter.widget-view+json": {
392
+ "model_id": "ce74ba00790b49fab546616010a4952d",
393
+ "version_major": 2,
394
+ "version_minor": 0
395
+ },
396
+ "text/plain": [
397
+ "Downloading data: 0%| | 0.00/342M [00:00<?, ?B/s]"
398
+ ]
399
+ },
400
+ "metadata": {},
401
+ "output_type": "display_data"
402
+ },
403
+ {
404
+ "data": {
405
+ "application/vnd.jupyter.widget-view+json": {
406
+ "model_id": "dcadae012cfa492f8e94ea1662cb8102",
407
  "version_major": 2,
408
  "version_minor": 0
409
  },
 
417
  {
418
  "data": {
419
  "application/vnd.jupyter.widget-view+json": {
420
+ "model_id": "3a0d3ab402554b7489a01f883e4d6572",
421
  "version_major": 2,
422
  "version_minor": 0
423
  },
424
  "text/plain": [
425
+ "Downloading data: 0.00B [00:00, ?B/s]"
426
  ]
427
  },
428
  "metadata": {},
429
  "output_type": "display_data"
430
  },
431
  {
432
+ "data": {
433
+ "application/vnd.jupyter.widget-view+json": {
434
+ "model_id": "9e65e45076104352951a4a71bd8d6da7",
435
+ "version_major": 2,
436
+ "version_minor": 0
437
+ },
438
+ "text/plain": [
439
+ "Downloading data: 0.00B [00:00, ?B/s]"
440
+ ]
441
+ },
442
+ "metadata": {},
443
+ "output_type": "display_data"
444
+ },
445
+ {
446
+ "data": {
447
+ "application/vnd.jupyter.widget-view+json": {
448
+ "model_id": "d714e865ab2a43dcb43a33dcf0df2be7",
449
+ "version_major": 2,
450
+ "version_minor": 0
451
+ },
452
+ "text/plain": [
453
+ "Generating train split: 0%| | 0/9469 [00:00<?, ? examples/s]"
454
+ ]
455
+ },
456
+ "metadata": {},
457
+ "output_type": "display_data"
458
+ },
459
+ {
460
+ "data": {
461
+ "application/vnd.jupyter.widget-view+json": {
462
+ "model_id": "49d1d87ad10548eebf9ba2709dbda441",
463
+ "version_major": 2,
464
+ "version_minor": 0
465
+ },
466
+ "text/plain": [
467
+ "Generating validation split: 0%| | 0/3925 [00:00<?, ? examples/s]"
468
+ ]
469
+ },
470
+ "metadata": {},
471
+ "output_type": "display_data"
472
+ },
473
+ {
474
+ "name": "stderr",
475
+ "output_type": "stream",
476
+ "text": [
477
+ " 11%|██████████████▋ | 1000/9469 [02:30<21:13, 6.65it/s]\n"
478
+ ]
479
+ },
480
+ {
481
+ "name": "stdout",
482
+ "output_type": "stream",
483
+ "text": [
484
+ "⬇ Downloading Kaggle dataset: tristanzhang32/ai-generated-images-vs-real-images\n"
485
+ ]
486
+ },
487
+ {
488
+ "name": "stderr",
489
+ "output_type": "stream",
490
+ "text": [
491
+ "Traceback (most recent call last):\n",
492
+ " File \"/Users/itobuz/.conda/envs/mvp_env/bin/kaggle\", line 7, in <module>\n",
493
+ " sys.exit(main())\n",
494
+ " File \"/Users/itobuz/.conda/envs/mvp_env/lib/python3.10/site-packages/kaggle/cli.py\", line 68, in main\n",
495
+ " out = args.func(**command_args)\n",
496
+ " File \"/Users/itobuz/.conda/envs/mvp_env/lib/python3.10/site-packages/kaggle/api/kaggle_api_extended.py\", line 1741, in dataset_download_cli\n",
497
+ " with self.build_kaggle_client() as kaggle:\n",
498
+ " File \"/Users/itobuz/.conda/envs/mvp_env/lib/python3.10/site-packages/kaggle/api/kaggle_api_extended.py\", line 688, in build_kaggle_client\n",
499
+ " username=self.config_values['username'],\n",
500
+ "KeyError: 'username'\n"
501
+ ]
502
+ },
503
+ {
504
+ "ename": "CalledProcessError",
505
+ "evalue": "Command '['kaggle', 'datasets', 'download', 'tristanzhang32/ai-generated-images-vs-real-images', '-p', '../tests/dataset/raw_downloads/ai_vs_real', '--unzip']' returned non-zero exit status 1.",
506
  "output_type": "error",
507
  "traceback": [
508
  "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
509
+ "\u001b[0;31mCalledProcessError\u001b[0m Traceback (most recent call last)",
510
+ "Cell \u001b[0;32mIn[12], line 46\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m# Entry Point\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# ===============================\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;18m__name__\u001b[39m \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__main__\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m---> 46\u001b[0m \u001b[43mmain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
511
+ "Cell \u001b[0;32mIn[12], line 18\u001b[0m, in \u001b[0;36mmain\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ds \u001b[38;5;129;01min\u001b[39;00m KAGGLE_DATASETS:\n\u001b[1;32m 17\u001b[0m raw_path \u001b[38;5;241m=\u001b[39m RAW_DIR \u001b[38;5;241m/\u001b[39m ds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m---> 18\u001b[0m \u001b[43mdownload_kaggle_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mds\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mkaggle_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mraw_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# AI images\u001b[39;00m\n\u001b[1;32m 21\u001b[0m ingest_image_folder(src_dir \u001b[38;5;241m=\u001b[39m raw_path \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mai\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 22\u001b[0m out_dir \u001b[38;5;241m=\u001b[39m AI_DIR \u001b[38;5;241m/\u001b[39m ds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 23\u001b[0m writer \u001b[38;5;241m=\u001b[39m writer,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 26\u001b[0m source \u001b[38;5;241m=\u001b[39m ds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 27\u001b[0m )\n",
512
+ "Cell \u001b[0;32mIn[10], line 10\u001b[0m, in \u001b[0;36mdownload_kaggle_dataset\u001b[0;34m(kaggle_id, out_dir)\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Downloading Kaggle dataset: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkaggle_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 10\u001b[0m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mkaggle\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdatasets\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdownload\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mkaggle_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m-p\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mout_dir\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m--unzip\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 14\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 16\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
513
+ "File \u001b[0;32m~/.conda/envs/mvp_env/lib/python3.10/subprocess.py:526\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 524\u001b[0m retcode \u001b[38;5;241m=\u001b[39m process\u001b[38;5;241m.\u001b[39mpoll()\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check \u001b[38;5;129;01mand\u001b[39;00m retcode:\n\u001b[0;32m--> 526\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m CalledProcessError(retcode, process\u001b[38;5;241m.\u001b[39margs,\n\u001b[1;32m 527\u001b[0m output\u001b[38;5;241m=\u001b[39mstdout, stderr\u001b[38;5;241m=\u001b[39mstderr)\n\u001b[1;32m 528\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m CompletedProcess(process\u001b[38;5;241m.\u001b[39margs, retcode, stdout, stderr)\n",
514
+ "\u001b[0;31mCalledProcessError\u001b[0m: Command '['kaggle', 'datasets', 'download', 'tristanzhang32/ai-generated-images-vs-real-images', '-p', '../tests/dataset/raw_downloads/ai_vs_real', '--unzip']' returned non-zero exit status 1."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  ]
516
  }
517
  ],
reporter/csv_reporter.py CHANGED
@@ -7,8 +7,8 @@ from utils.logger import get_logger
7
  from config.settings import settings
8
  from config.constants import MetricType
9
  from config.schemas import AnalysisResult
 
10
  from utils.helpers import generate_unique_id
11
- from config.constants import DetectionStatus
12
  from config.schemas import BatchAnalysisResult
13
  from features.detailed_result_maker import DetailedResultMaker
14
 
@@ -19,444 +19,248 @@ logger = get_logger(__name__)
19
 
20
  class CSVReporter:
21
  """
22
- Professional CSV report generator
23
-
24
- Features:
25
- ---------
26
- - Single image detailed reports
27
- - Batch summary reports with statistics
28
- - Detailed forensic data export
29
- - Excel-compatible formatting
30
- - UTF-8 encoding with BOM for international compatibility
31
  """
32
  def __init__(self):
33
  """
34
  Initialize CSV Reporter
35
  """
36
  self.detailed_maker = DetailedResultMaker()
 
37
  logger.debug("CSVReporter initialized")
38
-
39
 
40
  def export_batch_summary(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
41
  """
42
- Export batch analysis summary as CSV
43
-
44
- Arguments:
45
- ----------
46
- batch_result { BatchAnalysisResult } : Complete batch analysis result
47
-
48
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
49
-
50
- Returns:
51
- --------
52
- { Path } : Path to generated CSV file
53
  """
54
  output_dir = output_dir or settings.REPORTS_DIR
55
  report_id = generate_unique_id()
56
  filename = f"batch_summary_{report_id}.csv"
57
  output_path = output_dir / filename
58
-
59
  logger.info(f"Generating batch summary CSV: {filename}")
60
-
61
  try:
62
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
63
  writer = csv.writer(f)
64
-
65
- # Report Header
66
- self._write_report_header(writer = writer,
67
- report_type = "Batch Analysis Summary",
68
  timestamp = batch_result.timestamp,
69
  )
70
-
71
- # Batch Statistics
72
- self._write_batch_statistics(writer = writer,
73
- batch_result = batch_result,
74
- )
75
-
76
- # Main Results Table
77
  self._write_batch_results_table(writer = writer,
78
  batch_result = batch_result,
79
  )
80
-
81
- # Footer
82
  self._write_footer(writer = writer)
83
-
84
  logger.info(f"Batch summary CSV generated: {output_path}")
85
  return output_path
86
-
87
  except Exception as e:
88
  logger.error(f"Failed to generate batch summary CSV: {e}")
89
  raise
90
-
91
 
92
  def export_batch_detailed(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
93
  """
94
- Export detailed batch analysis with forensic data
95
-
96
- Arguments:
97
- ----------
98
- batch_result { BatchAnalysisResult } : Complete batch analysis result
99
-
100
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
101
-
102
- Returns:
103
- --------
104
- { Path } : Path to generated CSV file
105
  """
106
  output_dir = output_dir or settings.REPORTS_DIR
107
  report_id = generate_unique_id()
108
  filename = f"batch_detailed_{report_id}.csv"
109
  output_path = output_dir / filename
110
-
111
  logger.info(f"Generating detailed batch CSV: {filename}")
112
-
113
  try:
114
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
115
  writer = csv.writer(f)
116
-
117
- # Report Header
118
- self._write_report_header(writer = writer,
119
  report_type = "Detailed Batch Analysis",
120
  timestamp = batch_result.timestamp,
121
  )
122
-
123
- # Process each image with full details
124
  for idx, result in enumerate(batch_result.results, 1):
125
- self._write_detailed_image_section(writer = writer,
126
- result = result,
127
- image_number = idx,
128
- total_images = batch_result.processed,
129
  )
130
-
131
- # Add separator between images
132
  if (idx < batch_result.processed):
133
  writer.writerow([])
134
  writer.writerow(['=' * 100])
135
  writer.writerow([])
136
-
137
- # Footer
138
  self._write_footer(writer = writer)
139
-
140
  logger.info(f"Detailed batch CSV generated: {output_path}")
141
- return output_path
142
 
 
 
143
  except Exception as e:
144
  logger.error(f"Failed to generate detailed batch CSV: {e}")
145
  raise
146
-
147
 
148
  def export_single_detailed(self, result: AnalysisResult, output_dir: Optional[Path] = None) -> Path:
149
  """
150
- Export single image detailed analysis as CSV
151
-
152
- Arguments:
153
- ----------
154
- result { AnalysisResult } : Single image analysis result
155
-
156
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
157
-
158
- Returns:
159
- --------
160
- { Path } : Path to generated CSV file
161
  """
162
  output_dir = output_dir or settings.REPORTS_DIR
163
  report_id = generate_unique_id()
164
  filename = f"single_analysis_{report_id}.csv"
165
  output_path = output_dir / filename
166
-
167
  logger.info(f"Generating single image CSV: {filename}")
168
-
169
  try:
170
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
171
  writer = csv.writer(f)
172
-
173
- # Report Header
174
- self._write_report_header(writer = writer,
175
  report_type = "Single Image Analysis",
176
  timestamp = result.timestamp,
177
  )
178
-
179
- # Image Details
180
- self._write_detailed_image_section(writer = writer,
181
  result = result,
182
  image_number = 1,
183
  total_images = 1,
184
  )
185
-
186
- # Footer
187
  self._write_footer(writer = writer)
188
-
189
  logger.info(f"Single image CSV generated: {output_path}")
190
- return output_path
191
 
 
 
192
  except Exception as e:
193
  logger.error(f"Failed to generate single image CSV: {e}")
194
  raise
195
-
196
 
197
- def export_metrics_comparison(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
198
- """
199
- Export metrics comparison table across all images
200
-
201
- Arguments:
202
- ----------
203
- batch_result { BatchAnalysisResult } : Complete batch analysis result
204
-
205
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
206
-
207
- Returns:
208
- --------
209
- { Path } : Path to generated CSV file
210
- """
211
- output_dir = output_dir or settings.REPORTS_DIR
212
- report_id = generate_unique_id()
213
- filename = f"metrics_comparison_{report_id}.csv"
214
- output_path = output_dir / filename
215
-
216
- logger.info(f"Generating metrics comparison CSV: {filename}")
217
-
218
- try:
219
- with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
220
- writer = csv.writer(f)
221
-
222
- # Report Header
223
- self._write_report_header(writer = writer,
224
- report_type = "Metrics Comparison",
225
- timestamp = batch_result.timestamp,
226
- )
227
-
228
- # Comparison Table Header
229
- writer.writerow(['Metrics Comparison Across All Images'])
230
- writer.writerow([])
231
-
232
- header = ['Filename',
233
- 'Overall Score',
234
- 'Analysis Status',
235
- 'Gradient Analysis Score',
236
- 'Gradient Analysis Confidence',
237
- 'Frequency Analysis Score',
238
- 'Frequency Analysis Confidence',
239
- 'Noise Analysis Score',
240
- 'Noise Analysis Confidence',
241
- 'Texture Analysis Score',
242
- 'Texture Analysis Confidence',
243
- 'Color Analysis Score',
244
- 'Color Analysis Confidence',
245
- 'Processing Time',
246
- ]
247
-
248
- writer.writerow(header)
249
-
250
- # Data rows
251
- for result in batch_result.results:
252
- row = [result.filename,
253
- f"{result.overall_score:.3f}",
254
- result.status.value,
255
- ]
256
-
257
- # Add each metric's score and confidence
258
- for metric_type in [MetricType.GRADIENT, MetricType.FREQUENCY, MetricType.NOISE, MetricType.TEXTURE, MetricType.COLOR]:
259
- metric_result = result.metric_results.get(metric_type)
260
-
261
- if metric_result:
262
- row.append(f"{metric_result.score:.3f}")
263
- row.append(f"{metric_result.confidence:.3f}" if metric_result.confidence is not None else "N/A")
264
-
265
- else:
266
- row.extend(["N/A", "N/A"])
267
-
268
- row.append(f"{result.processing_time:.2f}s")
269
- writer.writerow(row)
270
-
271
- # Footer
272
- writer.writerow([])
273
- self._write_footer(writer = writer)
274
-
275
- logger.info(f"Metrics comparison CSV generated: {output_path}")
276
- return output_path
277
-
278
- except Exception as e:
279
- logger.error(f"Failed to generate metrics comparison CSV: {e}")
280
- raise
281
-
282
 
283
  def _write_report_header(self, writer, report_type: str, timestamp: datetime) -> None:
284
- """
285
- Write CSV report header
286
- """
287
  writer.writerow(['=' * 100])
288
  writer.writerow([f'AI Image Screener - {report_type}'])
289
  writer.writerow([f'Generated: {timestamp.strftime("%Y-%m-%d %H:%M:%S")}'])
290
  writer.writerow([f'Version: {settings.VERSION}'])
291
  writer.writerow(['=' * 100])
292
  writer.writerow([])
293
-
294
 
295
- def _write_batch_statistics(self, writer, batch_result: BatchAnalysisResult) -> None:
296
- """
297
- Write batch statistics section
298
- """
299
- writer.writerow(['BATCH STATISTICS'])
300
  writer.writerow([])
301
-
302
- stats = [['Total Images', batch_result.total_images],
303
- ['Successfully Processed', batch_result.processed],
304
- ['Failed', batch_result.failed],
305
- ['Success Rate', f"{batch_result.summary.get('success_rate', 0)}%"],
306
- ['' , ''],
307
- ['Likely Authentic', batch_result.summary.get('likely_authentic', 0)],
308
- ['Review Required', batch_result.summary.get('review_required', 0)],
309
- ['', ''],
310
- ['Average Score', f"{batch_result.summary.get('avg_score', 0):.3f}"],
311
- ['Average Confidence', f"{batch_result.summary.get('avg_confidence', 0)}%"],
312
- ['Total Processing Time', f"{batch_result.total_processing_time:.2f}s"],
313
- ['Average Time per Image', f"{batch_result.summary.get('avg_proc_time', 0):.2f}s"],
314
- ]
315
-
316
- for row in stats:
317
  writer.writerow(row)
318
-
319
  writer.writerow([])
320
  writer.writerow(['=' * 100])
321
  writer.writerow([])
322
-
323
 
324
  def _write_batch_results_table(self, writer, batch_result: BatchAnalysisResult) -> None:
325
- """
326
- Write batch results main table
327
- """
328
  writer.writerow(['ANALYSIS RESULTS'])
329
  writer.writerow([])
330
-
331
- # Table Header
332
- header = ['Filename',
333
- 'Image Size',
334
- 'Analysis Status',
335
- 'Overall Score',
336
- 'Analysis Confidence (%)',
337
- 'Top Warning Signals',
338
- 'Recommendation',
339
- 'Processing Time (s)',
340
  ]
341
 
342
  writer.writerow(header)
343
-
344
- # Data rows
345
  for result in batch_result.results:
346
- # Get top warning signals
347
- top_signals = [s.name for s in result.signals if s.status.value in ['flagged', 'warning']][:2]
348
- signals_str = "; ".join(top_signals) if top_signals else "All tests passed"
349
-
350
- # Recommendation
351
- if (result.status == DetectionStatus.REVIEW_REQUIRED):
352
- recommendation = "Manual verification recommended"
353
 
354
- else:
355
- recommendation = "No further action needed"
356
-
357
- row = [result.filename,
358
- f"{result.image_size[0]}×{result.image_size[1]}",
359
- result.status.value,
360
- f"{result.overall_score:.3f}",
361
- f"{result.confidence}%",
362
- signals_str,
363
- recommendation,
364
- f"{result.processing_time:.2f}",
365
- ]
366
-
367
- writer.writerow(row)
368
-
369
  writer.writerow([])
370
-
371
 
372
  def _write_detailed_image_section(self, writer, result: AnalysisResult, image_number: int, total_images: int) -> None:
373
- """
374
- Write detailed section for single image
375
- """
376
  writer.writerow([f'IMAGE {image_number} OF {total_images}'])
377
  writer.writerow([])
378
-
379
- # Basic Information
380
- writer.writerow(['BASIC INFORMATION'])
381
- writer.writerow(['Filename', result.filename])
382
- writer.writerow(['Status', result.status.value])
383
- writer.writerow(['Overall Score', f"{result.overall_score:.3f}"])
384
  writer.writerow(['Confidence', f"{result.confidence}%"])
385
- writer.writerow(['Image Size', f"{result.image_size[0]}×{result.image_size[1]}"])
386
- writer.writerow(['Processing Time', f"{result.processing_time:.2f}s"])
387
- writer.writerow(['Timestamp', result.timestamp.isoformat()])
388
- writer.writerow([])
389
-
390
- # Detection Signals
391
- writer.writerow(['DETECTION SIGNALS'])
392
  writer.writerow([])
393
- writer.writerow(['Metric Name', 'Metric Score', 'Analysis Status', 'Metric Confidence', 'Metric Explanation'])
394
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  for signal in result.signals:
396
- metric_result = result.metric_results.get(signal.metric_type)
397
- confidence_str = f"{metric_result.confidence:.3f}" if metric_result.confidence is not None else "N/A"
398
-
399
  writer.writerow([signal.name,
400
  f"{signal.score:.3f}",
401
- signal.status.value.upper(),
402
- confidence_str,
403
- signal.explanation.replace("\n", " "),
404
  ])
405
-
406
- writer.writerow([])
407
-
408
- # Detailed Forensics
409
- writer.writerow(['FORENSIC DETAILS'])
410
- writer.writerow([])
411
 
412
- for metric_type in MetricType:
413
- metric_result = result.metric_results.get(metric_type)
414
-
415
- if not metric_result:
416
- continue
417
-
418
- metric_name = self.detailed_maker.metric_display_names.get(metric_type, metric_type.value)
419
-
420
- writer.writerow([f'--- {metric_name} ---'])
421
- writer.writerow(['Score', f"{metric_result.score:.3f}"])
422
- writer.writerow(['Confidence', f"{metric_result.confidence:.3f}" if metric_result.confidence is not None else "N/A"])
423
-
424
- # Write details
425
- if metric_result.details:
426
- for key, value in metric_result.details.items():
427
- if isinstance(value, dict):
428
- writer.writerow([f" {key}:", ""])
429
- for sub_key, sub_value in value.items():
430
- writer.writerow([f" {sub_key}", str(sub_value)])
431
-
432
- else:
433
- writer.writerow([f" {key}", str(value)])
434
-
435
- writer.writerow([])
436
-
437
- # Recommendation
438
- writer.writerow(['RECOMMENDATION'])
439
- writer.writerow([])
440
-
441
- if (result.status == DetectionStatus.REVIEW_REQUIRED):
442
- writer.writerow(['Action', 'Manual verification recommended'])
443
- writer.writerow(['Priority', 'HIGH' if (result.overall_score >= 0.85) else 'MEDIUM'])
444
- writer.writerow(['Next Steps', 'Forensic analysis, reverse image search, metadata inspection'])
445
-
446
- else:
447
- writer.writerow(['Action', 'No immediate action needed'])
448
- writer.writerow(['Priority', 'LOW'])
449
- writer.writerow(['Next Steps', 'Proceed with normal workflow'])
450
-
451
  writer.writerow([])
452
-
453
 
454
  def _write_footer(self, writer) -> None:
455
- """
456
- Write CSV report footer
457
- """
458
  writer.writerow(['=' * 100])
459
  writer.writerow(['Report generated by AI Image Screener'])
460
- writer.writerow(['For questions or support, contact: support@aiimagescreener.com'])
461
- writer.writerow(['DISCLAIMER: Results are indicative and should be verified manually for critical applications'])
462
  writer.writerow(['=' * 100])
 
7
  from config.settings import settings
8
  from config.constants import MetricType
9
  from config.schemas import AnalysisResult
10
+ from config.constants import FinalDecision
11
  from utils.helpers import generate_unique_id
 
12
  from config.schemas import BatchAnalysisResult
13
  from features.detailed_result_maker import DetailedResultMaker
14
 
 
19
 
20
  class CSVReporter:
21
  """
22
+ CSV report generator
23
+
24
+ Guarantees:
25
+ -----------
26
+ - FinalDecision is authoritative
27
+ - Metrics are informational only
28
+ - Evidence-first reporting
29
+ - Audit-safe CSV structure
 
30
  """
31
  def __init__(self):
32
  """
33
  Initialize CSV Reporter
34
  """
35
  self.detailed_maker = DetailedResultMaker()
36
+
37
  logger.debug("CSVReporter initialized")
38
+
39
 
40
  def export_batch_summary(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
41
  """
42
+ Export batch decision summary as CSV
 
 
 
 
 
 
 
 
 
 
43
  """
44
  output_dir = output_dir or settings.REPORTS_DIR
45
  report_id = generate_unique_id()
46
  filename = f"batch_summary_{report_id}.csv"
47
  output_path = output_dir / filename
48
+
49
  logger.info(f"Generating batch summary CSV: {filename}")
50
+
51
  try:
52
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
53
  writer = csv.writer(f)
54
+
55
+ self._write_report_header(writer,
56
+ report_type = "Batch Decision Summary",
 
57
  timestamp = batch_result.timestamp,
58
  )
59
+
60
+ self._write_batch_decision_statistics(writer = writer,
61
+ batch_result = batch_result,
62
+ )
63
+
 
 
64
  self._write_batch_results_table(writer = writer,
65
  batch_result = batch_result,
66
  )
67
+
 
68
  self._write_footer(writer = writer)
69
+
70
  logger.info(f"Batch summary CSV generated: {output_path}")
71
  return output_path
72
+
73
  except Exception as e:
74
  logger.error(f"Failed to generate batch summary CSV: {e}")
75
  raise
76
+
77
 
78
  def export_batch_detailed(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
79
  """
80
+ Export detailed batch forensic CSV
 
 
 
 
 
 
 
 
 
 
81
  """
82
  output_dir = output_dir or settings.REPORTS_DIR
83
  report_id = generate_unique_id()
84
  filename = f"batch_detailed_{report_id}.csv"
85
  output_path = output_dir / filename
86
+
87
  logger.info(f"Generating detailed batch CSV: {filename}")
88
+
89
  try:
90
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
91
  writer = csv.writer(f)
92
+
93
+ self._write_report_header(writer,
 
94
  report_type = "Detailed Batch Analysis",
95
  timestamp = batch_result.timestamp,
96
  )
97
+
 
98
  for idx, result in enumerate(batch_result.results, 1):
99
+ self._write_detailed_image_section(writer,
100
+ result = result,
101
+ image_number = idx,
102
+ total_images = batch_result.processed,
103
  )
104
+
 
105
  if (idx < batch_result.processed):
106
  writer.writerow([])
107
  writer.writerow(['=' * 100])
108
  writer.writerow([])
109
+
 
110
  self._write_footer(writer = writer)
111
+
112
  logger.info(f"Detailed batch CSV generated: {output_path}")
 
113
 
114
+ return output_path
115
+
116
  except Exception as e:
117
  logger.error(f"Failed to generate detailed batch CSV: {e}")
118
  raise
119
+
120
 
121
  def export_single_detailed(self, result: AnalysisResult, output_dir: Optional[Path] = None) -> Path:
122
  """
123
+ Export single image detailed CSV
 
 
 
 
 
 
 
 
 
 
124
  """
125
  output_dir = output_dir or settings.REPORTS_DIR
126
  report_id = generate_unique_id()
127
  filename = f"single_analysis_{report_id}.csv"
128
  output_path = output_dir / filename
129
+
130
  logger.info(f"Generating single image CSV: {filename}")
131
+
132
  try:
133
  with open(output_path, 'w', newline = '', encoding = 'utf-8-sig') as f:
134
  writer = csv.writer(f)
135
+
136
+ self._write_report_header(writer,
 
137
  report_type = "Single Image Analysis",
138
  timestamp = result.timestamp,
139
  )
140
+
141
+ self._write_detailed_image_section(writer,
 
142
  result = result,
143
  image_number = 1,
144
  total_images = 1,
145
  )
146
+
 
147
  self._write_footer(writer = writer)
148
+
149
  logger.info(f"Single image CSV generated: {output_path}")
 
150
 
151
+ return output_path
152
+
153
  except Exception as e:
154
  logger.error(f"Failed to generate single image CSV: {e}")
155
  raise
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  def _write_report_header(self, writer, report_type: str, timestamp: datetime) -> None:
 
 
 
159
  writer.writerow(['=' * 100])
160
  writer.writerow([f'AI Image Screener - {report_type}'])
161
  writer.writerow([f'Generated: {timestamp.strftime("%Y-%m-%d %H:%M:%S")}'])
162
  writer.writerow([f'Version: {settings.VERSION}'])
163
  writer.writerow(['=' * 100])
164
  writer.writerow([])
 
165
 
166
+
167
+ def _write_batch_decision_statistics(self, writer, batch_result: BatchAnalysisResult) -> None:
168
+ writer.writerow(['BATCH DECISION STATISTICS'])
 
 
169
  writer.writerow([])
170
+
171
+ summary = batch_result.summary or {}
172
+
173
+ rows = [['Total Images', batch_result.total_images],
174
+ ['Processed', batch_result.processed],
175
+ ['Failed', batch_result.failed],
176
+ ['Success Rate', f"{summary.get('success_rate', 0)}%"],
177
+ ['', ''],
178
+ ]
179
+
180
+ for decision in FinalDecision:
181
+ rows.append([decision.value, summary.get(decision.value, 0)])
182
+
183
+ rows.append(['Total Processing Time', f"{batch_result.total_processing_time:.2f}s"])
184
+
185
+ for row in rows:
186
  writer.writerow(row)
187
+
188
  writer.writerow([])
189
  writer.writerow(['=' * 100])
190
  writer.writerow([])
191
+
192
 
193
  def _write_batch_results_table(self, writer, batch_result: BatchAnalysisResult) -> None:
 
 
 
194
  writer.writerow(['ANALYSIS RESULTS'])
195
  writer.writerow([])
196
+
197
+ header = ['Filename',
198
+ 'Final Decision',
199
+ 'Decision Confidence (%)',
200
+ 'Overall Score (informational)',
201
+ 'Decision Explanation',
202
+ 'Processing Time (s)',
 
 
 
203
  ]
204
 
205
  writer.writerow(header)
206
+
 
207
  for result in batch_result.results:
208
+ writer.writerow([result.filename,
209
+ result.final_decision.value,
210
+ f"{result.confidence}%",
211
+ f"{result.overall_score:.3f}",
212
+ (result.decision_explanation or '').replace("\n", " "),
213
+ f"{result.processing_time:.2f}",
214
+ ])
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  writer.writerow([])
217
+
218
 
219
  def _write_detailed_image_section(self, writer, result: AnalysisResult, image_number: int, total_images: int) -> None:
 
 
 
220
  writer.writerow([f'IMAGE {image_number} OF {total_images}'])
221
  writer.writerow([])
222
+
223
+ # Decision Summary
224
+ writer.writerow(['FINAL DECISION'])
225
+ writer.writerow(['Decision', result.final_decision.value])
 
 
226
  writer.writerow(['Confidence', f"{result.confidence}%"])
227
+ writer.writerow(['Explanation', result.decision_explanation or ''])
 
 
 
 
 
 
228
  writer.writerow([])
229
+
230
+ # Evidence Summary
231
+ if result.evidence:
232
+ writer.writerow(['EVIDENCE SUMMARY'])
233
+ writer.writerow(['Source', 'Direction', 'Strength', 'Confidence', 'Finding'])
234
+
235
+ for e in result.evidence:
236
+ writer.writerow([e.source.value,
237
+ e.direction.value,
238
+ e.strength.value,
239
+ f"{e.confidence:.3f}" if e.confidence is not None else 'N/A',
240
+ e.finding.replace("\n", " "),
241
+ ])
242
+
243
+ writer.writerow([])
244
+
245
+ # Metric Signals (Informational)
246
+ writer.writerow(['METRIC SIGNALS (INFORMATIONAL)'])
247
+ writer.writerow(['Metric', 'Score', 'Status', 'Confidence'])
248
+
249
  for signal in result.signals:
250
+ metric_result = result.metric_results.get(signal.metric_type)
251
+
 
252
  writer.writerow([signal.name,
253
  f"{signal.score:.3f}",
254
+ signal.status.value,
255
+ f"{metric_result.confidence:.3f}" if (metric_result and metric_result.confidence is not None) else 'N/A',
 
256
  ])
 
 
 
 
 
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  writer.writerow([])
259
+
260
 
261
  def _write_footer(self, writer) -> None:
 
 
 
262
  writer.writerow(['=' * 100])
263
  writer.writerow(['Report generated by AI Image Screener'])
264
+ writer.writerow(['DISCLAIMER: Statistical signals are non-decisional'])
265
+ writer.writerow(['Final decisions are policy-based and auditable'])
266
  writer.writerow(['=' * 100])
reporter/json_reporter.py CHANGED
@@ -8,6 +8,7 @@ from datetime import datetime
8
  from utils.logger import get_logger
9
  from config.settings import settings
10
  from config.schemas import AnalysisResult
 
11
  from utils.helpers import generate_unique_id
12
  from config.schemas import BatchAnalysisResult
13
  from features.detailed_result_maker import DetailedResultMaker
@@ -19,39 +20,27 @@ logger = get_logger(__name__)
19
 
20
  class JSONReporter:
21
  """
22
- Professional JSON report generator
23
-
24
- Features:
25
- ---------
26
- - Machine-readable structured format
27
- - API-friendly output
28
- - Complete data preservation
29
- - Pretty-printed for readability
30
- - Nested structure for complex data
31
  """
32
  def __init__(self):
33
  """
34
  Initialize JSON Reporter
35
  """
36
  self.detailed_maker = DetailedResultMaker()
 
37
  logger.debug("JSONReporter initialized")
38
 
39
 
40
  def export_batch(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None, include_detailed: bool = True) -> Path:
41
  """
42
  Export batch analysis as JSON
43
-
44
- Arguments:
45
- ----------
46
- batch_result { BatchAnalysisResult } : Complete batch analysis result
47
-
48
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
49
-
50
- include_detailed { bool } : Include detailed forensic data
51
-
52
- Returns:
53
- --------
54
- { Path } : Path to generated JSON file
55
  """
56
  output_dir = output_dir or settings.REPORTS_DIR
57
  report_id = generate_unique_id()
@@ -59,47 +48,33 @@ class JSONReporter:
59
  output_path = output_dir / filename
60
 
61
  output_dir.mkdir(parents = True, exist_ok = True)
62
-
63
  logger.info(f"Generating batch JSON: {filename}")
64
-
65
  try:
66
- # Build JSON structure
67
  data = self._build_batch_json(batch_result = batch_result,
68
  include_detailed = include_detailed,
69
  )
70
-
71
- # Write to file
72
  with open(output_path, 'w', encoding = 'utf-8') as f:
73
- json.dump(obj = data,
74
- fp = f,
75
- indent = 4,
76
- ensure_ascii = False,
77
  default = str,
78
  )
79
-
80
  logger.info(f"Batch JSON generated: {output_path}")
81
  return output_path
82
-
83
  except Exception as e:
84
  logger.error(f"Failed to generate batch JSON: {e}")
85
  raise
86
 
87
 
88
- def export_single(self, result: AnalysisResult, output_dir: Optional[Path] = None, include_detailed: bool = True) -> Path:
 
89
  """
90
  Export single image analysis as JSON
91
-
92
- Arguments:
93
- ----------
94
- result { AnalysisResult } : Single image analysis result
95
-
96
- output_dir { Path } : Output directory (defaults to settings.REPORTS_DIR)
97
-
98
- include_detailed { bool } : Include detailed forensic data
99
-
100
- Returns:
101
- --------
102
- { Path } : Path to generated JSON file
103
  """
104
  output_dir = output_dir or settings.REPORTS_DIR
105
  report_id = generate_unique_id()
@@ -107,27 +82,24 @@ class JSONReporter:
107
  output_path = output_dir / filename
108
 
109
  output_dir.mkdir(parents = True, exist_ok = True)
110
-
111
  logger.info(f"Generating single image JSON: {filename}")
112
-
113
  try:
114
- # Build JSON structure
115
  data = self._build_single_json(result = result,
116
  include_detailed = include_detailed,
117
  )
118
-
119
- # Write to file
120
  with open(output_path, 'w', encoding = 'utf-8') as f:
121
- json.dump(obj = data,
122
- fp = f,
123
- indent = 4,
124
- ensure_ascii = False,
125
  default = str,
126
  )
127
-
128
  logger.info(f"Single image JSON generated: {output_path}")
129
  return output_path
130
-
131
  except Exception as e:
132
  logger.error(f"Failed to generate single image JSON: {e}")
133
  raise
@@ -135,15 +107,7 @@ class JSONReporter:
135
 
136
  def export_api_response(self, result: AnalysisResult) -> Dict:
137
  """
138
- Generate API-friendly JSON response (in-memory, no file)
139
-
140
- Arguments:
141
- ----------
142
- result { AnalysisResult } : Analysis result
143
-
144
- Returns:
145
- --------
146
- { dict } : API response dictionary
147
  """
148
  return {"success" : True,
149
  "timestamp" : datetime.now().isoformat(),
@@ -158,36 +122,23 @@ class JSONReporter:
158
  """
159
  Build complete batch JSON structure
160
  """
161
- data = {"report_metadata" : self._build_metadata(report_type = "Batch Analysis",
162
  timestamp = batch_result.timestamp,
163
  ),
164
- "batch_summary" : self._build_batch_summary(batch_result = batch_result),
165
- "results" : [],
166
  }
167
-
168
- # Add each image result
169
- for result in batch_result.results:
170
- image_data = self._build_image_data(result = result,
171
- include_detailed = include_detailed,
172
- )
173
- data["results"].append(image_data)
174
-
175
- return data
176
 
177
 
178
  def _build_single_json(self, result: AnalysisResult, include_detailed: bool) -> Dict:
179
  """
180
  Build single image JSON structure
181
  """
182
- data = {"report_metadata" : self._build_metadata(report_type = "Single Image Analysis",
183
  timestamp = result.timestamp,
184
  ),
185
- "analysis" : self._build_image_data(result = result,
186
- include_detailed = include_detailed,
187
- ),
188
  }
189
-
190
- return data
191
 
192
 
193
  def _build_metadata(self, report_type: str, timestamp: datetime) -> Dict:
@@ -204,17 +155,20 @@ class JSONReporter:
204
 
205
  def _build_batch_summary(self, batch_result: BatchAnalysisResult) -> Dict:
206
  """
207
- Build batch summary section
208
  """
 
 
209
  return {"total_images" : batch_result.total_images,
210
  "processed" : batch_result.processed,
211
  "failed" : batch_result.failed,
212
- "success_rate" : batch_result.summary.get('success_rate', 0),
213
- "statistics" : {"likely_authentic" : batch_result.summary.get('likely_authentic', 0),
214
- "review_required" : batch_result.summary.get('review_required', 0),
215
- "avg_score" : batch_result.summary.get('avg_score', 0.0),
216
- "avg_confidence" : batch_result.summary.get('avg_confidence', 0),
217
- "avg_proc_time" : batch_result.summary.get('avg_proc_time', 0.0),
 
218
  },
219
  "total_processing_time" : round(batch_result.total_processing_time, 2),
220
  }
@@ -222,128 +176,100 @@ class JSONReporter:
222
 
223
  def _build_image_data(self, result: AnalysisResult, include_detailed: bool) -> Dict:
224
  """
225
- Build complete image data structure
226
  """
227
- image_data = {"filename" : result.filename,
228
- "status" : result.status.value,
229
- "overall" : {"score" : round(result.overall_score, 3),
230
- "confidence" : result.confidence,
231
- "interpretation" : self._interpret_score(score = result.overall_score),
232
- },
233
- "image_info" : {"size" : {"width" : result.image_size[0],
234
- "height" : result.image_size[1],
235
- },
236
- "processing_time" : round(result.processing_time, 2),
237
- "timestamp" : result.timestamp.isoformat(),
238
- },
239
- "signals" : self._build_signals_data(result = result),
 
 
240
  }
241
-
242
- # Add detailed forensics if requested
243
  if include_detailed:
244
- image_data["forensics"] = self._build_forensics_data(result = result)
245
- image_data["recommendations"] = self._build_recommendations(result = result)
246
-
247
  return image_data
248
 
249
 
250
  def _build_signals_data(self, result: AnalysisResult) -> List[Dict]:
251
  """
252
- Build signals data structure
253
  """
254
  signals = list()
255
-
256
  for signal in result.signals:
257
  metric_result = result.metric_results.get(signal.metric_type)
258
-
259
- signal_data = {"metric_name" : signal.name,
260
- "metric_type" : signal.metric_type.value,
261
- "score" : round(signal.score, 3),
262
- "status" : signal.status.value,
263
- "confidence" : round(metric_result.confidence, 3) if (metric_result and metric_result.confidence is not None) else None,
264
- "explanation" : signal.explanation,
265
- }
266
-
267
- signals.append(signal_data)
268
-
269
  return signals
270
 
271
 
272
  def _build_forensics_data(self, result: AnalysisResult) -> Dict:
273
  """
274
- Build detailed forensics data structure
275
  """
276
  forensics = dict()
277
-
278
  for metric_type, metric_result in result.metric_results.items():
279
- metric_name = self.detailed_maker.metric_display_names.get(metric_type, metric_type.value)
280
-
281
- forensics[metric_type.value] = {"display_name" : metric_name,
282
  "score" : round(metric_result.score, 3),
283
- "confidence" : round(metric_result.confidence, 3) if (metric_result and metric_result.confidence is not None) else None,
284
  "details" : metric_result.details or {},
285
- "key_findings" : self.detailed_maker.extract_key_findings(metric_type = metric_type,
286
- metric_result = metric_result,
287
- ),
288
  }
289
-
290
  return forensics
291
 
292
 
293
  def _build_recommendations(self, result: AnalysisResult) -> Dict:
294
  """
295
- Build recommendations structure
296
  """
297
- score = result.overall_score
298
-
299
- if (score >= 0.85):
300
- return {"action" : "Immediate manual verification required",
301
- "priority" : "HIGH",
302
- "risk_level" : "CRITICAL",
303
- "next_steps" : ["Forensic analysis", "Reverse image search", "Metadata inspection"],
304
- "confidence" : "Very high likelihood of AI generation",
305
  }
306
-
307
- elif (score >= 0.70):
308
- return {"action" : "Manual verification recommended",
309
- "priority" : "MEDIUM",
310
- "risk_level" : "HIGH",
311
- "next_steps" : ["Visual inspection", "Compare with authentic samples"],
312
- "confidence" : "High likelihood of AI generation",
313
  }
314
-
315
- elif (score >= 0.50):
316
- return {"action" : "Optional review suggested",
317
- "priority" : "LOW",
318
- "risk_level" : "MEDIUM",
319
- "next_steps" : ["Verify image source", "Check for inconsistencies"],
320
- "confidence" : "Moderate indicators present",
321
  }
322
-
323
- else:
324
- return {"action" : "No immediate action required",
325
- "priority" : "NONE",
326
- "risk_level" : "LOW",
327
- "next_steps" : ["Proceed with normal workflow"],
328
- "confidence" : "Low likelihood of AI generation",
329
  }
330
-
331
 
332
- def _interpret_score(self, score: float) -> str:
333
- """
334
- Interpret score for human readability
335
- """
336
- if (score >= 0.85):
337
- return "Very high suspicion"
338
-
339
- elif (score >= 0.70):
340
- return "High suspicion"
341
-
342
- elif (score >= 0.50):
343
- return "Moderate suspicion"
344
-
345
- elif (score >= 0.30):
346
- return "Low suspicion"
347
-
348
- else:
349
- return "Very low suspicion"
 
8
  from utils.logger import get_logger
9
  from config.settings import settings
10
  from config.schemas import AnalysisResult
11
+ from config.constants import FinalDecision
12
  from utils.helpers import generate_unique_id
13
  from config.schemas import BatchAnalysisResult
14
  from features.detailed_result_maker import DetailedResultMaker
 
20
 
21
  class JSONReporter:
22
  """
23
+ JSON report generator
24
+
25
+ Guarantees:
26
+ -----------
27
+ - FinalDecision is authoritative
28
+ - Metrics are informational only
29
+ - Evidence-first interpretation
30
+ - Audit-safe output
 
31
  """
32
  def __init__(self):
33
  """
34
  Initialize JSON Reporter
35
  """
36
  self.detailed_maker = DetailedResultMaker()
37
+
38
  logger.debug("JSONReporter initialized")
39
 
40
 
41
  def export_batch(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None, include_detailed: bool = True) -> Path:
42
  """
43
  Export batch analysis as JSON
 
 
 
 
 
 
 
 
 
 
 
 
44
  """
45
  output_dir = output_dir or settings.REPORTS_DIR
46
  report_id = generate_unique_id()
 
48
  output_path = output_dir / filename
49
 
50
  output_dir.mkdir(parents = True, exist_ok = True)
 
51
  logger.info(f"Generating batch JSON: {filename}")
52
+
53
  try:
 
54
  data = self._build_batch_json(batch_result = batch_result,
55
  include_detailed = include_detailed,
56
  )
57
+
 
58
  with open(output_path, 'w', encoding = 'utf-8') as f:
59
+ json.dump(obj = data,
60
+ fp = f,
61
+ indent = 4,
62
+ ensure_ascii = False,
63
  default = str,
64
  )
65
+
66
  logger.info(f"Batch JSON generated: {output_path}")
67
  return output_path
68
+
69
  except Exception as e:
70
  logger.error(f"Failed to generate batch JSON: {e}")
71
  raise
72
 
73
 
74
+ def export_single(self, result: AnalysisResult, output_dir: Optional[Path] = None, include_detailed: bool = True,
75
+ ) -> Path:
76
  """
77
  Export single image analysis as JSON
 
 
 
 
 
 
 
 
 
 
 
 
78
  """
79
  output_dir = output_dir or settings.REPORTS_DIR
80
  report_id = generate_unique_id()
 
82
  output_path = output_dir / filename
83
 
84
  output_dir.mkdir(parents = True, exist_ok = True)
 
85
  logger.info(f"Generating single image JSON: {filename}")
86
+
87
  try:
 
88
  data = self._build_single_json(result = result,
89
  include_detailed = include_detailed,
90
  )
91
+
 
92
  with open(output_path, 'w', encoding = 'utf-8') as f:
93
+ json.dump(obj = data,
94
+ fp = f,
95
+ indent = 4,
96
+ ensure_ascii = False,
97
  default = str,
98
  )
99
+
100
  logger.info(f"Single image JSON generated: {output_path}")
101
  return output_path
102
+
103
  except Exception as e:
104
  logger.error(f"Failed to generate single image JSON: {e}")
105
  raise
 
107
 
108
  def export_api_response(self, result: AnalysisResult) -> Dict:
109
  """
110
+ Generate API-friendly JSON response
 
 
 
 
 
 
 
 
111
  """
112
  return {"success" : True,
113
  "timestamp" : datetime.now().isoformat(),
 
122
  """
123
  Build complete batch JSON structure
124
  """
125
+ return {"report_metadata" : self._build_metadata(report_type = "Batch Analysis",
126
  timestamp = batch_result.timestamp,
127
  ),
128
+ "batch_summary" : self._build_batch_summary(batch_result),
129
+ "results" : [self._build_image_data(result, include_detailed) for result in batch_result.results],
130
  }
 
 
 
 
 
 
 
 
 
131
 
132
 
133
  def _build_single_json(self, result: AnalysisResult, include_detailed: bool) -> Dict:
134
  """
135
  Build single image JSON structure
136
  """
137
+ return {"report_metadata" : self._build_metadata(report_type = "Single Image Analysis",
138
  timestamp = result.timestamp,
139
  ),
140
+ "analysis" : self._build_image_data(result, include_detailed),
 
 
141
  }
 
 
142
 
143
 
144
  def _build_metadata(self, report_type: str, timestamp: datetime) -> Dict:
 
155
 
156
  def _build_batch_summary(self, batch_result: BatchAnalysisResult) -> Dict:
157
  """
158
+ Build batch summary (decision-aware)
159
  """
160
+ summary = batch_result.summary or {}
161
+
162
  return {"total_images" : batch_result.total_images,
163
  "processed" : batch_result.processed,
164
  "failed" : batch_result.failed,
165
+ "success_rate" : summary.get("success_rate", 0),
166
+ "decision_distribution" : {key : summary.get(key, 0)
167
+ for key in [FinalDecision.CONFIRMED_AI_GENERATED.value,
168
+ FinalDecision.SUSPICIOUS_AI_LIKELY.value,
169
+ FinalDecision.AUTHENTIC_BUT_REVIEW.value,
170
+ FinalDecision.MOSTLY_AUTHENTIC.value,
171
+ ]
172
  },
173
  "total_processing_time" : round(batch_result.total_processing_time, 2),
174
  }
 
176
 
177
  def _build_image_data(self, result: AnalysisResult, include_detailed: bool) -> Dict:
178
  """
179
+ Build complete image data structure (decision-first)
180
  """
181
+ image_data = {"filename" : result.filename,
182
+ "decision" : {"value" : result.final_decision.value if result.final_decision else None,
183
+ "confidence" : result.confidence,
184
+ "explanation" : result.decision_explanation,
185
+ },
186
+ "overall" : {"score" : round(result.overall_score, 3),
187
+ "note" : "Statistical score (non-authoritative)",
188
+ },
189
+ "image_info" : {"size" : {"width" : result.image_size[0],
190
+ "height" : result.image_size[1],
191
+ },
192
+ "processing_time" : round(result.processing_time, 2),
193
+ "timestamp" : result.timestamp.isoformat(),
194
+ },
195
+ "signals" : self._build_signals_data(result),
196
  }
197
+
 
198
  if include_detailed:
199
+ image_data["forensics"] = self._build_forensics_data(result)
200
+ image_data["recommendations"] = self._build_recommendations(result)
201
+
202
  return image_data
203
 
204
 
205
  def _build_signals_data(self, result: AnalysisResult) -> List[Dict]:
206
  """
207
+ Build Tier-1 signal data (informational)
208
  """
209
  signals = list()
210
+
211
  for signal in result.signals:
212
  metric_result = result.metric_results.get(signal.metric_type)
213
+
214
+ signals.append({"metric_name" : signal.name,
215
+ "metric_type" : signal.metric_type.value,
216
+ "score" : round(signal.score, 3),
217
+ "status" : signal.status.value,
218
+ "confidence" : round(metric_result.confidence, 3) if (metric_result and metric_result.confidence is not None) else None,
219
+ "explanation" : signal.explanation,
220
+ })
221
+
 
 
222
  return signals
223
 
224
 
225
  def _build_forensics_data(self, result: AnalysisResult) -> Dict:
226
  """
227
+ Build forensic metric details
228
  """
229
  forensics = dict()
230
+
231
  for metric_type, metric_result in result.metric_results.items():
232
+ forensics[metric_type.value] = {"display_name" : self.detailed_maker.metric_display_names.get(metric_type, metric_type.value),
 
 
233
  "score" : round(metric_result.score, 3),
234
+ "confidence" : round(metric_result.confidence, 3) if metric_result.confidence is not None else None,
235
  "details" : metric_result.details or {},
236
+ "key_findings" : self.detailed_maker.extract_key_findings(metric_type, metric_result),
 
 
237
  }
238
+
239
  return forensics
240
 
241
 
242
  def _build_recommendations(self, result: AnalysisResult) -> Dict:
243
  """
244
+ Build recommendations (decision-driven, not score-driven)
245
  """
246
+ decision = result.final_decision
247
+
248
+ if (decision == FinalDecision.CONFIRMED_AI_GENERATED):
249
+ return {"action" : "Block or flag image immediately",
250
+ "priority" : "CRITICAL",
251
+ "next_steps" : ["Audit source", "Apply AI-content policy"],
 
 
252
  }
253
+
254
+ if (decision == FinalDecision.SUSPICIOUS_AI_LIKELY):
255
+ return {"action" : "Manual review required",
256
+ "priority" : "HIGH",
257
+ "next_steps" : ["Human inspection", "Cross-check metadata"],
 
 
258
  }
259
+
260
+ if (decision == FinalDecision.AUTHENTIC_BUT_REVIEW):
261
+ return {"action" : "Optional human review",
262
+ "priority" : "MEDIUM",
263
+ "next_steps" : ["Spot-check authenticity"],
 
 
264
  }
265
+
266
+ if (decision == FinalDecision.MOSTLY_AUTHENTIC):
267
+ return {"action" : "No action required",
268
+ "priority" : "LOW",
269
+ "next_steps" : ["Proceed normally"],
 
 
270
  }
 
271
 
272
+ return {"action" : "Decision unavailable",
273
+ "priority" : "UNKNOWN",
274
+ "next_steps" : [],
275
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
reporter/pdf_reporter.py CHANGED
@@ -1,24 +1,28 @@
1
  # Dependencies
 
 
2
  from pathlib import Path
3
- from typing import Optional, List, Dict, Any
4
  from datetime import datetime
 
5
  from utils.logger import get_logger
6
  from config.settings import settings
7
- from reportlab.platypus import Table, Spacer, Paragraph, PageBreak, Image as RLImage
8
- from reportlab.lib import colors
9
- from reportlab.lib.pagesizes import A4, LETTER
10
- from reportlab.lib.enums import TA_LEFT, TA_RIGHT, TA_CENTER, TA_JUSTIFY
 
11
  from reportlab.platypus import TableStyle
12
  from config.schemas import AnalysisResult
 
 
13
  from utils.helpers import generate_unique_id
14
- from config.constants import DetectionStatus
15
  from config.schemas import BatchAnalysisResult
16
- from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
17
  from reportlab.platypus import SimpleDocTemplate
 
18
  from features.detailed_result_maker import DetailedResultMaker
19
- from reportlab.lib.units import inch
20
- from reportlab.pdfgen import canvas
21
- import textwrap
22
 
23
 
24
  # Setup Logging
@@ -27,817 +31,296 @@ logger = get_logger(__name__)
27
 
28
  class PDFReporter:
29
  """
30
- Professional-Grade PDF Report Generator for AI Image Analysis
31
-
32
- Features:
33
- ---------
34
- - Comprehensive single image reports with full forensic details
35
- - Multi-page batch reports with executive summary
36
- - Enhanced visual hierarchy and color coding
37
- - Detailed metric breakdowns with explanations
38
- - Professional formatting and layout
39
- - Statistical summaries and insights
40
  """
41
-
42
- # Enhanced Color Scheme
43
- COLOR_PRIMARY = colors.HexColor('#0D47A1') # Deep Blue
44
- COLOR_SUCCESS = colors.HexColor('#1B5E20') # Dark Green
45
- COLOR_WARNING = colors.HexColor('#E65100') # Deep Orange
46
- COLOR_DANGER = colors.HexColor('#B71C1C') # Dark Red
47
- COLOR_INFO = colors.HexColor('#01579B') # Light Blue
48
- COLOR_NEUTRAL = colors.HexColor('#424242') # Dark Grey
49
- COLOR_HEADER_BG = colors.HexColor('#1565C0') # Blue
50
- COLOR_SUBHEADER_BG = colors.HexColor('#1976D2') # Lighter Blue
51
- COLOR_ALT_ROW = colors.HexColor('#F5F5F5') # Light Grey
52
- COLOR_LIGHT_BLUE = colors.HexColor('#E3F2FD') # Very Light Blue
53
- COLOR_LIGHT_GREEN = colors.HexColor('#E8F5E9') # Very Light Green
54
- COLOR_LIGHT_ORANGE = colors.HexColor('#FFF3E0') # Very Light Orange
55
- COLOR_LIGHT_RED = colors.HexColor('#FFEBEE') # Very Light Red
56
-
57
  def __init__(self):
58
  self.detailed_maker = DetailedResultMaker()
59
- self.styles = self._build_styles()
60
- logger.debug("Enhanced PDFReporter initialized")
 
61
 
62
  def export_single(self, result: AnalysisResult, output_dir: Optional[Path] = None) -> Path:
63
- """Export comprehensive single image analysis report"""
64
- output_dir = output_dir or settings.REPORTS_DIR
65
- output_dir.mkdir(parents=True, exist_ok=True)
66
-
67
- report_id = generate_unique_id()
68
- filename = f"ai_screener_report_{report_id}.pdf"
 
69
  output_path = output_dir / filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- logger.info(f"Generating comprehensive single image PDF: {filename}")
72
 
73
- doc = SimpleDocTemplate(
74
- str(output_path),
75
- pagesize=LETTER,
76
- rightMargin=30,
77
- leftMargin=30,
78
- topMargin=20,
79
- bottomMargin=35
80
- )
81
 
82
- story = []
83
- self._add_professional_header(story, "AI Image Analysis Report")
84
- self._add_executive_summary_single(story, result)
85
  story.append(PageBreak())
86
- self._add_detailed_metrics_analysis(story, result)
87
- story.append(PageBreak())
88
- self._add_forensic_breakdown(story, result)
89
- self._add_recommendations(story, result)
90
- self._add_professional_footer(story)
91
 
92
- doc.build(story, onFirstPage=self._add_watermark, onLaterPages=self._add_watermark)
93
- logger.info(f"Single image report generated: {output_path}")
 
 
 
94
  return output_path
95
 
 
96
  def export_batch(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
97
- """Export comprehensive batch analysis report"""
98
- output_dir = output_dir or settings.REPORTS_DIR
99
- output_dir.mkdir(parents=True, exist_ok=True)
100
-
101
- report_id = generate_unique_id()
102
- filename = f"ai_screener_report_{report_id}.pdf"
 
103
  output_path = output_dir / filename
104
-
105
- num_images = len(batch_result.results)
106
- logger.info(f"Generating batch PDF report: {filename} ({num_images} images)")
107
-
108
- doc = SimpleDocTemplate(
109
- str(output_path),
110
- pagesize=LETTER,
111
- rightMargin=30,
112
- leftMargin=30,
113
- topMargin=20,
114
- bottomMargin=35
115
- )
116
-
117
- story = []
118
- self._add_professional_header(story, "Batch Image Analysis Report")
119
- self._add_batch_executive_summary(story, batch_result)
120
- story.append(PageBreak())
121
- self._add_batch_overview_table(story, batch_result.results)
122
- story.append(PageBreak())
123
- self._add_batch_metrics_analysis(story, batch_result.results)
124
  story.append(PageBreak())
125
- self._add_individual_results_summary(story, batch_result.results)
126
- self._add_batch_recommendations(story, batch_result)
127
- self._add_professional_footer(story)
128
-
129
- doc.build(story, onFirstPage=self._add_watermark, onLaterPages=self._add_watermark)
130
- logger.info(f"Batch report generated: {output_path}")
 
 
 
 
 
 
131
  return output_path
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def _build_styles(self):
134
- """Build comprehensive style definitions"""
135
  styles = getSampleStyleSheet()
136
-
137
  styles.add(ParagraphStyle(
138
- name='ReportTitle',
139
  fontSize=18,
 
140
  textColor=self.COLOR_PRIMARY,
141
- alignment=TA_CENTER,
142
- spaceAfter=4,
143
- spaceBefore=2,
144
- fontName='Helvetica-Bold'
145
- ))
146
-
147
- styles.add(ParagraphStyle(
148
- name='ReportSubtitle',
149
- fontSize=10,
150
- textColor=self.COLOR_NEUTRAL,
151
- alignment=TA_CENTER,
152
- spaceAfter=6,
153
- fontName='Helvetica'
154
- ))
155
-
156
- styles.add(ParagraphStyle(
157
- name='SectionTitle',
158
- fontSize=13,
159
- textColor=self.COLOR_PRIMARY,
160
- spaceBefore=10,
161
- spaceAfter=6,
162
  fontName='Helvetica-Bold'
163
  ))
164
-
165
- styles.add(ParagraphStyle(
166
- name='SectionHeader',
167
- fontSize=11,
168
- textColor=self.COLOR_PRIMARY,
169
- spaceBefore=8,
170
- spaceAfter=5,
171
- fontName='Helvetica-Bold'
172
- ))
173
-
174
- styles.add(ParagraphStyle(
175
- name='SubHeader',
176
- fontSize=9.5,
177
- textColor=self.COLOR_PRIMARY,
178
- spaceBefore=5,
179
- spaceAfter=3,
180
- fontName='Helvetica-Bold'
181
- ))
182
-
183
  styles.add(ParagraphStyle(
184
- name='CustomBodyText',
185
- fontSize=9,
186
- leading=12,
187
- alignment=TA_JUSTIFY,
188
  spaceAfter=6
189
  ))
190
-
191
  styles.add(ParagraphStyle(
192
- name='TableCell',
193
  fontSize=8,
194
- leading=10
195
- ))
196
-
197
- styles.add(ParagraphStyle(
198
- name='TableCellSmall',
199
- fontSize=7.5,
200
- leading=9
201
  ))
202
-
203
  styles.add(ParagraphStyle(
204
- name='TableHeader',
205
- fontSize=8.5,
206
- textColor=colors.white,
207
  fontName='Helvetica-Bold',
208
- leading=10,
209
- alignment=TA_CENTER
210
  ))
211
-
212
  styles.add(ParagraphStyle(
213
- name='Footer',
214
- fontSize=7.5,
215
- textColor=colors.grey,
216
- alignment=TA_CENTER,
217
- spaceAfter=2
218
  ))
219
-
220
  styles.add(ParagraphStyle(
221
- name='Timestamp',
222
- fontSize=8,
223
- textColor=self.COLOR_NEUTRAL,
224
- alignment=TA_CENTER,
225
- spaceAfter=8
226
  ))
227
-
228
- return styles
229
 
230
- def _add_watermark(self, canvas, doc):
231
- """Add professional watermark"""
232
- canvas.saveState()
233
- canvas.setFont('Helvetica-Bold', 70)
234
- canvas.setFillColorRGB(0.85, 0.85, 0.85, alpha=0.15)
235
- canvas.rotate(45)
236
- canvas.drawString(2.5*inch, -0.5*inch, "AI IMAGE SCREENER")
237
- canvas.restoreState()
238
-
239
- def _add_professional_header(self, story, title: str):
240
- """Professional header with branding"""
241
- story.append(Paragraph("🔍 AI IMAGE SCREENER", self.styles['ReportTitle']))
242
- story.append(Spacer(1, 3))
243
-
244
- timestamp_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | Version: {settings.VERSION}"
245
- story.append(Paragraph(timestamp_text, self.styles['Timestamp']))
246
-
247
- story.append(Paragraph(title, self.styles['SectionTitle']))
248
- story.append(Spacer(1, 10))
249
-
250
- def _add_executive_summary_single(self, story, result: AnalysisResult):
251
- """Executive summary for single image"""
252
- story.append(Paragraph("Executive Summary", self.styles['SectionTitle']))
253
- story.append(Spacer(1, 5))
254
-
255
- # Key findings box
256
- status_color = self.COLOR_DANGER if result.status == DetectionStatus.REVIEW_REQUIRED else self.COLOR_SUCCESS
257
- status_bg = self.COLOR_LIGHT_RED if result.status == DetectionStatus.REVIEW_REQUIRED else self.COLOR_LIGHT_GREEN
258
- status_text = "⚠️ REVIEW REQUIRED" if result.status == DetectionStatus.REVIEW_REQUIRED else "✅ LIKELY AUTHENTIC"
259
-
260
- key_findings = [
261
- [Paragraph("<b>Overall Assessment</b>", self.styles['TableHeader'])],
262
- [Paragraph(f"<font size=12 color='{status_color.hexval()}'><b>{status_text}</b></font>", self.styles['CustomBodyText'])],
263
- [Paragraph(f"<b>Confidence:</b> {result.confidence}%", self.styles['CustomBodyText'])],
264
- [Paragraph(f"<b>Overall Score:</b> {result.overall_score:.4f}", self.styles['CustomBodyText'])]
265
- ]
266
-
267
- findings_table = Table(key_findings, colWidths=[530])
268
- findings_table.setStyle(TableStyle([
269
- ('BACKGROUND', (0, 0), (-1, 0), self.COLOR_INFO),
270
- ('BACKGROUND', (0, 1), (-1, -1), status_bg),
271
- ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
272
- ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
273
- ('LEFTPADDING', (0, 0), (-1, -1), 12),
274
- ('RIGHTPADDING', (0, 0), (-1, -1), 12),
275
- ('TOPPADDING', (0, 0), (-1, -1), 8),
276
- ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
277
- ('BOX', (0, 0), (-1, -1), 1.5, self.COLOR_PRIMARY)
278
- ]))
279
- story.append(findings_table)
280
- story.append(Spacer(1, 12))
281
-
282
- # Image information
283
- story.append(Paragraph("Image Information", self.styles['SectionHeader']))
284
-
285
- info_data = [
286
- [Paragraph("<b>Property</b>", self.styles['TableHeader']),
287
- Paragraph("<b>Value</b>", self.styles['TableHeader'])],
288
- [Paragraph("Filename", self.styles['TableCell']),
289
- Paragraph(result.filename, self.styles['TableCell'])],
290
- [Paragraph("Dimensions", self.styles['TableCell']),
291
- Paragraph(f"{result.image_size[0]} × {result.image_size[1]} pixels", self.styles['TableCell'])],
292
- [Paragraph("Aspect Ratio", self.styles['TableCell']),
293
- Paragraph(f"{result.image_size[0]/result.image_size[1]:.2f}:1", self.styles['TableCell'])],
294
- [Paragraph("Processing Time", self.styles['TableCell']),
295
- Paragraph(f"{result.processing_time:.3f} seconds", self.styles['TableCell'])],
296
- [Paragraph("Analysis Date", self.styles['TableCell']),
297
- Paragraph(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.styles['TableCell'])]
298
- ]
299
-
300
- info_table = Table(info_data, colWidths=[180, 350])
301
- info_table.setStyle(self._get_standard_table_style(len(info_data)))
302
- story.append(info_table)
303
- story.append(Spacer(1, 12))
304
-
305
- # Detection signals summary
306
- story.append(Paragraph("Detection Signals Summary", self.styles['SectionHeader']))
307
-
308
- flagged = sum(1 for s in result.signals if s.status.value == 'flagged')
309
- warning = sum(1 for s in result.signals if s.status.value == 'warning')
310
- passed = sum(1 for s in result.signals if s.status.value == 'passed')
311
-
312
- signals_data = [
313
- [Paragraph("<b>Status</b>", self.styles['TableHeader']),
314
- Paragraph("<b>Count</b>", self.styles['TableHeader']),
315
- Paragraph("<b>Percentage</b>", self.styles['TableHeader'])],
316
- [Paragraph("🔴 Flagged", self.styles['TableCell']),
317
- Paragraph(f"<font color='red'><b>{flagged}</b></font>", self.styles['TableCell']),
318
- Paragraph(f"{flagged/len(result.signals)*100:.1f}%", self.styles['TableCell'])],
319
- [Paragraph("🟡 Warning", self.styles['TableCell']),
320
- Paragraph(f"<font color='orange'><b>{warning}</b></font>", self.styles['TableCell']),
321
- Paragraph(f"{warning/len(result.signals)*100:.1f}%", self.styles['TableCell'])],
322
- [Paragraph("🟢 Passed", self.styles['TableCell']),
323
- Paragraph(f"<font color='green'><b>{passed}</b></font>", self.styles['TableCell']),
324
- Paragraph(f"{passed/len(result.signals)*100:.1f}%", self.styles['TableCell'])]
325
- ]
326
-
327
- signals_table = Table(signals_data, colWidths=[200, 165, 165])
328
- signals_table.setStyle(self._get_standard_table_style(len(signals_data)))
329
- story.append(signals_table)
330
-
331
- def _add_detailed_metrics_analysis(self, story, result: AnalysisResult):
332
- """Comprehensive metrics analysis"""
333
- story.append(Paragraph("Detailed Metrics Analysis", self.styles['SectionTitle']))
334
- story.append(Spacer(1, 8))
335
-
336
- # All detection signals with full details
337
- story.append(Paragraph("Detection Signals Breakdown", self.styles['SectionHeader']))
338
-
339
- signal_data = [
340
- [Paragraph("<b>Metric</b>", self.styles['TableHeader']),
341
- Paragraph("<b>Score</b>", self.styles['TableHeader']),
342
- Paragraph("<b>Status</b>", self.styles['TableHeader']),
343
- Paragraph("<b>Explanation</b>", self.styles['TableHeader'])]
344
- ]
345
-
346
- for signal in result.signals:
347
- status_badge = self._get_status_badge_html(signal.status.value)
348
-
349
- # Wrap long explanations
350
- explanation = signal.explanation
351
- if len(explanation) > 120:
352
- explanation = explanation[:120] + "..."
353
-
354
- signal_data.append([
355
- Paragraph(f"<b>{signal.name}</b>", self.styles['TableCell']),
356
- Paragraph(f"{signal.score:.4f}", self.styles['TableCell']),
357
- Paragraph(status_badge, self.styles['TableCell']),
358
- Paragraph(explanation, self.styles['TableCellSmall'])
359
- ])
360
-
361
- signal_table = Table(signal_data, colWidths=[120, 60, 80, 270])
362
- signal_table.setStyle(self._get_signal_table_style(len(signal_data)))
363
- story.append(signal_table)
364
-
365
- def _add_forensic_breakdown(self, story, result: AnalysisResult):
366
- """Detailed forensic analysis breakdown"""
367
- story.append(Paragraph("Forensic Analysis Breakdown", self.styles['SectionTitle']))
368
- story.append(Spacer(1, 8))
369
-
370
- for metric_type, metric_result in result.metric_results.items():
371
- metric_name = self.detailed_maker.metric_display_names.get(metric_type, metric_type.value)
372
- details = metric_result.details or {}
373
-
374
- # Skip if error
375
- if 'error' in details:
376
- continue
377
-
378
- story.append(Paragraph(metric_name, self.styles['SectionHeader']))
379
-
380
- # Metric overview
381
- overview_data = [
382
- [Paragraph("<b>Property</b>", self.styles['TableHeader']),
383
- Paragraph("<b>Value</b>", self.styles['TableHeader'])],
384
- [Paragraph("Score", self.styles['TableCell']),
385
- Paragraph(f"<b>{metric_result.score:.4f}</b>", self.styles['TableCell'])],
386
- [Paragraph("Confidence", self.styles['TableCell']),
387
- Paragraph(f"{metric_result.confidence:.4f}" if metric_result.confidence else "N/A", self.styles['TableCell'])],
388
- [Paragraph("Status", self.styles['TableCell']),
389
- Paragraph(self._get_metric_status_html(metric_result.score), self.styles['TableCell'])]
390
- ]
391
-
392
- overview_table = Table(overview_data, colWidths=[130, 400])
393
- overview_table.setStyle(self._get_standard_table_style(len(overview_data)))
394
- story.append(overview_table)
395
- story.append(Spacer(1, 5))
396
-
397
- # Detailed parameters
398
- if details and len(details) > 0:
399
- story.append(Paragraph("Detailed Parameters:", self.styles['SubHeader']))
400
-
401
- param_data = [[Paragraph("<b>Parameter</b>", self.styles['TableHeader']),
402
- Paragraph("<b>Value</b>", self.styles['TableHeader'])]]
403
-
404
- for key, value in details.items():
405
- if key in ['error', 'reason']:
406
- continue
407
-
408
- if isinstance(value, dict):
409
- for sub_key, sub_value in value.items():
410
- if sub_key not in ['reason', 'error']:
411
- formatted_value = self._format_value(sub_value)
412
- param_data.append([
413
- Paragraph(f" └─ {sub_key}", self.styles['TableCellSmall']),
414
- Paragraph(formatted_value, self.styles['TableCellSmall'])
415
- ])
416
- else:
417
- formatted_value = self._format_value(value)
418
- param_data.append([
419
- Paragraph(key, self.styles['TableCell']),
420
- Paragraph(formatted_value, self.styles['TableCell'])
421
- ])
422
-
423
- param_table = Table(param_data, colWidths=[200, 330])
424
- param_table.setStyle(TableStyle([
425
- ('BACKGROUND', (0, 0), (-1, 0), self.COLOR_SUBHEADER_BG),
426
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
427
- ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
428
- ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, self.COLOR_ALT_ROW]),
429
- ('VALIGN', (0, 0), (-1, -1), 'TOP'),
430
- ('LEFTPADDING', (0, 0), (-1, -1), 8),
431
- ('RIGHTPADDING', (0, 0), (-1, -1), 8),
432
- ('TOPPADDING', (0, 0), (-1, -1), 4),
433
- ('BOTTOMPADDING', (0, 0), (-1, -1), 4)
434
- ]))
435
- story.append(param_table)
436
-
437
- story.append(Spacer(1, 8))
438
-
439
- def _add_recommendations(self, story, result: AnalysisResult):
440
- """Add actionable recommendations"""
441
- story.append(Paragraph("Recommendations & Next Steps", self.styles['SectionTitle']))
442
- story.append(Spacer(1, 8))
443
-
444
- if result.status == DetectionStatus.REVIEW_REQUIRED:
445
- rec_text = """
446
- <b>⚠️ MANUAL REVIEW REQUIRED</b><br/>
447
- This image has been flagged for manual review based on multiple detection signals.
448
- Recommended actions:<br/>
449
- • Conduct visual inspection by trained personnel<br/>
450
- • Cross-reference with source verification<br/>
451
- • Consider additional forensic analysis if high stakes<br/>
452
- • Document findings for audit trail
453
- """
454
- rec_color = self.COLOR_LIGHT_RED
455
- border_color = self.COLOR_DANGER
456
- else:
457
- rec_text = """
458
- <b>✅ NO IMMEDIATE ACTION REQUIRED</b><br/>
459
- This image appears to be authentic based on current analysis. However:<br/>
460
- • Continue monitoring for evolving AI techniques<br/>
461
- • Consider periodic re-screening for critical assets<br/>
462
- • Maintain chain of custody documentation<br/>
463
- • Stay updated on latest detection methodologies
464
- """
465
- rec_color = self.COLOR_LIGHT_GREEN
466
- border_color = self.COLOR_SUCCESS
467
-
468
- rec_table = Table([[Paragraph(rec_text, self.styles['CustomBodyText'])]], colWidths=[530])
469
- rec_table.setStyle(TableStyle([
470
- ('BACKGROUND', (0, 0), (-1, -1), rec_color),
471
- ('BOX', (0, 0), (-1, -1), 2, border_color),
472
- ('LEFTPADDING', (0, 0), (-1, -1), 15),
473
- ('RIGHTPADDING', (0, 0), (-1, -1), 15),
474
- ('TOPPADDING', (0, 0), (-1, -1), 12),
475
- ('BOTTOMPADDING', (0, 0), (-1, -1), 12)
476
- ]))
477
- story.append(rec_table)
478
-
479
- def _add_batch_executive_summary(self, story, batch_result: BatchAnalysisResult):
480
- """Executive summary for batch analysis"""
481
- story.append(Paragraph("Executive Summary", self.styles['SectionTitle']))
482
- story.append(Spacer(1, 8))
483
-
484
- # Key metrics
485
- summary_data = [
486
- [Paragraph("<b>Metric</b>", self.styles['TableHeader']),
487
- Paragraph("<b>Value</b>", self.styles['TableHeader']),
488
- Paragraph("<b>Details</b>", self.styles['TableHeader'])],
489
- [Paragraph("Total Images", self.styles['TableCell']),
490
- Paragraph(f"<b>{batch_result.total_images}</b>", self.styles['TableCell']),
491
- Paragraph("Images submitted for analysis", self.styles['TableCellSmall'])],
492
- [Paragraph("Successfully Processed", self.styles['TableCell']),
493
- Paragraph(f"<font color='green'><b>{batch_result.processed}</b></font>", self.styles['TableCell']),
494
- Paragraph(f"{batch_result.summary.get('success_rate', 0)}% success rate", self.styles['TableCellSmall'])],
495
- [Paragraph("Failed", self.styles['TableCell']),
496
- Paragraph(f"<font color='red'><b>{batch_result.failed}</b></font>", self.styles['TableCell']),
497
- Paragraph("Processing errors encountered", self.styles['TableCellSmall'])],
498
- [Paragraph("Likely Authentic", self.styles['TableCell']),
499
- Paragraph(f"<font color='green'><b>{batch_result.summary.get('likely_authentic', 0)}</b></font>", self.styles['TableCell']),
500
- Paragraph("Images passing authenticity checks", self.styles['TableCellSmall'])],
501
- [Paragraph("Review Required", self.styles['TableCell']),
502
- Paragraph(f"<font color='red'><b>{batch_result.summary.get('review_required', 0)}</b></font>", self.styles['TableCell']),
503
- Paragraph("Images flagged for manual review", self.styles['TableCellSmall'])],
504
- [Paragraph("Average Score", self.styles['TableCell']),
505
- Paragraph(f"<b>{batch_result.summary.get('avg_score', 0):.4f}</b>", self.styles['TableCell']),
506
- Paragraph("Mean authenticity score across batch", self.styles['TableCellSmall'])],
507
- [Paragraph("Average Processing Time", self.styles['TableCell']),
508
- Paragraph(f"<b>{batch_result.summary.get('avg_proc_time', 0):.3f}s</b>", self.styles['TableCell']),
509
- Paragraph("Per-image processing duration", self.styles['TableCellSmall'])],
510
- ]
511
-
512
- summary_table = Table(summary_data, colWidths=[150, 130, 250])
513
- summary_table.setStyle(self._get_standard_table_style(len(summary_data)))
514
- story.append(summary_table)
515
-
516
- def _add_batch_overview_table(self, story, results: List[AnalysisResult]):
517
- """Comprehensive batch overview"""
518
- story.append(Paragraph("Batch Overview Matrix", self.styles['SectionTitle']))
519
- story.append(Spacer(1, 8))
520
-
521
- header = [
522
- Paragraph("<b>#</b>", self.styles['TableHeader']),
523
- Paragraph("<b>Filename</b>", self.styles['TableHeader']),
524
- Paragraph("<b>Image Size</b>", self.styles['TableHeader']),
525
- Paragraph("<b>Score</b>", self.styles['TableHeader']),
526
- Paragraph("<b>Status</b>", self.styles['TableHeader']),
527
- Paragraph("<b>Top Signal</b>", self.styles['TableHeader']),
528
- Paragraph("<b>Time(s)</b>", self.styles['TableHeader'])
529
- ]
530
-
531
- data = [header]
532
-
533
- for idx, result in enumerate(results, 1):
534
- top_signal = max(result.signals, key=lambda s: s.score)
535
- status_badge = self._get_status_badge_short(result.status.value)
536
-
537
- data.append([
538
- Paragraph(str(idx), self.styles['TableCell']),
539
- Paragraph(result.filename, self.styles['TableCellSmall']),
540
- Paragraph(f"{result.image_size[0]}×{result.image_size[1]}", self.styles['TableCellSmall']),
541
- Paragraph(f"<b>{result.overall_score:.3f}</b>", self.styles['TableCell']),
542
- Paragraph(status_badge, self.styles['TableCellSmall']),
543
- Paragraph(f"{top_signal.name}: {top_signal.score:.2f}", self.styles['TableCellSmall']),
544
- Paragraph(f"{result.processing_time:.2f}", self.styles['TableCell'])
545
- ])
546
-
547
- table = Table(data, colWidths=[25, 155, 65, 50, 70, 120, 45])
548
- table.setStyle(self._get_pivot_table_style(len(data)))
549
- story.append(table)
550
 
551
- def _add_batch_metrics_analysis(self, story, results: List[AnalysisResult]):
552
- """Detailed metrics analysis for batch"""
553
- story.append(Paragraph("Metric-wise Analysis", self.styles['SectionTitle']))
554
- story.append(Spacer(1, 8))
555
-
556
- metric_configs = {
557
- 'gradient': {
558
- 'name': 'Gradient-Field PCA Analysis',
559
- 'keys': ['eigenvalue_ratio', 'gradient_vectors_sampled'],
560
- 'labels': ['Eigenvalue\nRatio', 'Vectors\nSampled']
561
- },
562
- 'frequency': {
563
- 'name': 'Frequency Domain Analysis (FFT)',
564
- 'keys': ['hf_ratio', 'roughness', 'spectral_deviation'],
565
- 'labels': ['HF Ratio', 'Roughness', 'Spec.\nDeviation']
566
- },
567
- 'noise': {
568
- 'name': 'Noise Pattern Analysis',
569
- 'keys': ['mean_noise', 'cv', 'patches_valid'],
570
- 'labels': ['Mean Noise', 'CV', 'Patches\nValid']
571
- },
572
- 'texture': {
573
- 'name': 'Texture Statistical Analysis',
574
- 'keys': ['smooth_ratio', 'contrast_mean', 'entropy_mean'],
575
- 'labels': ['Smooth\nRatio', 'Mean\nContrast', 'Mean\nEntropy']
576
- },
577
- 'color': {
578
- 'name': 'Color Distribution Analysis',
579
- 'keys': ['saturation_stats.mean_saturation', 'saturation_stats.high_sat_ratio'],
580
- 'labels': ['Mean\nSaturation', 'High Saturation\nRatio']
581
- }
582
- }
583
-
584
- for metric_key, config in metric_configs.items():
585
- story.append(Paragraph(config['name'], self.styles['SectionHeader']))
586
-
587
- # Build header
588
- header = [
589
- Paragraph("<b>#</b>", self.styles['TableHeader']),
590
- Paragraph("<b>Filename</b>", self.styles['TableHeader']),
591
- Paragraph("<b>Score</b>", self.styles['TableHeader']),
592
- Paragraph("<b>Confidence</b>", self.styles['TableHeader'])
593
- ]
594
-
595
- for label in config['labels']:
596
- header.append(Paragraph(f"<b>{label}</b>", self.styles['TableHeader']))
597
-
598
- data = [header]
599
-
600
- for idx, result in enumerate(results, 1):
601
- metric_result = result.metric_results.get(metric_key)
602
- if not metric_result:
603
- continue
604
-
605
- details = metric_result.details or {}
606
-
607
- row = [
608
- Paragraph(str(idx), self.styles['TableCellSmall']),
609
- Paragraph(result.filename, self.styles['TableCellSmall']),
610
- Paragraph(f"<b>{metric_result.score:.3f}</b>", self.styles['TableCellSmall']),
611
- Paragraph(f"{metric_result.confidence:.2f}" if metric_result.confidence else "N/A",
612
- self.styles['TableCellSmall'])
613
- ]
614
-
615
- # Extract values
616
- for key in config['keys']:
617
- value = self._extract_nested_value(details, key)
618
- formatted_value = self._format_value(value, decimal_places=3)
619
- row.append(Paragraph(formatted_value, self.styles['TableCellSmall']))
620
-
621
- data.append(row)
622
-
623
- # Dynamic column widths
624
- num_detail_cols = len(config['labels'])
625
- detail_col_width = (530 - 25 - 140 - 45 - 35) // num_detail_cols
626
- col_widths = [25, 140, 45, 35] + [detail_col_width] * num_detail_cols
627
-
628
- table = Table(data, colWidths=col_widths)
629
- table.setStyle(self._get_pivot_table_style(len(data)))
630
- story.append(table)
631
- story.append(Spacer(1, 10))
632
-
633
- def _add_individual_results_summary(self, story, results: List[AnalysisResult]):
634
- """Individual image summaries in batch"""
635
- story.append(Paragraph("Individual Image Summaries", self.styles['SectionTitle']))
636
- story.append(Spacer(1, 8))
637
-
638
- for idx, result in enumerate(results, 1):
639
- if idx > 1:
640
- story.append(Spacer(1, 12))
641
-
642
- story.append(Paragraph(f"Image {idx}: {result.filename}", self.styles['SectionHeader']))
643
-
644
- # Quick stats
645
- quick_data = [
646
- [Paragraph("<b>Property</b>", self.styles['TableHeader']),
647
- Paragraph("<b>Value</b>", self.styles['TableHeader'])],
648
- [Paragraph("Score", self.styles['TableCell']),
649
- Paragraph(f"<b>{result.overall_score:.4f}</b>", self.styles['TableCell'])],
650
- [Paragraph("Status", self.styles['TableCell']),
651
- Paragraph(self._get_status_badge_html(result.status.value), self.styles['TableCell'])],
652
- [Paragraph("Confidence", self.styles['TableCell']),
653
- Paragraph(f"{result.confidence}%", self.styles['TableCell'])],
654
- [Paragraph("Dimensions", self.styles['TableCell']),
655
- Paragraph(f"{result.image_size[0]} × {result.image_size[1]}", self.styles['TableCell'])],
656
- ]
657
-
658
- quick_table = Table(quick_data, colWidths=[120, 410])
659
- quick_table.setStyle(self._get_standard_table_style(len(quick_data)))
660
- story.append(quick_table)
661
- story.append(Spacer(1, 5))
662
-
663
- # Top 3 signals
664
- story.append(Paragraph("Top Detection Signals:", self.styles['SubHeader']))
665
-
666
- top_signals = sorted(result.signals, key=lambda s: s.score, reverse=True)[:3]
667
- signal_data = [[
668
- Paragraph("<b>Signal</b>", self.styles['TableHeader']),
669
- Paragraph("<b>Score</b>", self.styles['TableHeader']),
670
- Paragraph("<b>Status</b>", self.styles['TableHeader'])
671
- ]]
672
-
673
- for signal in top_signals:
674
- signal_data.append([
675
- Paragraph(signal.name, self.styles['TableCellSmall']),
676
- Paragraph(f"{signal.score:.3f}", self.styles['TableCellSmall']),
677
- Paragraph(self._get_status_badge_html(signal.status.value), self.styles['TableCellSmall'])
678
- ])
679
-
680
- signal_table = Table(signal_data, colWidths=[200, 165, 165])
681
- signal_table.setStyle(self._get_standard_table_style(len(signal_data)))
682
- story.append(signal_table)
683
-
684
- def _add_batch_recommendations(self, story, batch_result: BatchAnalysisResult):
685
- """Batch-level recommendations"""
686
- story.append(Paragraph("Batch Analysis Recommendations", self.styles['SectionTitle']))
687
- story.append(Spacer(1, 8))
688
-
689
- review_count = batch_result.summary.get('review_required', 0)
690
- total = batch_result.total_images
691
-
692
- if review_count > 0:
693
- rec_text = f"""
694
- <b>⚠️ ACTION REQUIRED</b><br/>
695
- {review_count} out of {total} images require manual review ({review_count/total*100:.1f}%).<br/>
696
- <br/>
697
- <b>Recommended Actions:</b><br/>
698
- • Prioritize high-risk images for immediate review<br/>
699
- • Assign qualified personnel for verification<br/>
700
- • Document review findings and decisions<br/>
701
- • Consider additional forensic analysis for flagged images<br/>
702
- • Update screening protocols based on findings
703
- """
704
- rec_color = self.COLOR_LIGHT_ORANGE
705
- border_color = self.COLOR_WARNING
706
- else:
707
- rec_text = f"""
708
- <b>✅ BATCH PASSED SCREENING</b><br/>
709
- All {total} images appear to be authentic based on current analysis.<br/>
710
- <br/>
711
- <b>Recommended Actions:</b><br/>
712
- • Archive results for audit trail<br/>
713
- • Maintain periodic re-screening schedule<br/>
714
- • Monitor for evolving AI generation techniques<br/>
715
- • Update detection models regularly<br/>
716
- • Document chain of custody
717
- """
718
- rec_color = self.COLOR_LIGHT_GREEN
719
- border_color = self.COLOR_SUCCESS
720
-
721
- rec_table = Table([[Paragraph(rec_text, self.styles['CustomBodyText'])]], colWidths=[530])
722
- rec_table.setStyle(TableStyle([
723
- ('BACKGROUND', (0, 0), (-1, -1), rec_color),
724
- ('BOX', (0, 0), (-1, -1), 2, border_color),
725
- ('LEFTPADDING', (0, 0), (-1, -1), 15),
726
- ('RIGHTPADDING', (0, 0), (-1, -1), 15),
727
- ('TOPPADDING', (0, 0), (-1, -1), 12),
728
- ('BOTTOMPADDING', (0, 0), (-1, -1), 12)
729
- ]))
730
- story.append(rec_table)
731
-
732
- def _add_professional_footer(self, story):
733
- """Professional footer with disclaimers"""
734
- story.append(Spacer(1, 15))
735
-
736
- disclaimer_lines = [
737
- "⚠️ <b>DISCLAIMER</b>: This report provides probabilistic screening results based on current AI detection methodologies, not definitive verdicts.",
738
- "Results should be manually verified for critical applications. False positive rate: ~10-20%. Accuracy may vary with image quality and AI generation techniques.",
739
- "This analysis should be used as one component of a comprehensive verification process, not as the sole basis for decision-making.",
740
- "© 2025 AI Image Screener | Confidential Report | For Authorized Use Only"
741
- ]
742
-
743
- for line in disclaimer_lines:
744
- story.append(Paragraph(line, self.styles['Footer']))
745
- story.append(Spacer(1, 2))
746
-
747
- # Helper methods
748
-
749
- def _get_standard_table_style(self, num_rows):
750
- """Standard table styling"""
751
- return TableStyle([
752
- ('BACKGROUND', (0, 0), (-1, 0), self.COLOR_HEADER_BG),
753
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
754
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
755
- ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
756
- ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, self.COLOR_ALT_ROW]),
757
- ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
758
- ('LEFTPADDING', (0, 0), (-1, -1), 8),
759
- ('RIGHTPADDING', (0, 0), (-1, -1), 8),
760
- ('TOPPADDING', (0, 0), (-1, -1), 5),
761
- ('BOTTOMPADDING', (0, 0), (-1, -1), 5)
762
- ])
763
 
764
- def _get_signal_table_style(self, num_rows):
765
- """Signal table styling with color coding"""
766
  return TableStyle([
767
- ('BACKGROUND', (0, 0), (-1, 0), self.COLOR_HEADER_BG),
768
  ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
769
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
770
  ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
771
  ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, self.COLOR_ALT_ROW]),
772
  ('VALIGN', (0, 0), (-1, -1), 'TOP'),
773
  ('LEFTPADDING', (0, 0), (-1, -1), 6),
774
  ('RIGHTPADDING', (0, 0), (-1, -1), 6),
775
- ('TOPPADDING', (0, 0), (-1, -1), 5),
776
- ('BOTTOMPADDING', (0, 0), (-1, -1), 5)
777
  ])
778
 
779
- def _get_pivot_table_style(self, num_rows):
780
- """Pivot table styling"""
781
- return TableStyle([
782
- ('BACKGROUND', (0, 0), (-1, 0), self.COLOR_HEADER_BG),
783
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
784
- ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
785
- ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
786
- ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, self.COLOR_ALT_ROW]),
787
- ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
788
- ('ALIGN', (0, 0), (0, -1), 'CENTER'),
789
- ('LEFTPADDING', (0, 0), (-1, -1), 4),
790
- ('RIGHTPADDING', (0, 0), (-1, -1), 4),
791
- ('TOPPADDING', (0, 0), (-1, -1), 4),
792
- ('BOTTOMPADDING', (0, 0), (-1, -1), 4)
793
- ])
794
 
795
- def _get_status_badge_html(self, status: str) -> str:
796
- """Generate status badge HTML"""
797
- if status == "REVIEW_REQUIRED" or status == "flagged":
798
- return "<font color='#B71C1C'><b>🔴 FLAGGED</b></font>"
799
- elif status == "warning":
800
- return "<font color='#E65100'><b>🟡 WARNING</b></font>"
801
- else:
802
- return "<font color='#1B5E20'><b>🟢 PASSED</b></font>"
803
-
804
- def _get_status_badge_short(self, status: str) -> str:
805
- """Short status badge"""
806
- if status == "REVIEW_REQUIRED":
807
- return "<font color='#B71C1C'><b>⚠️ REVIEW REQUIRED</b></font>"
808
- else:
809
- return "<font color='#1B5E20'><b>✓ LIKELY AUTHENTIC</b></font>"
810
-
811
- def _get_metric_status_html(self, score: float) -> str:
812
- """Metric status based on score"""
813
- if score > 0.7:
814
- return "<font color='#B71C1C'><b>High Risk</b></font>"
815
- elif score > 0.5:
816
- return "<font color='#E65100'><b>Moderate Risk</b></font>"
817
- else:
818
- return "<font color='#1B5E20'><b>Low Risk</b></font>"
819
-
820
- def _format_value(self, value: Any, decimal_places: int = 4) -> str:
821
- """Format value for display"""
822
- if value is None or (isinstance(value, dict) and 'reason' in value):
823
- return "N/A"
824
- elif isinstance(value, float):
825
- return f"{value:.{decimal_places}f}"
826
- elif isinstance(value, (int, str, bool)):
827
- return str(value)
828
- else:
829
- return "N/A"
830
-
831
- def _extract_nested_value(self, details: dict, key: str) -> Any:
832
- """Extract nested dictionary values"""
833
- if '.' in key:
834
- parts = key.split('.')
835
- value = details
836
- for part in parts:
837
- if isinstance(value, dict):
838
- value = value.get(part, None)
839
- else:
840
- return None
841
- return value
842
- else:
843
- return details.get(key, None)
 
1
  # Dependencies
2
+ from typing import Any
3
+ from typing import List
4
  from pathlib import Path
5
+ from typing import Optional
6
  from datetime import datetime
7
+ from reportlab.lib import colors
8
  from utils.logger import get_logger
9
  from config.settings import settings
10
+ from reportlab.platypus import Table
11
+ from reportlab.lib.units import inch
12
+ from reportlab.platypus import Spacer
13
+ from reportlab.platypus import Paragraph
14
+ from reportlab.platypus import PageBreak
15
  from reportlab.platypus import TableStyle
16
  from config.schemas import AnalysisResult
17
+ from config.constants import FinalDecision
18
+ from reportlab.lib.pagesizes import LETTER
19
  from utils.helpers import generate_unique_id
20
+ from config.constants import EvidenceStrength
21
  from config.schemas import BatchAnalysisResult
22
+ from reportlab.lib.styles import ParagraphStyle
23
  from reportlab.platypus import SimpleDocTemplate
24
+ from reportlab.lib.styles import getSampleStyleSheet
25
  from features.detailed_result_maker import DetailedResultMaker
 
 
 
26
 
27
 
28
  # Setup Logging
 
31
 
32
class PDFReporter:
    """
    PDF Report Generator

    Builds ReportLab PDF reports for a single AnalysisResult or for every
    result contained in a BatchAnalysisResult.

    Guarantees:
    -----------
    - FinalDecision is authoritative
    - Evidence-first explanations
    - Metrics are informational only
    - Audit-safe and regulator-ready
    """
    COLOR_PRIMARY = colors.HexColor('#0D47A1')
    COLOR_SUCCESS = colors.HexColor('#1B5E20')
    COLOR_WARNING = colors.HexColor('#E65100')
    COLOR_DANGER  = colors.HexColor('#B71C1C')
    COLOR_HEADER  = colors.HexColor('#1565C0')
    COLOR_ALT_ROW = colors.HexColor('#F5F5F5')


    def __init__(self):
        """
        Create the helper used for metric display names / key findings and
        build the paragraph styles used by every section.
        """
        self.detailed_maker = DetailedResultMaker()
        self.styles         = self._build_styles()
        logger.debug("PDFReporter initialized")


    def export_single(self, result: AnalysisResult, output_dir: Optional[Path] = None) -> Path:
        """
        Export single image PDF report

        Arguments:
        ----------
            result     : analysis result to render
            output_dir : target directory; falls back to settings.REPORTS_DIR

        Returns:
        --------
            Path of the written PDF file
        """
        doc, output_path = self._new_document(output_dir, "ai_screener_report")

        logger.info(f"Generating single image PDF: {output_path.name}")

        story = list()

        self._add_header(story, "AI Image Analysis Report")
        self._add_single_executive_summary(story, result)
        story.append(PageBreak())

        self._add_evidence_section(story, result)
        story.append(PageBreak())

        self._add_metrics_section(story, result)
        self._add_footer(story)

        doc.build(story)

        return output_path


    def export_batch(self, batch_result: BatchAnalysisResult, output_dir: Optional[Path] = None) -> Path:
        """
        Export batch PDF report

        Arguments:
        ----------
            batch_result : batch to render (summary page, then one section group per result)
            output_dir   : target directory; falls back to settings.REPORTS_DIR

        Returns:
        --------
            Path of the written PDF file
        """
        doc, output_path = self._new_document(output_dir, "ai_screener_batch")

        logger.info(f"Generating batch PDF: {output_path.name}")

        story = list()

        self._add_header(story, "Batch Image Analysis Report")
        self._add_batch_summary(story, batch_result)
        story.append(PageBreak())

        total = len(batch_result.results)

        for idx, result in enumerate(batch_result.results, 1):
            self._add_single_executive_summary(story, result, index = idx)
            self._add_evidence_section(story, result)
            self._add_metrics_section(story, result)

            # Separate images with a page break, but not after the last one
            if (idx < total):
                story.append(PageBreak())

        self._add_footer(story)
        doc.build(story)

        return output_path


    def _new_document(self, output_dir: Optional[Path], prefix: str):
        """
        Create the output directory and a LETTER-sized document template.

        Returns a (SimpleDocTemplate, output_path) tuple; the file name is
        '<prefix>_<unique id>.pdf'.
        """
        # Path() also accepts a plain string directory
        output_dir  = Path(output_dir or settings.REPORTS_DIR)
        output_dir.mkdir(parents = True, exist_ok = True)

        output_path = output_dir / f"{prefix}_{generate_unique_id()}.pdf"

        doc         = SimpleDocTemplate(str(output_path),
                                        pagesize     = LETTER,
                                        rightMargin  = 30,
                                        leftMargin   = 30,
                                        topMargin    = 20,
                                        bottomMargin = 35,
                                       )

        return doc, output_path


    def _cell(self, text):
        """
        Wrap free text in a Paragraph so it word-wraps inside a table cell.

        Plain strings in a ReportLab Table are drawn on one line per '\\n' and
        never wrapped to the column width. The text is XML-escaped because
        Paragraph parses its input as markup.
        """
        from xml.sax.saxutils import escape

        content = "" if text is None else escape(str(text))

        return Paragraph(content, self.styles['Body'])


    def _add_header(self, story, title: str):
        """
        Append the product banner, the report title and a generation-time / version line.
        """
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        story.append(Paragraph("AI IMAGE SCREENER", self.styles['Title']))
        story.append(Paragraph(title, self.styles['Subtitle']))
        story.append(Paragraph(f"Generated: {generated_at} | Version: {settings.VERSION}", self.styles['Meta']))
        story.append(Spacer(1, 12))


    def _add_single_executive_summary(self, story, result: AnalysisResult, index: Optional[int] = None):
        """
        Append the decision / confidence / explanation table for one image.

        When `index` is given it is appended to the section title
        ("Executive Summary — Image N").
        """
        title = "Executive Summary"

        if index is not None:
            title += f" — Image {index}"

        story.append(Paragraph(title, self.styles['Section']))

        decision = result.final_decision.value if result.final_decision is not None else "UNDECIDED"
        color    = self._decision_color(result.final_decision)

        table    = Table([["Final Decision", decision],
                          ["Confidence", f"{result.confidence}%"],
                          ["Explanation", self._cell(result.decision_explanation or "—")],
                         ],
                         colWidths = [140, 390]
                        )

        table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), self.COLOR_HEADER),
                                   ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                                   ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                                   ('BACKGROUND', (0, 1), (-1, -1), color),
                                   ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                                   ('LEFTPADDING', (0, 0), (-1, -1), 8),
                                   ('RIGHTPADDING', (0, 0), (-1, -1), 8),
                                  ])
                      )

        story.append(table)
        story.append(Spacer(1, 10))


    def _add_evidence_section(self, story, result: AnalysisResult):
        """
        Append one table row per evidence item, or a short note when the
        result carries no evidence.
        """
        story.append(Paragraph("Evidence Assessment", self.styles['Section']))

        if not result.evidence:
            story.append(Paragraph("No declarative evidence detected. Decision derived from Tier-1 metrics.", self.styles['Body']))
            return

        rows = [["Source", "Direction", "Strength", "Confidence", "Finding"]]

        for e in result.evidence:
            rows.append([e.source.value,
                         e.direction.value,
                         e.strength.value,
                         # `is not None` so that a genuine 0.0 confidence is still printed
                         f"{e.confidence:.2f}" if e.confidence is not None else "N/A",
                         self._cell(e.finding)
                        ])

        table = Table(rows, colWidths = [70, 80, 80, 70, 230])
        table.setStyle(self._standard_table_style())

        story.append(table)
        story.append(Spacer(1, 10))


    def _add_metrics_section(self, story, result: AnalysisResult):
        """
        Append the per-metric score table (informational only).
        """
        story.append(Paragraph("Metric Signals (Informational)", self.styles['Section']))

        rows = [["Metric", "Score", "Confidence", "Notes"]]

        for mt, mr in result.metric_results.items():
            rows.append([self.detailed_maker.metric_display_names.get(mt, mt.value),
                         f"{mr.score:.3f}",
                         # `is not None` so that a genuine 0.0 confidence is still printed
                         f"{mr.confidence:.3f}" if mr.confidence is not None else "N/A",
                         self._cell(", ".join(self.detailed_maker.extract_key_findings(mt, mr)))
                        ])

        table = Table(rows, colWidths = [180, 70, 80, 210])
        table.setStyle(self._standard_table_style())

        story.append(table)
        story.append(Spacer(1, 10))


    def _add_batch_summary(self, story, batch_result: BatchAnalysisResult):
        """
        Append the batch totals followed by one count row per FinalDecision value.
        """
        story.append(Paragraph("Batch Decision Summary", self.styles['Section']))

        rows = [["Total Images", batch_result.total_images],
                ["Processed", batch_result.processed],
                ["Failed", batch_result.failed],
                ["Success Rate", f"{batch_result.summary.get('success_rate', 0)}%"],
               ]

        # Counts are read from the summary dict keyed by the decision's value; missing keys count as 0
        for decision in FinalDecision:
            rows.append([decision.value, batch_result.summary.get(decision.value, 0)])

        table = Table(rows, colWidths = [220, 310])
        table.setStyle(self._standard_table_style())

        story.append(table)
        story.append(Spacer(1, 10))


    def _add_footer(self, story):
        """
        Append the closing disclaimer.
        """
        story.append(Spacer(1, 15))
        story.append(Paragraph("DISCLAIMER: Metric scores are non-decisional. "
                               "Final decisions are evidence- and policy-based.",
                               self.styles['Footer']
                              ))


    # ------------------------------------------------------------------
    # STYLES & HELPERS
    # ------------------------------------------------------------------

    def _build_styles(self):
        """
        Return the sample stylesheet extended with the report's own styles
        ('Title', 'Subtitle', 'Meta', 'Section', 'Body', 'Footer').
        """
        styles        = getSampleStyleSheet()

        custom_styles = [ParagraphStyle(name      = 'Title',
                                        fontSize  = 18,
                                        alignment = 1,
                                        textColor = self.COLOR_PRIMARY,
                                        fontName  = 'Helvetica-Bold'
                                       ),
                         ParagraphStyle(name       = 'Subtitle',
                                        fontSize   = 12,
                                        alignment  = 1,
                                        spaceAfter = 6
                                       ),
                         ParagraphStyle(name       = 'Meta',
                                        fontSize   = 8,
                                        alignment  = 1,
                                        spaceAfter = 10,
                                        textColor  = colors.grey
                                       ),
                         ParagraphStyle(name        = 'Section',
                                        fontSize    = 13,
                                        fontName    = 'Helvetica-Bold',
                                        spaceBefore = 10,
                                        spaceAfter  = 6
                                       ),
                         ParagraphStyle(name       = 'Body',
                                        fontSize   = 9,
                                        spaceAfter = 6
                                       ),
                         ParagraphStyle(name      = 'Footer',
                                        fontSize  = 7,
                                        alignment = 1,
                                        textColor = colors.grey
                                       ),
                        ]

        for style in custom_styles:
            if style.name in styles.byName:
                # The sample stylesheet already ships a 'Title' style and
                # StyleSheet1.add() raises KeyError for a duplicate name, so
                # replace the existing entry instead of adding it.
                styles.byName[style.name] = style
            else:
                styles.add(style)

        return styles


    def _standard_table_style(self):
        """
        Table style shared by the evidence, metric and batch-summary tables:
        coloured first row, grid lines and alternating row backgrounds.
        """
        return TableStyle([('BACKGROUND', (0, 0), (-1, 0), self.COLOR_HEADER),
                           ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
                           ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                           ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, self.COLOR_ALT_ROW]),
                           ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                           ('LEFTPADDING', (0, 0), (-1, -1), 6),
                           ('RIGHTPADDING', (0, 0), (-1, -1), 6),
                          ])


    def _decision_color(self, decision: Optional[FinalDecision]):
        """
        Map a final decision to the background tint used in the executive
        summary; unknown or missing decisions get white.
        """
        tints = ((FinalDecision.CONFIRMED_AI_GENERATED, '#FFEBEE'),
                 (FinalDecision.SUSPICIOUS_AI_LIKELY, '#FFF3E0'),
                 (FinalDecision.AUTHENTIC_BUT_REVIEW, '#E3F2FD'),
                 (FinalDecision.MOSTLY_AUTHENTIC, '#E8F5E9'),
                )

        # Compared with == (not a dict lookup) to keep the original equality semantics
        for candidate, hex_code in tints:
            if decision == candidate:
                return colors.HexColor(hex_code)

        return colors.white
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui/index.html CHANGED
@@ -657,6 +657,12 @@
657
  border: 1px solid rgba(214, 158, 46, 0.3);
658
  }
659
 
 
 
 
 
 
 
660
  .score-indicator {
661
  display: flex;
662
  align-items: center;
@@ -1039,13 +1045,29 @@
1039
  <h3>Multi-Signal Detection</h3>
1040
  <p>Five independent statistical detectors with weighted ensemble aggregation</p>
1041
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1042
 
1043
  <div class="feature-card">
1044
  <div class="feature-icon">
1045
  <i class="fas fa-file-export"></i>
1046
  </div>
1047
  <h3>Comprehensive Reports</h3>
1048
- <p>Export results in CSV, JSON, and PDF formats for integration and documentation</p>
1049
  </div>
1050
 
1051
  <div class="feature-card">
@@ -1215,6 +1237,72 @@
1215
  </div>
1216
  </div>
1217
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1218
  </div>
1219
  </div>
1220
 
@@ -1305,9 +1393,6 @@
1305
  <button class="action-button secondary-action" id="exportCsvBtn">
1306
  <i class="fas fa-file-csv"></i> CSV
1307
  </button>
1308
- <button class="action-button secondary-action" id="exportPdfBtn">
1309
- <i class="fas fa-file-pdf"></i> PDF
1310
- </button>
1311
  <button class="action-button secondary-action" id="exportJsonBtn">
1312
  <i class="fas fa-file-code"></i> JSON
1313
  </button>
@@ -1385,8 +1470,7 @@
1385
  const HEALTH_ENDPOINT = '/health';
1386
  const BATCH_PROGRESS_ENDPOINT = '/batch';
1387
  const CSV_REPORT_ENDPOINT = '/report/csv';
1388
- const PDF_REPORT_ENDPOINT = '/report/pdf';
1389
-
1390
  // Global state
1391
  let files = [];
1392
  let fileDataUrls = {};
@@ -1419,7 +1503,6 @@
1419
  const resultsTableBody = document.getElementById('resultsTableBody');
1420
  const noResultsRow = document.getElementById('noResultsRow');
1421
  const exportCsvBtn = document.getElementById('exportCsvBtn');
1422
- const exportPdfBtn = document.getElementById('exportPdfBtn');
1423
  const exportJsonBtn = document.getElementById('exportJsonBtn');
1424
  const toggleDetailedAnalysis = document.getElementById('toggleDetailedAnalysis');
1425
  const detailedAnalysisIcon = document.getElementById('detailedAnalysisIcon');
@@ -1498,7 +1581,6 @@
1498
 
1499
  // Export
1500
  exportCsvBtn.addEventListener('click', exportCsv);
1501
- exportPdfBtn.addEventListener('click', exportPdf);
1502
  exportJsonBtn.addEventListener('click', exportJson);
1503
 
1504
  // Detailed analysis toggle
@@ -1832,6 +1914,46 @@
1832
  }, 1000);
1833
  }
1834
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1835
  function displayResults() {
1836
  if (!batchResults) {
1837
  console.error('No results to display:', batchResults);
@@ -1855,7 +1977,8 @@
1855
 
1856
  const filename = resultData.filename || 'Unknown';
1857
  const overallScore = resultData.overall_score || 0;
1858
- const status = resultData.status || 'LIKELY_AUTHENTIC';
 
1859
  const confidence = resultData.confidence || 0;
1860
  const imageSize = resultData.image_size || [0, 0];
1861
  const signals = resultData.signals || [];
@@ -1863,21 +1986,13 @@
1863
 
1864
  const scorePercent = Math.round(overallScore * 100);
1865
  let scoreClass = 'score-low';
1866
- let scoreWidth = '30%';
1867
- if (scorePercent >= 70) {
1868
- scoreClass = 'score-high';
1869
- scoreWidth = '90%';
1870
- } else if (scorePercent >= 50) {
1871
- scoreClass = 'score-medium';
1872
- scoreWidth = '60%';
1873
- }
1874
 
1875
  const flaggedCount = signals.filter(s => s.status === 'flagged').length;
1876
  const warningCount = signals.filter(s => s.status === 'warning').length;
1877
 
1878
- // Format status for display (remove underscores)
1879
- const displayStatus = status.replace(/_/g, ' ');
1880
-
1881
  // Get thumbnail
1882
  const thumbnailSrc = fileDataUrls[filename] || 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40"><rect width="40" height="40" fill="%23f0f0f0"/></svg>';
1883
 
@@ -1894,8 +2009,8 @@
1894
  </div>
1895
  </td>
1896
  <td>
1897
- <span class="status-badge ${status === 'LIKELY_AUTHENTIC' ? 'status-authentic' : 'status-review'}" style="white-space: nowrap;">
1898
- ${displayStatus}
1899
  </span>
1900
  </td>
1901
  <td>
@@ -1945,153 +2060,190 @@
1945
  }
1946
 
1947
  function updateSummary(batchResult) {
1948
- const total = batchResult.total_images || 0;
1949
- const processed = batchResult.processed || batchResult.results?.length || 0;
1950
- const failed = batchResult.failed || 0;
1951
-
1952
- let likelyAuthentic = 0;
1953
- let reviewRequired = 0;
1954
-
1955
- if (batchResult.results) {
1956
- batchResult.results.forEach(result => {
1957
- const resultData = result;
1958
- const status = resultData.status || 'LIKELY_AUTHENTIC';
1959
- if (status === 'LIKELY_AUTHENTIC') {
1960
- likelyAuthentic++;
1961
- } else if (status === 'REVIEW_REQUIRED') {
1962
- reviewRequired++;
1963
- }
1964
- });
1965
  }
1966
-
1967
- resultsSummary.innerHTML = `
1968
- <div class="summary-card">
1969
- <div class="summary-value">${processed}</div>
1970
- <div class="summary-label">Total Processed</div>
1971
- </div>
1972
- <div class="summary-card">
1973
- <div class="summary-value">${likelyAuthentic}</div>
1974
- <div class="summary-label">Likely Authentic</div>
1975
- </div>
1976
- <div class="summary-card">
1977
- <div class="summary-value">${reviewRequired}</div>
1978
- <div class="summary-label">Review Required</div>
1979
- </div>
1980
- <div class="summary-card">
1981
- <div class="summary-value">${failed}</div>
1982
- <div class="summary-label">Failed</div>
1983
- </div>
1984
- `;
 
 
 
 
 
 
 
 
 
 
 
 
1985
  }
1986
-
1987
  function showDetailedAnalysis(index) {
1988
  if (!batchResults || !batchResults.results || !batchResults.results[index]) return;
1989
-
1990
  selectedImageIndex = index;
1991
- const result = batchResults.results[index];
1992
- const resultData = result;
1993
-
1994
- const filename = resultData.filename || 'Unknown';
1995
- const overallScore = resultData.overall_score || 0;
1996
- const status = resultData.status || 'LIKELY_AUTHENTIC';
1997
- const confidence = resultData.confidence || 0;
1998
- const imageSize = resultData.image_size || [0, 0];
1999
- const processingTime = resultData.processing_time || 0;
2000
- const signals = resultData.signals || [];
2001
-
2002
- const scorePercent = Math.round(overallScore * 100);
2003
- const displayStatus = status.replace(/_/g, ' ');
2004
-
2005
- // Ensure detailed analysis is expanded
2006
  detailedAnalysisContent.classList.add('show');
2007
  detailedAnalysisIcon.classList.remove('fa-chevron-down');
2008
  detailedAnalysisIcon.classList.add('fa-chevron-up');
2009
-
2010
- document.getElementById('detailedAnalysisContent').scrollIntoView({
2011
  behavior: 'smooth',
2012
  block: 'start'
2013
  });
2014
-
2015
- // Build signals HTML
2016
  let signalsHtml = '';
2017
- if (signals && signals.length > 0) {
 
2018
  signals.forEach(signal => {
2019
  let statusClass = 'signal-passed';
2020
  if (signal.status === 'warning') statusClass = 'signal-warning';
2021
  if (signal.status === 'flagged') statusClass = 'signal-flagged';
2022
-
2023
  const signalScore = Math.round((signal.score || 0) * 100);
2024
-
2025
  signalsHtml += `
2026
  <div class="signal-card">
2027
  <div class="signal-header">
2028
  <strong>${signal.name || 'Unknown Metric'}</strong>
2029
  <span class="signal-badge ${statusClass}">${signal.status}</span>
2030
  </div>
2031
- <p style="font-size: 0.875rem; margin-bottom: 0.5rem; color: var(--text-light);">
2032
  ${signal.explanation || 'No explanation available.'}
2033
  </p>
2034
- <div style="display: flex; justify-content: space-between; align-items: center;">
2035
- <div style="font-size: 0.75rem; color: var(--text-light);">
2036
- Score: ${signalScore}%
2037
- </div>
2038
  </div>
2039
  </div>
2040
  `;
2041
  });
2042
  } else {
2043
- signalsHtml = '<p class="text-center" style="color: var(--text-light);">No detection signals available.</p>';
 
 
 
 
2044
  }
2045
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2046
  detailedAnalysisContent.innerHTML = `
2047
- <div style="margin-bottom: 1.5rem;">
2048
- <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 1rem;">
2049
- <img src="${fileDataUrls[filename] || 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" width="60" height="60" viewBox="0 0 60 60"><rect width="60" height="60" fill="%23f0f0f0"/></svg>'}"
2050
- alt="${filename}"
2051
- style="width: 60px; height: 60px; object-fit: cover; border-radius: 0.5rem; border: 1px solid var(--border);">
2052
  <div>
2053
- <h4 style="margin-bottom: 0.25rem;">${filename}</h4>
2054
- <div style="font-size: 0.875rem; color: var(--text-light);">
2055
  ${imageSize[0]} × ${imageSize[1]} • ${processingTime.toFixed(2)}s
2056
  </div>
2057
  </div>
2058
  </div>
2059
-
2060
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 1.5rem;">
2061
- <div style="text-align: center; padding: 1rem; background-color: #f8fafc; border-radius: 0.5rem;">
2062
- <div style="font-size: 1.5rem; font-weight: 700; color: ${scorePercent >= 70 ? '#e53e3e' : scorePercent >= 50 ? '#d69e2e' : '#38a169'};">${scorePercent}%</div>
2063
- <div style="font-size: 0.875rem; color: var(--text-light);">Score</div>
2064
  </div>
2065
- <div style="text-align: center; padding: 1rem; background-color: #f8fafc; border-radius: 0.5rem;">
2066
- <div style="font-size: 1.5rem; font-weight: 700; color: ${displayStatus.includes('REVIEW') ? '#d69e2e' : '#38a169'};">${displayStatus}</div>
2067
- <div style="font-size: 0.875rem; color: var(--text-light);">Verdict</div>
2068
  </div>
2069
- <div style="text-align: center; padding: 1rem; background-color: #f8fafc; border-radius: 0.5rem;">
2070
- <div style="font-size: 1.5rem; font-weight: 700;">${confidence}%</div>
2071
- <div style="font-size: 0.875rem; color: var(--text-light);">Confidence</div>
2072
  </div>
2073
  </div>
2074
  </div>
2075
-
2076
- <h4 style="margin-bottom: 1rem;">Detection Signals</h4>
2077
  <div class="signal-grid">
2078
  ${signalsHtml}
2079
  </div>
2080
-
2081
- <div class="signal-card" style="margin-top: 1.5rem; background-color: ${displayStatus.includes('REVIEW') ? 'rgba(214, 158, 46, 0.1)' : 'rgba(56, 161, 105, 0.1)'}; border-color: ${displayStatus.includes('REVIEW') ? 'rgba(214, 158, 46, 0.3)' : 'rgba(56, 161, 105, 0.3)'};">
 
 
 
 
 
2082
  <div class="signal-header">
2083
- <strong>Recommendation</strong>
2084
- </div>
2085
- <p style="margin-bottom: 0.5rem;">
2086
- ${displayStatus.includes('REVIEW') ? 'Manual verification recommended' : 'No immediate action required'}
2087
- </p>
2088
- <div style="font-size: 0.875rem; color: var(--text-light);">
2089
- Confidence: ${confidence}% likelihood of ${displayStatus.includes('REVIEW') ? 'AI generation' : 'authenticity'}
2090
  </div>
 
2091
  </div>
2092
  `;
2093
  }
2094
-
2095
  // Export functions
2096
  async function exportCsv() {
2097
  if (!currentBatchId) {
@@ -2129,42 +2281,6 @@
2129
  }
2130
  }
2131
 
2132
- async function exportPdf() {
2133
- if (!currentBatchId) {
2134
- showToast('No analysis results to export.', 'warning');
2135
- return;
2136
- }
2137
-
2138
- showLoading(true);
2139
- try {
2140
- // Using GET request since backend now accepts both GET and POST
2141
- const response = await fetch(`${PDF_REPORT_ENDPOINT}/${currentBatchId}`);
2142
-
2143
- if (response.ok) {
2144
- // Get the blob data
2145
- const blob = await response.blob();
2146
-
2147
- // Create download link
2148
- const downloadLink = document.createElement('a');
2149
- downloadLink.href = URL.createObjectURL(blob);
2150
- downloadLink.download = `ai_screener_report_${currentBatchId}.pdf`;
2151
-
2152
- document.body.appendChild(downloadLink);
2153
- downloadLink.click();
2154
- document.body.removeChild(downloadLink);
2155
-
2156
- showToast('PDF report downloaded successfully.', 'success');
2157
- } else {
2158
- showToast('Failed to generate PDF report.', 'error');
2159
- }
2160
- } catch (error) {
2161
- console.error('PDF export failed:', error);
2162
- showToast('PDF export failed. Please try again.', 'error');
2163
- } finally {
2164
- showLoading(false);
2165
- }
2166
- }
2167
-
2168
  async function exportJson() {
2169
  if (!batchResults) {
2170
  showToast('No analysis results to export.', 'warning');
@@ -2192,7 +2308,7 @@
2192
  showLoading(false);
2193
  }
2194
  }
2195
-
2196
  // Reset functions
2197
  function resetUI() {
2198
  analyzeBtn.disabled = false;
 
657
  border: 1px solid rgba(214, 158, 46, 0.3);
658
  }
659
 
660
+ .status-danger {
661
+ background-color: rgba(229, 62, 62, 0.1);
662
+ color: var(--danger);
663
+ border: 1px solid rgba(229, 62, 62, 0.3);
664
+ }
665
+
666
  .score-indicator {
667
  display: flex;
668
  align-items: center;
 
1045
  <h3>Multi-Signal Detection</h3>
1046
  <p>Five independent statistical detectors with weighted ensemble aggregation</p>
1047
  </div>
1048
+
1049
+ <div class="feature-card">
1050
+ <div class="feature-icon">
1051
+ <i class="fas fa-puzzle-piece"></i>
1052
+ </div>
1053
+ <h3>Evidence Analysis</h3>
1054
+ <p>Aggregates detection signals and metadata into structured evidence, resolving conflicts and supporting explainable, non-scoring decisions</p>
1055
+ </div>
1056
+
1057
+ <div class="feature-card">
1058
+ <div class="feature-icon">
1059
+ <i class="fas fa-balance-scale"></i>
1060
+ </div>
1061
+ <h3>Decision Policy</h3>
1062
+ <p>Applies deterministic rules over metrics and evidence to produce a review-aware final verdict, prioritizing safety and auditability</p>
1063
+ </div>
1064
 
1065
  <div class="feature-card">
1066
  <div class="feature-icon">
1067
  <i class="fas fa-file-export"></i>
1068
  </div>
1069
  <h3>Comprehensive Reports</h3>
1070
+ <p>Export results in CSV and JSON formats for integration and documentation</p>
1071
  </div>
1072
 
1073
  <div class="feature-card">
 
1237
  </div>
1238
  </div>
1239
  </div>
1240
+
1241
+ <div class="metric-card">
1242
+ <div class="metric-header">
1243
+ <div class="metric-icon" style="background: linear-gradient(135deg, #2b6cb0 0%, #63b3ed 100%);">
1244
+ <i class="fas fa-camera-retro"></i>
1245
+ </div>
1246
+ <div>
1247
+ <div class="metric-title">EXIF Analyzer</div>
1248
+ </div>
1249
+ <span class="metric-weight">Auxiliary Metric</span>
1250
+ </div>
1251
+
1252
+ <p class="metric-description">
1253
+ Analyzes image metadata for presence, completeness, and plausibility.
1254
+ Real camera images usually contain coherent EXIF data, while AI-generated
1255
+ images often lack metadata or contain inconsistent fields.
1256
+ </p>
1257
+
1258
+ <div class="metric-details">
1259
+ <div class="detail-item">
1260
+ <span class="detail-label">Detection Method</span>
1261
+ <span class="detail-value">Metadata consistency analysis</span>
1262
+ </div>
1263
+ <div class="detail-item">
1264
+ <span class="detail-label">Signals</span>
1265
+ <span class="detail-value">Missing, stripped, or malformed EXIF</span>
1266
+ </div>
1267
+ <div class="detail-item">
1268
+ <span class="detail-label">Sensitivity</span>
1269
+ <span class="detail-value">Medium</span>
1270
+ </div>
1271
+ </div>
1272
+ </div>
1273
+
1274
+ <div class="metric-card">
1275
+ <div class="metric-header">
1276
+ <div class="metric-icon" style="background: linear-gradient(135deg, #6b46c1 0%, #b794f4 100%);">
1277
+ <i class="fas fa-fingerprint"></i>
1278
+ </div>
1279
+ <div>
1280
+ <div class="metric-title">Watermark Analyzer</div>
1281
+ </div>
1282
+ <span class="metric-weight">Auxiliary Metric</span>
1283
+ </div>
1284
+
1285
+ <p class="metric-description">
1286
+ Detects known and statistical watermark patterns embedded by generative
1287
+ models. Includes checks for frequency-domain artifacts and spatial
1288
+ regularities associated with AI watermarking techniques.
1289
+ </p>
1290
+
1291
+ <div class="metric-details">
1292
+ <div class="detail-item">
1293
+ <span class="detail-label">Detection Method</span>
1294
+ <span class="detail-value">Pattern & frequency-domain analysis</span>
1295
+ </div>
1296
+ <div class="detail-item">
1297
+ <span class="detail-label">Signals</span>
1298
+ <span class="detail-value">Model-specific watermark artifacts</span>
1299
+ </div>
1300
+ <div class="detail-item">
1301
+ <span class="detail-label">Sensitivity</span>
1302
+ <span class="detail-value">Low–Medium (model-dependent)</span>
1303
+ </div>
1304
+ </div>
1305
+ </div>
1306
  </div>
1307
  </div>
1308
 
 
1393
  <button class="action-button secondary-action" id="exportCsvBtn">
1394
  <i class="fas fa-file-csv"></i> CSV
1395
  </button>
 
 
 
1396
  <button class="action-button secondary-action" id="exportJsonBtn">
1397
  <i class="fas fa-file-code"></i> JSON
1398
  </button>
 
1470
  const HEALTH_ENDPOINT = '/health';
1471
  const BATCH_PROGRESS_ENDPOINT = '/batch';
1472
  const CSV_REPORT_ENDPOINT = '/report/csv';
1473
+
 
1474
  // Global state
1475
  let files = [];
1476
  let fileDataUrls = {};
 
1503
  const resultsTableBody = document.getElementById('resultsTableBody');
1504
  const noResultsRow = document.getElementById('noResultsRow');
1505
  const exportCsvBtn = document.getElementById('exportCsvBtn');
 
1506
  const exportJsonBtn = document.getElementById('exportJsonBtn');
1507
  const toggleDetailedAnalysis = document.getElementById('toggleDetailedAnalysis');
1508
  const detailedAnalysisIcon = document.getElementById('detailedAnalysisIcon');
 
1581
 
1582
  // Export
1583
  exportCsvBtn.addEventListener('click', exportCsv);
 
1584
  exportJsonBtn.addEventListener('click', exportJson);
1585
 
1586
  // Detailed analysis toggle
 
1914
  }, 1000);
1915
  }
1916
 
1917
+ function decisionMeta(decision) {
1918
+ switch (decision) {
1919
+ case 'MOSTLY_AUTHENTIC':
1920
+ return {
1921
+ label: 'Mostly Authentic',
1922
+ badgeClass: 'status-authentic',
1923
+ recommendation: 'No immediate action required'
1924
+ };
1925
+
1926
+ case 'AUTHENTIC_BUT_REVIEW':
1927
+ return {
1928
+ label: 'Authentic (Review Suggested)',
1929
+ badgeClass: 'status-review',
1930
+ recommendation: 'Optional human review'
1931
+ };
1932
+
1933
+ case 'SUSPICIOUS_AI_LIKELY':
1934
+ return {
1935
+ label: 'Suspicious (AI Likely)',
1936
+ badgeClass: 'status-review',
1937
+ recommendation: 'Manual verification recommended'
1938
+ };
1939
+
1940
+ case 'CONFIRMED_AI_GENERATED':
1941
+ return {
1942
+ label: 'Confirmed AI Generated',
1943
+ badgeClass: 'status-danger',
1944
+ recommendation: 'Block or audit required'
1945
+ };
1946
+
1947
+ default:
1948
+ console.warn('Unknown decision:', decision);
1949
+ return {
1950
+ label: decision,
1951
+ badgeClass: 'status-review',
1952
+ recommendation: 'Manual review required'
1953
+ };
1954
+ }
1955
+ }
1956
+
1957
  function displayResults() {
1958
  if (!batchResults) {
1959
  console.error('No results to display:', batchResults);
 
1977
 
1978
  const filename = resultData.filename || 'Unknown';
1979
  const overallScore = resultData.overall_score || 0;
1980
+ const decision = resultData.final_decision;
1981
+ const meta = decisionMeta(decision);
1982
  const confidence = resultData.confidence || 0;
1983
  const imageSize = resultData.image_size || [0, 0];
1984
  const signals = resultData.signals || [];
 
1986
 
1987
  const scorePercent = Math.round(overallScore * 100);
1988
  let scoreClass = 'score-low';
1989
+ if (scorePercent >= 70) scoreClass = 'score-high';
1990
+ else if (scorePercent >= 50) scoreClass = 'score-medium';
1991
+ const scoreWidth = `${Math.min(scorePercent, 100)}%`;
 
 
 
 
 
1992
 
1993
  const flaggedCount = signals.filter(s => s.status === 'flagged').length;
1994
  const warningCount = signals.filter(s => s.status === 'warning').length;
1995
 
 
 
 
1996
  // Get thumbnail
1997
  const thumbnailSrc = fileDataUrls[filename] || 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40"><rect width="40" height="40" fill="%23f0f0f0"/></svg>';
1998
 
 
2009
  </div>
2010
  </td>
2011
  <td>
2012
+ <span class="status-badge ${meta.badgeClass}">
2013
+ ${meta.label}
2014
  </span>
2015
  </td>
2016
  <td>
 
2060
  }
2061
 
2062
  function updateSummary(batchResult) {
2063
+ if (!batchResult || !Array.isArray(batchResult.results)) {
2064
+ resultsSummary.innerHTML = '';
2065
+ return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2066
  }
2067
+
2068
+ // Count decisions exactly as returned by backend
2069
+ const counts = {};
2070
+ batchResult.results.forEach(result => {
2071
+ const status = result.final_decision;
2072
+ if (!status) return;
2073
+
2074
+ counts[status] = (counts[status] || 0) + 1;
2075
+ });
2076
+
2077
+ // Stable, policy-aligned display order
2078
+ const ORDER = [
2079
+ 'CONFIRMED_AI_GENERATED',
2080
+ 'SUSPICIOUS_AI_LIKELY',
2081
+ 'AUTHENTIC_BUT_REVIEW',
2082
+ 'MOSTLY_AUTHENTIC'
2083
+ ];
2084
+
2085
+ resultsSummary.innerHTML = ORDER
2086
+ .filter(status => counts[status])
2087
+ .map(status => {
2088
+ const meta = decisionMeta(status);
2089
+
2090
+ return `
2091
+ <div class="summary-card">
2092
+ <div class="summary-value">${counts[status]}</div>
2093
+ <div class="summary-label">${meta.label}</div>
2094
+ </div>
2095
+ `;
2096
+ })
2097
+ .join('');
2098
  }
2099
+
2100
  function showDetailedAnalysis(index) {
2101
  if (!batchResults || !batchResults.results || !batchResults.results[index]) return;
2102
+
2103
  selectedImageIndex = index;
2104
+ const result = batchResults.results[index];
2105
+
2106
+ const filename = result.filename || 'Unknown';
2107
+ const overallScore = result.overall_score || 0;
2108
+ const decision = result.final_decision;
2109
+ const meta = decisionMeta(decision);
2110
+ const confidence = result.confidence != null ? Math.round(result.confidence) : 0;
2111
+ const imageSize = result.image_size || [0, 0];
2112
+ const processingTime = result.processing_time || 0;
2113
+ const signals = result.signals || [];
2114
+ const evidence = result.evidence || [];
2115
+
2116
+ const scorePercent = Math.round(overallScore * 100);
2117
+
2118
+ /* ---------- Expand panel ---------- */
2119
  detailedAnalysisContent.classList.add('show');
2120
  detailedAnalysisIcon.classList.remove('fa-chevron-down');
2121
  detailedAnalysisIcon.classList.add('fa-chevron-up');
2122
+
2123
+ detailedAnalysisContent.scrollIntoView({
2124
  behavior: 'smooth',
2125
  block: 'start'
2126
  });
2127
+
2128
+ /* ---------- Signals (Tier-1 Metrics) ---------- */
2129
  let signalsHtml = '';
2130
+
2131
+ if (signals.length > 0) {
2132
  signals.forEach(signal => {
2133
  let statusClass = 'signal-passed';
2134
  if (signal.status === 'warning') statusClass = 'signal-warning';
2135
  if (signal.status === 'flagged') statusClass = 'signal-flagged';
2136
+
2137
  const signalScore = Math.round((signal.score || 0) * 100);
2138
+
2139
  signalsHtml += `
2140
  <div class="signal-card">
2141
  <div class="signal-header">
2142
  <strong>${signal.name || 'Unknown Metric'}</strong>
2143
  <span class="signal-badge ${statusClass}">${signal.status}</span>
2144
  </div>
2145
+ <p style="font-size:0.875rem;color:var(--text-light);margin-bottom:0.5rem;">
2146
  ${signal.explanation || 'No explanation available.'}
2147
  </p>
2148
+ <div style="font-size:0.75rem;color:var(--text-light);">
2149
+ Score: ${signalScore}%
 
 
2150
  </div>
2151
  </div>
2152
  `;
2153
  });
2154
  } else {
2155
+ signalsHtml = `
2156
+ <p class="text-center" style="color:var(--text-light);">
2157
+ No detection signals available.
2158
+ </p>
2159
+ `;
2160
  }
2161
+
2162
+ /* ---------- Evidence (Tier-2 Declarative Evidence) ---------- */
2163
+ let evidenceHtml = '';
2164
+
2165
+ if (evidence.length > 0) {
2166
+ evidence.forEach(ev => {
2167
+ let badgeClass = 'signal-passed';
2168
+ if (ev.direction === 'AI_GENERATED') badgeClass = 'signal-flagged';
2169
+ if (ev.direction === 'INDETERMINATE') badgeClass = 'signal-warning';
2170
+
2171
+ evidenceHtml += `
2172
+ <div class="signal-card" style="background:#f1f5f9;">
2173
+ <div class="signal-header">
2174
+ <strong>${ev.source.toUpperCase()}</strong>
2175
+ <span class="signal-badge ${badgeClass}">
2176
+ ${ev.strength}
2177
+ </span>
2178
+ </div>
2179
+ <p style="font-size:0.875rem;margin-bottom:0.5rem;">
2180
+ ${ev.finding}
2181
+ </p>
2182
+ <div style="font-size:0.75rem;color:#64748b;">
2183
+ Analyzer: ${ev.analyzer}
2184
+ ${ev.confidence != null ? ` • Confidence: ${Math.round(ev.confidence * 100)}%` : ''}
2185
+ </div>
2186
+ </div>
2187
+ `;
2188
+ });
2189
+ } else {
2190
+ evidenceHtml = `
2191
+ <p class="text-center" style="color:var(--text-light);">
2192
+ No declarative evidence detected.
2193
+ </p>
2194
+ `;
2195
+ }
2196
+
2197
+ /* ---------- Render ---------- */
2198
  detailedAnalysisContent.innerHTML = `
2199
+ <div style="margin-bottom:1.5rem;">
2200
+ <div style="display:flex;align-items:center;gap:1rem;margin-bottom:1rem;">
2201
+ <img src="${fileDataUrls[filename] || 'data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" width="60" height="60"><rect width="60" height="60" fill="%23f0f0f0"/></svg>'}"
2202
+ alt="${filename}"
2203
+ style="width:60px;height:60px;object-fit:cover;border-radius:0.5rem;border:1px solid var(--border);">
2204
  <div>
2205
+ <h4 style="margin-bottom:0.25rem;">${filename}</h4>
2206
+ <div style="font-size:0.875rem;color:var(--text-light);">
2207
  ${imageSize[0]} × ${imageSize[1]} • ${processingTime.toFixed(2)}s
2208
  </div>
2209
  </div>
2210
  </div>
2211
+
2212
+ <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:1rem;">
2213
+ <div style="text-align:center;padding:1rem;background:#f8fafc;border-radius:0.5rem;">
2214
+ <div style="font-size:1.5rem;font-weight:700;">${scorePercent}%</div>
2215
+ <div style="font-size:0.875rem;color:var(--text-light);">Score</div>
2216
  </div>
2217
+ <div style="text-align:center;padding:1rem;background:#f8fafc;border-radius:0.5rem;">
2218
+ <div style="font-size:1.25rem;font-weight:700;">${meta.label}</div>
2219
+ <div style="font-size:0.875rem;color:var(--text-light);">Verdict</div>
2220
  </div>
2221
+ <div style="text-align:center;padding:1rem;background:#f8fafc;border-radius:0.5rem;">
2222
+ <div style="font-size:1.5rem;font-weight:700;">${confidence}%</div>
2223
+ <div style="font-size:0.875rem;color:var(--text-light);">Confidence</div>
2224
  </div>
2225
  </div>
2226
  </div>
2227
+
2228
+ <h4 style="margin-bottom:0.75rem;">Detection Signals</h4>
2229
  <div class="signal-grid">
2230
  ${signalsHtml}
2231
  </div>
2232
+
2233
+ <h4 style="margin:1.5rem 0 0.75rem;">Evidence</h4>
2234
+ <div class="signal-grid">
2235
+ ${evidenceHtml}
2236
+ </div>
2237
+
2238
+ <div class="signal-card" style="margin-top:1.5rem;">
2239
  <div class="signal-header">
2240
+ <strong>Decision Explanation</strong>
 
 
 
 
 
 
2241
  </div>
2242
+ <p>${result.decision_explanation || meta.recommendation}</p>
2243
  </div>
2244
  `;
2245
  }
2246
+
2247
  // Export functions
2248
  async function exportCsv() {
2249
  if (!currentBatchId) {
 
2281
  }
2282
  }
2283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2284
  async function exportJson() {
2285
  if (!batchResults) {
2286
  showToast('No analysis results to export.', 'warning');
 
2308
  showLoading(false);
2309
  }
2310
  }
2311
+
2312
  // Reset functions
2313
  function resetUI() {
2314
  analyzeBtn.disabled = false;