# text_auth_ai/reporter/report_generator.py
# DEPENDENCIES
import json
from typing import Dict
from typing import List
from pathlib import Path
from loguru import logger
from typing import Optional
from datetime import datetime
from dataclasses import dataclass
from detector.orchestrator import DetectionResult
from detector.attribution import AttributionResult
from reporter.reasoning_generator import DetailedReasoning
from reporter.reasoning_generator import ReasoningGenerator

@dataclass
class DetailedMetric:
"""
Metric data structure with sub-metrics
"""
name : str
ai_probability : float
human_probability : float
confidence : float
verdict : str
description : str
detailed_metrics : Dict[str, float]
weight : float

class ReportGenerator:
"""
Generates comprehensive detection reports with detailed metrics
Supports:
- JSON (structured data with all details)
- PDF (printable reports with tables and formatting)
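
    Example (a minimal sketch; assumes ``result`` is a DetectionResult produced
    by the detector pipeline, which is out of scope to construct here):

        generator = ReportGenerator()
        files = generator.generate_complete_report(detection_result = result)
        print(files["json"])   # path to the generated JSON report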
"""

    def __init__(self, output_dir: Optional[Path] = None):
        """
        Initialize report generator

        Arguments:
        ----------
        output_dir { Path, optional } : Directory for saving reports (default: data/reports)
        """
if (output_dir is None):
output_dir = Path(__file__).parent.parent / "data" / "reports"
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents = True,
exist_ok = True,
)
self.reasoning_generator = ReasoningGenerator()
logger.info(f"ReportGenerator initialized (output_dir={self.output_dir})")

    def generate_complete_report(self, detection_result: DetectionResult, attribution_result: Optional[AttributionResult] = None,
                                 highlighted_sentences: Optional[List] = None, formats: Optional[List[str]] = None,
                                 filename_prefix: str = "ai_detection_report") -> Dict[str, str]:
"""
Generate comprehensive report in JSON and PDF formats with detailed metrics
Arguments:
----------
detection_result : Detection analysis result
attribution_result : Model attribution result (optional)
highlighted_sentences : List of highlighted sentences (optional)
        formats : List of formats to generate ("json", "pdf"); defaults to both
filename_prefix : Prefix for output filenames
Returns:
--------
{ dict } : Dictionary mapping format to filepath
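
        Example return value (illustrative paths; real filenames carry a timestamp):
            {"json" : "data/reports/ai_detection_report_20250101_120000.json",
             "pdf"  : "data/reports/ai_detection_report_20250101_120000.pdf"}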
"""
        if formats is None:
            formats = ["json", "pdf"]
        # Convert DetectionResult to dict for consistent access
        detection_dict = detection_result.to_dict() if hasattr(detection_result, 'to_dict') else detection_result
# Generate detailed reasoning
reasoning = self.reasoning_generator.generate(ensemble_result = detection_result.ensemble_result,
metric_results = detection_result.metric_results,
domain = detection_result.domain_prediction.primary_domain,
attribution_result = attribution_result,
text_length = detection_result.processed_text.word_count,
)
# Extract detailed metrics from ACTUAL detection results
detailed_metrics = self._extract_detailed_metrics(detection_dict)
# Timestamp for filenames
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
generated_files = dict()
# Generate requested formats
if ("json" in formats):
json_path = self._generate_json_report(detection_dict = detection_dict,
reasoning = reasoning,
detailed_metrics = detailed_metrics,
attribution_result = attribution_result,
highlighted_sentences = highlighted_sentences,
filename = f"{filename_prefix}_{timestamp}.json",
)
generated_files["json"] = str(json_path)
if ("pdf" in formats):
try:
pdf_path = self._generate_pdf_report(detection_dict = detection_dict,
reasoning = reasoning,
detailed_metrics = detailed_metrics,
attribution_result = attribution_result,
highlighted_sentences = highlighted_sentences,
filename = f"{filename_prefix}_{timestamp}.pdf",
)
generated_files["pdf"] = str(pdf_path)
            except ImportError as e:
                logger.warning(f"PDF generation skipped: {repr(e)}")
                logger.info("Install reportlab for PDF support: pip install reportlab")
            except Exception as e:
                logger.warning(f"PDF generation failed: {repr(e)}")
logger.info(f"Generated {len(generated_files)} report(s): {list(generated_files.keys())}")
return generated_files

    def _extract_detailed_metrics(self, detection_dict: Dict) -> List[DetailedMetric]:
"""
Extract detailed metrics with sub-metrics from ACTUAL detection result
"""
detailed_metrics = list()
metrics_data = detection_dict.get("metrics", {})
ensemble_data = detection_dict.get("ensemble", {})
# Get actual metric weights from ensemble
metric_weights = ensemble_data.get("metric_contributions", {})
# Extract actual metric data
for metric_name, metric_result in metrics_data.items():
if not isinstance(metric_result, dict):
continue
if metric_result.get("error") is not None:
continue
# Get actual probabilities and confidence
ai_prob = metric_result.get("ai_probability", 0) * 100
human_prob = metric_result.get("human_probability", 0) * 100
confidence = metric_result.get("confidence", 0) * 100
# Determine verdict based on actual probability
if (ai_prob >= 60):
verdict = "AI"
elif (ai_prob <= 40):
verdict = "HUMAN"
else:
verdict = "MIXED (AI + HUMAN)"
# Get actual weight or use default
weight = 0.0
if metric_name in metric_weights:
weight = metric_weights[metric_name].get("weight", 0.0) * 100
# Extract actual detailed metrics from metric result
detailed_metrics_data = self._extract_metric_details(metric_name = metric_name,
metric_result = metric_result,
)
# Get description based on metric type
description = self._get_metric_description(metric_name = metric_name)
detailed_metrics.append(DetailedMetric(name = metric_name,
ai_probability = ai_prob,
human_probability = human_prob,
confidence = confidence,
verdict = verdict,
description = description,
detailed_metrics = detailed_metrics_data,
weight = weight,
)
)
return detailed_metrics

    def _extract_metric_details(self, metric_name: str, metric_result: Dict) -> Dict[str, float]:
"""
Extract detailed sub-metrics from metric result
"""
details = dict()
# Try to get details from metric result
if metric_result.get("details"):
details = metric_result["details"].copy()
# If no details available, provide basic calculated values
if not details:
details = {"ai_probability" : metric_result.get("ai_probability", 0) * 100,
"human_probability" : metric_result.get("human_probability", 0) * 100,
"confidence" : metric_result.get("confidence", 0) * 100,
"score" : metric_result.get("score", 0) * 100,
}
return details

    def _get_metric_description(self, metric_name: str) -> str:
"""
Get description for each metric type
"""
descriptions = {"structural" : "Analyzes sentence structure, length patterns, and statistical features",
"perplexity" : "Measures text predictability using language model cross-entropy",
"entropy" : "Evaluates token diversity and sequence unpredictability",
"semantic_analysis" : "Examines semantic coherence, topic consistency, and logical flow",
"linguistic" : "Assesses grammatical patterns, syntactic complexity, and style markers",
"multi_perturbation_stability" : "Tests text stability under perturbation using curvature analysis",
}
return descriptions.get(metric_name, "Advanced text analysis metric.")

    def _generate_json_report(self, detection_dict: Dict, reasoning: DetailedReasoning, detailed_metrics: List[DetailedMetric],
                              attribution_result: Optional[AttributionResult], highlighted_sentences: Optional[List] = None,
                              filename: Optional[str] = None) -> Path:
"""
Generate JSON format report with detailed metrics
"""
# Convert metrics to serializable format
metrics_data = list()
for metric in detailed_metrics:
metrics_data.append({"name" : metric.name,
"ai_probability" : metric.ai_probability,
"human_probability" : metric.human_probability,
"confidence" : metric.confidence,
"verdict" : metric.verdict,
"description" : metric.description,
"weight" : metric.weight,
"detailed_metrics" : metric.detailed_metrics,
})
# Convert highlighted sentences to serializable format
highlighted_data = None
if highlighted_sentences:
highlighted_data = list()
for sent in highlighted_sentences:
highlighted_data.append({"text" : sent.text,
"ai_probability" : sent.ai_probability,
"confidence" : sent.confidence,
"color_class" : sent.color_class,
"index" : sent.index,
})
# Attribution data
attribution_data = None
if attribution_result:
attribution_data = {"predicted_model" : attribution_result.predicted_model.value,
"confidence" : attribution_result.confidence,
"model_probabilities" : attribution_result.model_probabilities,
"reasoning" : attribution_result.reasoning,
"fingerprint_matches" : attribution_result.fingerprint_matches,
"domain_used" : attribution_result.domain_used.value,
"metric_contributions": attribution_result.metric_contributions,
}
# Use ACTUAL detection results from dictionary
ensemble_data = detection_dict.get("ensemble", {})
analysis_data = detection_dict.get("analysis", {})
performance_data = detection_dict.get("performance", {})
report_data = {"report_metadata" : {"generated_at" : datetime.now().isoformat(),
"version" : "1.0.0",
"format" : "json",
"report_id" : filename.replace('.json', ''),
},
"overall_results" : {"final_verdict" : ensemble_data.get("final_verdict", "Unknown"),
"ai_probability" : ensemble_data.get("ai_probability", 0),
"human_probability" : ensemble_data.get("human_probability", 0),
"mixed_probability" : ensemble_data.get("mixed_probability", 0),
"overall_confidence" : ensemble_data.get("overall_confidence", 0),
"uncertainty_score" : ensemble_data.get("uncertainty_score", 0),
"consensus_level" : ensemble_data.get("consensus_level", 0),
"domain" : analysis_data.get("domain", "general"),
"domain_confidence" : analysis_data.get("domain_confidence", 0),
"text_length" : analysis_data.get("text_length", 0),
"sentence_count" : analysis_data.get("sentence_count", 0),
},
"ensemble_analysis" : {"method_used" : "confidence_calibrated",
"metric_weights" : ensemble_data.get("metric_contributions", {}),
"reasoning" : ensemble_data.get("reasoning", []),
},
"detailed_metrics" : metrics_data,
"detection_reasoning" : {"summary" : reasoning.summary,
"key_indicators" : reasoning.key_indicators,
"metric_explanations" : reasoning.metric_explanations,
"supporting_evidence" : reasoning.supporting_evidence,
"contradicting_evidence" : reasoning.contradicting_evidence,
"confidence_explanation" : reasoning.confidence_explanation,
"domain_analysis" : reasoning.domain_analysis,
"ensemble_analysis" : reasoning.ensemble_analysis,
"uncertainty_analysis" : reasoning.uncertainty_analysis,
"recommendations" : reasoning.recommendations,
},
"highlighted_text" : highlighted_data,
"model_attribution" : attribution_data,
"performance_metrics" : {"total_processing_time" : performance_data.get("total_time", 0),
"metrics_execution_time" : performance_data.get("metrics_time", {}),
"warnings" : detection_dict.get("warnings", []),
"errors" : detection_dict.get("errors", []),
}
}
output_path = self.output_dir / filename
with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report_data,
                      f,
                      indent = 4,
                      ensure_ascii = False,
                      )
logger.info(f"JSON report saved: {output_path}")
return output_path

    def _generate_pdf_report(self, detection_dict: Dict, reasoning: DetailedReasoning, detailed_metrics: List[DetailedMetric],
                             attribution_result: Optional[AttributionResult], highlighted_sentences: Optional[List] = None,
                             filename: Optional[str] = None) -> Path:
"""
Generate PDF format report with detailed metrics
"""
try:
from reportlab.lib import colors
            from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
            from reportlab.lib.enums import TA_CENTER, TA_JUSTIFY
except ImportError:
raise ImportError("reportlab is required for PDF generation. Install: pip install reportlab")
output_path = self.output_dir / filename
# Create PDF
doc = SimpleDocTemplate(str(output_path),
pagesize = letter,
rightMargin = 50,
leftMargin = 50,
topMargin = 50,
bottomMargin = 20,
)
# Container for PDF elements
elements = list()
styles = getSampleStyleSheet()
# Custom styles
title_style = ParagraphStyle('CustomTitle',
parent = styles['Heading1'],
fontSize = 20,
textColor = colors.HexColor('#667eea'),
spaceAfter = 20,
alignment = TA_CENTER,
)
heading_style = ParagraphStyle('CustomHeading',
parent = styles['Heading2'],
fontSize = 14,
textColor = colors.HexColor('#111827'),
spaceAfter = 12,
spaceBefore = 12,
)
body_style = ParagraphStyle('CustomBody',
parent = styles['BodyText'],
fontSize = 10,
alignment = TA_JUSTIFY,
spaceAfter = 8,
)
# Use detection results from dictionary
ensemble_data = detection_dict.get("ensemble", {})
analysis_data = detection_dict.get("analysis", {})
# Title and main sections
elements.append(Paragraph("AI Text Detection Analysis Report", title_style))
elements.append(Paragraph(f"Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}", styles['Normal']))
elements.append(Spacer(1, 0.3*inch))
# Verdict section with ensemble metrics
elements.append(Paragraph("Detection Summary", heading_style))
verdict_data = [['Final Verdict:', ensemble_data.get("final_verdict", "Unknown")],
['AI Probability:', f"{ensemble_data.get('ai_probability', 0):.1%}"],
['Human Probability:', f"{ensemble_data.get('human_probability', 0):.1%}"],
['Mixed Probability:', f"{ensemble_data.get('mixed_probability', 0):.1%}"],
['Overall Confidence:', f"{ensemble_data.get('overall_confidence', 0):.1%}"],
['Uncertainty Score:', f"{ensemble_data.get('uncertainty_score', 0):.1%}"],
['Consensus Level:', f"{ensemble_data.get('consensus_level', 0):.1%}"],
]
verdict_table = Table(verdict_data, colWidths=[2*inch, 3*inch])
verdict_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#f8fafc')),
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, -1), 10),
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
])
)
elements.append(verdict_table)
elements.append(Spacer(1, 0.2*inch))
# Content analysis
elements.append(Paragraph("Content Analysis", heading_style))
content_data = [['Content Domain:', analysis_data.get("domain", "general").title()],
['Domain Confidence:', f"{analysis_data.get('domain_confidence', 0):.1%}"],
['Word Count:', str(analysis_data.get("text_length", 0))],
['Sentence Count:', str(analysis_data.get("sentence_count", 0))],
['Processing Time:', f"{detection_dict.get('performance', {}).get('total_time', 0):.2f}s"],
]
content_table = Table(content_data, colWidths=[2*inch, 3*inch])
content_table.setStyle(TableStyle([('FONTSIZE', (0, 0), (-1, -1), 10),
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
])
)
elements.append(content_table)
elements.append(Spacer(1, 0.2*inch))
# Ensemble Analysis
elements.append(Paragraph("Ensemble Analysis", heading_style))
elements.append(Paragraph("Method: Confidence Calibrated Aggregation", styles['Normal']))
elements.append(Spacer(1, 0.1*inch))
# Metric weights table
metric_contributions = ensemble_data.get("metric_contributions", {})
if metric_contributions:
elements.append(Paragraph("Metric Weights", styles['Heading3']))
weight_data = [['Metric', 'Weight']]
for metric, contribution in metric_contributions.items():
weight = contribution.get("weight", 0)
weight_data.append([metric.title(), f"{weight:.1%}"])
weight_table = Table(weight_data, colWidths=[3*inch, 1*inch])
weight_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#667eea')),
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, -1), 9),
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
('GRID', (0, 0), (-1, -1), 1, colors.black),
])
)
elements.append(weight_table)
elements.append(Spacer(1, 0.2*inch))
# Detailed metrics
elements.append(Paragraph("Detailed Metric Analysis", heading_style))
for metric in detailed_metrics:
elements.append(Paragraph(f"{metric.name.title().replace('_', ' ')}", styles['Heading3']))
metric_data = [['Verdict:', metric.verdict],
['AI Probability:', f"{metric.ai_probability:.1f}%"],
['Human Probability:', f"{metric.human_probability:.1f}%"],
['Confidence:', f"{metric.confidence:.1f}%"],
['Ensemble Weight:', f"{metric.weight:.1f}%"],
]
metric_table = Table(metric_data, colWidths=[1.5*inch, 1.5*inch])
metric_table.setStyle(TableStyle([('FONTSIZE', (0, 0), (-1, -1), 9),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
])
)
elements.append(metric_table)
elements.append(Paragraph(metric.description, body_style))
# Add detailed sub-metrics if available
if metric.detailed_metrics:
elements.append(Paragraph("Detailed Metrics:", styles['Heading4']))
sub_metric_data = [['Metric', 'Value']]
                for sub_name, sub_value in list(metric.detailed_metrics.items())[:6]:   # show up to 6 sub-metrics
sub_metric_data.append([sub_name.replace('_', ' ').title(), f"{sub_value:.2f}"])
sub_metric_table = Table(sub_metric_data, colWidths=[2*inch, 1*inch])
sub_metric_table.setStyle(TableStyle([('FONTSIZE', (0, 0), (-1, -1), 8),
('BOTTOMPADDING', (0, 0), (-1, -1), 2),
('GRID', (0, 0), (-1, -1), 1, colors.grey),
])
)
elements.append(sub_metric_table)
elements.append(Spacer(1, 0.1*inch))
# Detection Reasoning
elements.append(Paragraph("Detection Reasoning", heading_style))
elements.append(Paragraph(reasoning.summary, body_style))
elements.append(Spacer(1, 0.1*inch))
# Key Indicators
elements.append(Paragraph("Key Indicators", styles['Heading3']))
for indicator in reasoning.key_indicators[:5]: # Show top 5
elements.append(Paragraph(f"• {indicator}", body_style))
elements.append(Spacer(1, 0.1*inch))
# Confidence Explanation
elements.append(Paragraph("Confidence Analysis", styles['Heading3']))
elements.append(Paragraph(reasoning.confidence_explanation, body_style))
elements.append(Spacer(1, 0.1*inch))
# Uncertainty Analysis
elements.append(Paragraph("Uncertainty Analysis", styles['Heading3']))
elements.append(Paragraph(reasoning.uncertainty_analysis, body_style))
# Model Attribution Section
if attribution_result:
elements.append(PageBreak())
elements.append(Paragraph("AI Model Attribution", heading_style))
# Attribution summary
predicted_model = attribution_result.predicted_model.value.replace("_", " ").title()
confidence = attribution_result.confidence * 100
attribution_summary = [['Predicted Model:', predicted_model],
['Attribution Confidence:', f"{confidence:.1f}%"],
['Domain Used:', attribution_result.domain_used.value.title()],
]
attribution_table = Table(attribution_summary, colWidths=[2*inch, 3*inch])
attribution_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#f8fafc')),
('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, -1), 10),
('BOTTOMPADDING', (0, 0), (-1, -1), 6),
])
)
elements.append(attribution_table)
elements.append(Spacer(1, 0.1*inch))
# Model probabilities table
if attribution_result.model_probabilities:
elements.append(Paragraph("Model Probability Breakdown", styles['Heading3']))
prob_data = [['Model', 'Probability']]
# Show top 5
sorted_models = sorted(attribution_result.model_probabilities.items(),
key = lambda x: x[1],
reverse = True)[:5]
for model_name, probability in sorted_models:
display_name = model_name.replace("_", " ").replace("-", " ").title()
prob_data.append([display_name, f"{probability:.1%}"])
prob_table = Table(prob_data, colWidths=[3*inch, 1*inch])
prob_table.setStyle(TableStyle([('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#667eea')),
('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, -1), 9),
('BOTTOMPADDING', (0, 0), (-1, -1), 4),
('GRID', (0, 0), (-1, -1), 1, colors.black),
])
)
elements.append(prob_table)
elements.append(Spacer(1, 0.2*inch))
# Attribution reasoning
if attribution_result.reasoning:
elements.append(Paragraph("Attribution Reasoning", styles['Heading3']))
for reason in attribution_result.reasoning[:3]: # Show top 3 reasons
elements.append(Paragraph(f"• {reason}", body_style))
# Recommendations
elements.append(PageBreak())
elements.append(Paragraph("Recommendations", heading_style))
for recommendation in reasoning.recommendations:
elements.append(Paragraph(f"• {recommendation}", body_style))
# Footer
elements.append(Spacer(1, 0.3*inch))
elements.append(Paragraph(f"Generated by AI Text Detector v2.0 | Processing Time: {detection_dict.get('performance', {}).get('total_time', 0):.2f}s",
ParagraphStyle('Footer', parent=styles['Normal'], fontSize=8, textColor=colors.gray)))
# Build PDF
doc.build(elements)
logger.info(f"PDF report saved: {output_path}")
return output_path

# Export
__all__ = ["ReportGenerator",
"DetailedMetric",
]
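
# Example usage (a hedged sketch: assumes `result` is a DetectionResult coming out
# of detector.orchestrator; PDF output additionally requires `pip install reportlab`):
#
#     from reporter.report_generator import ReportGenerator
#
#     generator = ReportGenerator(output_dir = Path("tmp/reports"))
#     files = generator.generate_complete_report(detection_result = result,
#                                                formats = ["json"],
#                                                )
#     print(files)   # e.g. {"json": "tmp/reports/ai_detection_report_<timestamp>.json"}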