File size: 7,542 Bytes

178b774

"""
Report generation — JSON and Markdown output for senator profiles.
"""
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

from .fusion import SenatorProfile

log = logging.getLogger(__name__)


def generate_json_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> dict:
    """Build the JSON report for one senator and optionally save it to disk.

    Args:
        profile: Source of every reported value. Only attribute access is
            used (``senator_name``, ``compulsion_score``, ...).
        output_path: When truthy, the report is also written to this path as
            indented, UTF-8 encoded JSON. Missing parent directories are
            created first.

    Returns:
        The report dictionary with the top-level keys ``meta``, ``senator``,
        ``summary``, ``compulsion``, ``virulence``, ``classification_detail``,
        ``top_rage_tweets`` and ``disclaimers``.
    """
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # "now" and drop the tzinfo so isoformat() keeps the established
    # "YYYY-MM-DDTHH:MM:SS[.ffffff]Z" shape instead of "...+00:00Z".
    generated_at = (
        datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    report = {
        "meta": {
            "generated_at": generated_at,
            "pipeline_version": "0.1.0",
            "model_stack": {
                "embeddings": "Qwen/Qwen3-Embedding-0.6B",
                "sentiment": "cardiffnlp/twitter-roberta-base-sentiment-latest",
                "emotion": "cardiffnlp/twitter-roberta-base-emotion",
                "offensive": "cardiffnlp/twitter-roberta-base-offensive",
                "irony": "cardiffnlp/twitter-roberta-base-irony",
                "hate": "cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
                "toxicity": "s-nlp/roberta_toxicity_classifier",
            },
        },
        "senator": {
            "name": profile.senator_name,
            "twitter_handle": profile.twitter_handle,
            "party": profile.party,
            "state": profile.state,
        },
        "summary": {
            "n_tweets_analyzed": profile.n_tweets_analyzed,
            "date_range": profile.date_range,
            "compulsion_score": profile.compulsion_score,
            "virulence_score": profile.virulence_score,
            "overall_risk_score": profile.overall_risk_score,
        },
        "compulsion": {
            "score": profile.compulsion_score,
            "subscores": profile.compulsion_subscores,
        },
        "virulence": {
            "score": profile.virulence_score,
            "subscores": profile.virulence_subscores,
            "distribution": profile.virulence_distribution,
        },
        "classification_detail": {
            "sentiment_distribution": profile.sentiment_distribution,
            "emotion_distribution": profile.emotion_distribution,
            "toxicity_stats": profile.toxicity_stats,
        },
        "top_rage_tweets": profile.top_rage_tweets,
        "disclaimers": [
            "This analysis does not constitute a clinical diagnosis of addiction, "
            "compulsion, or mental health condition.",
            "Scores are derived from automated classifiers with known error rates "
            "and should not be treated as ground truth.",
            "Temporal analysis uses UTC timestamps which may not reflect the "
            "poster's local timezone.",
            "Classifier models were trained on general Twitter data, not "
            "specifically on political speech.",
        ],
    }

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Pin the encoding so the file does not depend on the locale default.
        with open(p, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot encode natively
            # instead of raising.
            json.dump(report, f, indent=2, default=str)
        log.info("JSON report saved to %s", p)

    return report


def _md_cell(value, limit: Optional[int] = None) -> str:
    """Return *value* as single-line text that is safe inside a table cell.

    Whitespace runs, including newlines, collapse to a single space because a
    Markdown table row must stay on one line. ``None`` becomes an empty
    string. When *limit* is given and the text is longer than that, it is cut
    to *limit* characters and ``...`` is appended. Pipes are escaped so they
    do not start a new column.
    """
    text = "" if value is None else " ".join(str(value).split())
    if limit is not None and len(text) > limit:
        text = text[:limit] + "..."
    return text.replace("|", "\\|")


def _two_column_table(left: str, right: str, rows) -> list:
    """Return the lines of a two-column Markdown table plus a trailing blank."""
    table = [f"| {left} | {right} |", "| --- | ---: |"]
    table.extend(f"| {label} | {value} |" for label, value in rows)
    table.append("")
    return table


def generate_markdown_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> str:
    """Render a senator profile as a Markdown report and optionally save it.

    Sections, in order: title, summary, compulsion subscores, virulence
    subscores, then (only when the corresponding profile attribute is truthy)
    sentiment distribution, emotion distribution, toxicity stats and the
    first ten entries of ``top_rage_tweets``; finally methodology and
    disclaimers.

    Args:
        profile: Source of every reported value; only attribute access is
            used. Entries of ``top_rage_tweets`` are read with ``.get``.
        output_path: When truthy, the Markdown is also written to this path
            as UTF-8. Missing parent directories are created first.

    Returns:
        The full Markdown document as one newline-joined string.
    """
    lines = [
        f"# X-Box Analysis: {profile.senator_name}",
        f"**@{profile.twitter_handle}** | {profile.party} | {profile.state}",
        "",
        # Summary
        "## Summary",
        f"- **Tweets analyzed**: {profile.n_tweets_analyzed:,}",
        f"- **Date range**: {profile.date_range}",
        f"- **Compulsion score**: {profile.compulsion_score}/100",
        f"- **Virulence score**: {profile.virulence_score}/100",
        f"- **Overall risk score**: {profile.overall_risk_score}/100",
        "",
    ]

    # Compulsion / virulence breakdowns share one layout. An empty or missing
    # mapping yields a header-only table rather than an AttributeError.
    score_sections = (
        ("## Compulsion-Like Behavior", profile.compulsion_subscores),
        ("## Virulence Analysis", profile.virulence_subscores),
    )
    for heading, subscores in score_sections:
        lines.append(heading)
        lines.extend(
            _two_column_table(
                "Dimension",
                "Score",
                (
                    (key.replace("_", " ").title(), score)
                    for key, score in (subscores or {}).items()
                ),
            )
        )

    # Classification detail: label shares are rendered as percentages,
    # sorted by label.
    share_sections = (
        ("### Sentiment Distribution", "Label", profile.sentiment_distribution),
        ("### Emotion Distribution", "Emotion", profile.emotion_distribution),
    )
    for heading, label_header, distribution in share_sections:
        if distribution:
            lines.append(heading)
            lines.extend(
                _two_column_table(
                    label_header,
                    "Share",
                    (
                        (label, f"{share:.1%}")
                        for label, share in sorted(distribution.items())
                    ),
                )
            )

    if profile.toxicity_stats:
        tox = profile.toxicity_stats
        lines.append("### Toxicity")
        lines.append(f"- Mean toxicity score: {tox.get('mean', 0):.4f}")
        lines.append(f"- % classified toxic: {tox.get('pct_toxic', 0):.2f}%")
        lines.append(f"- P90 toxicity: {tox.get('p90', 0):.4f}")
        lines.append("")

    # Top rage events
    if profile.top_rage_tweets:
        lines.append("## Top Rage Events")
        lines.append("| Date | Virulence | Outrage | Ad Hominem | Text |")
        lines.append("| --- | ---: | ---: | ---: | --- |")
        for evt in profile.top_rage_tweets[:10]:
            # Keep only the first ten characters of the stringified timestamp.
            date = str(evt.get("created_at", ""))[:10]
            # Tweets routinely contain newlines; _md_cell keeps the row on a
            # single line and only adds "..." when it actually truncated.
            text = _md_cell(evt.get("text"), limit=80)
            lines.append(
                f"| {date} | {evt.get('composite_virulence', 0):.3f} "
                f"| {evt.get('outrage_intensity', 0):.3f} "
                f"| {evt.get('ad_hominem', 0):.3f} "
                f"| {text} |"
            )
        lines.append("")

    # Methodology
    lines.extend(
        [
            "## Methodology",
            "- **Embeddings**: Qwen/Qwen3-Embedding-0.6B (MTEB #1 under 1B params)",
            "- **Sentiment**: cardiffnlp/twitter-roberta-base-sentiment-latest",
            "- **Emotion**: cardiffnlp/twitter-roberta-base-emotion (anger/joy/optimism/sadness)",
            "- **Offensive**: cardiffnlp/twitter-roberta-base-offensive",
            "- **Irony**: cardiffnlp/twitter-roberta-base-irony",
            "- **Hate speech**: cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
            "- **Toxicity**: s-nlp/roberta_toxicity_classifier",
            "- **Behavioral**: Temporal/metadata features with sigmoid-scaled scoring",
            "",
        ]
    )

    # Disclaimers
    lines.extend(
        [
            "## Disclaimers",
            "- This analysis does not constitute a clinical diagnosis.",
            "- Classifier scores are probabilistic and subject to error.",
            "- UTC timestamps may not reflect the poster's local timezone.",
            "- Models trained on general Twitter data, not political speech specifically.",
            "",
        ]
    )

    text = "\n".join(lines)

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Tweet text may hold emoji and other non-ASCII characters; a locale
        # default encoding such as cp1252 would raise UnicodeEncodeError.
        with open(p, "w", encoding="utf-8") as f:
            f.write(text)
        log.info("Markdown report saved to %s", p)

    return text