File size: 7,542 Bytes
178b774
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
Report generation — JSON and Markdown output for senator profiles.
"""
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional

from .fusion import SenatorProfile

log = logging.getLogger(__name__)


def generate_json_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> dict:
    """Build the JSON report for one senator and optionally save it to disk.

    The report is a plain dict with the top-level keys ``meta``, ``senator``,
    ``summary``, ``compulsion``, ``virulence``, ``classification_detail``,
    ``top_rage_tweets`` and ``disclaimers``. Profile values are copied as-is
    from the attributes of ``profile``; nothing is recomputed here.

    Args:
        profile: Object whose attributes supply every senator-specific value.
        output_path: When truthy, the report is also written to this path as
            indented JSON. Missing parent directories are created.

    Returns:
        The report dictionary, whether or not a file was written.
    """
    # Function-scope import so this block does not depend on a change to the
    # module's import header.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. An aware "now" in UTC
    # renders with a "+00:00" offset; swapping it for "Z" keeps the timestamp
    # format exactly as it was before.
    generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    report = {
        "meta": {
            "generated_at": generated_at,
            "pipeline_version": "0.1.0",
            "model_stack": {
                "embeddings": "Qwen/Qwen3-Embedding-0.6B",
                "sentiment": "cardiffnlp/twitter-roberta-base-sentiment-latest",
                "emotion": "cardiffnlp/twitter-roberta-base-emotion",
                "offensive": "cardiffnlp/twitter-roberta-base-offensive",
                "irony": "cardiffnlp/twitter-roberta-base-irony",
                "hate": "cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
                "toxicity": "s-nlp/roberta_toxicity_classifier",
            },
        },
        "senator": {
            "name": profile.senator_name,
            "twitter_handle": profile.twitter_handle,
            "party": profile.party,
            "state": profile.state,
        },
        "summary": {
            "n_tweets_analyzed": profile.n_tweets_analyzed,
            "date_range": profile.date_range,
            "compulsion_score": profile.compulsion_score,
            "virulence_score": profile.virulence_score,
            "overall_risk_score": profile.overall_risk_score,
        },
        "compulsion": {
            "score": profile.compulsion_score,
            "subscores": profile.compulsion_subscores,
        },
        "virulence": {
            "score": profile.virulence_score,
            "subscores": profile.virulence_subscores,
            "distribution": profile.virulence_distribution,
        },
        "classification_detail": {
            "sentiment_distribution": profile.sentiment_distribution,
            "emotion_distribution": profile.emotion_distribution,
            "toxicity_stats": profile.toxicity_stats,
        },
        "top_rage_tweets": profile.top_rage_tweets,
        "disclaimers": [
            "This analysis does not constitute a clinical diagnosis of addiction, "
            "compulsion, or mental health condition.",
            "Scores are derived from automated classifiers with known error rates "
            "and should not be treated as ground truth.",
            "Temporal analysis uses UTC timestamps which may not reflect the "
            "poster's local timezone.",
            "Classifier models were trained on general Twitter data, not "
            "specifically on political speech.",
        ],
    }

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the file does not depend on the platform's
        # default locale encoding.
        with open(p, "w", encoding="utf-8") as f:
            # default=str: values json cannot encode natively are written as
            # their str() form instead of raising TypeError.
            json.dump(report, f, indent=2, default=str)
        log.info("JSON report saved to %s", p)

    return report


def _score_table(subscores) -> list:
    """Return Markdown table lines for a ``{dimension_key: score}`` mapping."""
    table = ["| Dimension | Score |", "| --- | ---: |"]
    for key, score in subscores.items():
        # snake_case keys are displayed in Title Case (``foo_bar`` -> ``Foo Bar``).
        table.append(f"| {key.replace('_', ' ').title()} | {score} |")
    return table


def _share_table(first_column: str, distribution) -> list:
    """Return Markdown table lines for a ``{label: fraction}`` mapping, sorted by item."""
    table = [f"| {first_column} | Share |", "| --- | ---: |"]
    table.extend(
        f"| {label} | {share:.1%} |" for label, share in sorted(distribution.items())
    )
    return table


def _table_cell(raw, limit: int = 80) -> str:
    """Turn arbitrary text into a single-line, pipe-escaped Markdown table cell."""
    # A raw newline inside a cell terminates the table row, so all runs of
    # whitespace (including newlines and tabs) are collapsed to single spaces.
    # ``raw or ""`` also tolerates a missing/None text value.
    flat = " ".join(str(raw or "").split())
    # Only mark the cell with an ellipsis when something was actually cut off.
    suffix = "..." if len(flat) > limit else ""
    return flat[:limit].replace("|", "\\|") + suffix


def _rage_event_row(evt) -> str:
    """Return one "Top Rage Events" table row for a single event mapping."""
    # Keep only the first 10 characters of the timestamp's string form.
    date = str(evt.get("created_at", ""))[:10]
    return (
        f"| {date} | {evt.get('composite_virulence', 0):.3f} "
        f"| {evt.get('outrage_intensity', 0):.3f} "
        f"| {evt.get('ad_hominem', 0):.3f} "
        f"| {_table_cell(evt.get('text', ''))} |"
    )


def generate_markdown_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> str:
    """Render a Markdown report for one senator and optionally save it to disk.

    Sections, in order: title, summary, compulsion and virulence sub-score
    tables, optional sentiment/emotion/toxicity detail (each emitted only when
    the corresponding profile attribute is truthy), up to ten "top rage"
    events, a fixed methodology list and fixed disclaimers.

    Args:
        profile: Object whose attributes supply every senator-specific value.
        output_path: When truthy, the Markdown is also written to this path as
            UTF-8. Missing parent directories are created.

    Returns:
        The full Markdown document as a single string ending in a newline.
    """
    lines = [
        f"# X-Box Analysis: {profile.senator_name}",
        f"**@{profile.twitter_handle}** | {profile.party} | {profile.state}",
        "",
        # Summary
        "## Summary",
        f"- **Tweets analyzed**: {profile.n_tweets_analyzed:,}",
        f"- **Date range**: {profile.date_range}",
        f"- **Compulsion score**: {profile.compulsion_score}/100",
        f"- **Virulence score**: {profile.virulence_score}/100",
        f"- **Overall risk score**: {profile.overall_risk_score}/100",
        "",
        # Compulsion breakdown
        "## Compulsion-Like Behavior",
        *_score_table(profile.compulsion_subscores),
        "",
        # Virulence breakdown
        "## Virulence Analysis",
        *_score_table(profile.virulence_subscores),
        "",
    ]

    # Classification detail
    if profile.sentiment_distribution:
        lines.append("### Sentiment Distribution")
        lines.extend(_share_table("Label", profile.sentiment_distribution))
        lines.append("")

    if profile.emotion_distribution:
        lines.append("### Emotion Distribution")
        lines.extend(_share_table("Emotion", profile.emotion_distribution))
        lines.append("")

    if profile.toxicity_stats:
        tox = profile.toxicity_stats
        lines.extend(
            [
                "### Toxicity",
                f"- Mean toxicity score: {tox.get('mean', 0):.4f}",
                f"- % classified toxic: {tox.get('pct_toxic', 0):.2f}%",
                f"- P90 toxicity: {tox.get('p90', 0):.4f}",
                "",
            ]
        )

    # Top rage events (at most the first ten entries are shown)
    if profile.top_rage_tweets:
        lines.append("## Top Rage Events")
        lines.append("| Date | Virulence | Outrage | Ad Hominem | Text |")
        lines.append("| --- | ---: | ---: | ---: | --- |")
        lines.extend(_rage_event_row(evt) for evt in profile.top_rage_tweets[:10])
        lines.append("")

    lines.extend(
        [
            # Methodology
            "## Methodology",
            "- **Embeddings**: Qwen/Qwen3-Embedding-0.6B (MTEB #1 under 1B params)",
            "- **Sentiment**: cardiffnlp/twitter-roberta-base-sentiment-latest",
            "- **Emotion**: cardiffnlp/twitter-roberta-base-emotion (anger/joy/optimism/sadness)",
            "- **Offensive**: cardiffnlp/twitter-roberta-base-offensive",
            "- **Irony**: cardiffnlp/twitter-roberta-base-irony",
            "- **Hate speech**: cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
            "- **Toxicity**: s-nlp/roberta_toxicity_classifier",
            "- **Behavioral**: Temporal/metadata features with sigmoid-scaled scoring",
            "",
            # Disclaimers
            "## Disclaimers",
            "- This analysis does not constitute a clinical diagnosis.",
            "- Classifier scores are probabilistic and subject to error.",
            "- UTC timestamps may not reflect the poster's local timezone.",
            "- Models trained on general Twitter data, not political speech specifically.",
            "",
        ]
    )

    report = "\n".join(lines)

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Tweet text can contain emoji and other non-ASCII characters; an
        # explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
        # default locale encoding cannot represent them.
        p.write_text(report, encoding="utf-8")
        log.info("Markdown report saved to %s", p)

    return report