jimnoneill's picture
Upload folder using huggingface_hub
178b774 verified
"""
Report generation — JSON and Markdown output for senator profiles.
"""
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

from .fusion import SenatorProfile

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
def generate_json_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> dict:
    """Build a JSON-serializable report from a SenatorProfile.

    Args:
        profile: Source of the senator identity, score, distribution and
            top-tweet fields that are copied into the report.
        output_path: Optional destination file. When non-empty, missing
            parent directories are created and the report is written as
            UTF-8 JSON with two-space indentation.

    Returns:
        The report dictionary. It is returned whether or not a file was
        written.
    """
    # datetime.utcnow() is deprecated since Python 3.12, so take an aware UTC
    # "now" instead. The tzinfo is dropped before formatting so the emitted
    # string keeps its "<naive ISO-8601>Z" shape rather than gaining "+00:00".
    generated_at = (
        datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
    )

    report = {
        "meta": {
            "generated_at": generated_at,
            "pipeline_version": "0.1.0",
            "model_stack": {
                "embeddings": "Qwen/Qwen3-Embedding-0.6B",
                "sentiment": "cardiffnlp/twitter-roberta-base-sentiment-latest",
                "emotion": "cardiffnlp/twitter-roberta-base-emotion",
                "offensive": "cardiffnlp/twitter-roberta-base-offensive",
                "irony": "cardiffnlp/twitter-roberta-base-irony",
                "hate": "cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
                "toxicity": "s-nlp/roberta_toxicity_classifier",
            },
        },
        "senator": {
            "name": profile.senator_name,
            "twitter_handle": profile.twitter_handle,
            "party": profile.party,
            "state": profile.state,
        },
        "summary": {
            "n_tweets_analyzed": profile.n_tweets_analyzed,
            "date_range": profile.date_range,
            "compulsion_score": profile.compulsion_score,
            "virulence_score": profile.virulence_score,
            "overall_risk_score": profile.overall_risk_score,
        },
        "compulsion": {
            "score": profile.compulsion_score,
            "subscores": profile.compulsion_subscores,
        },
        "virulence": {
            "score": profile.virulence_score,
            "subscores": profile.virulence_subscores,
            "distribution": profile.virulence_distribution,
        },
        "classification_detail": {
            "sentiment_distribution": profile.sentiment_distribution,
            "emotion_distribution": profile.emotion_distribution,
            "toxicity_stats": profile.toxicity_stats,
        },
        "top_rage_tweets": profile.top_rage_tweets,
        "disclaimers": [
            "This analysis does not constitute a clinical diagnosis of addiction, "
            "compulsion, or mental health condition.",
            "Scores are derived from automated classifiers with known error rates "
            "and should not be treated as ground truth.",
            "Temporal analysis uses UTC timestamps which may not reflect the "
            "poster's local timezone.",
            "Classifier models were trained on general Twitter data, not "
            "specifically on political speech.",
        ],
    }

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8 so the file does not depend on the platform's
        # locale encoding.
        with open(p, "w", encoding="utf-8") as f:
            # default=str: values json cannot encode natively are written as
            # their str() form instead of raising TypeError.
            json.dump(report, f, indent=2, default=str)
        log.info("JSON report saved to %s", p)

    return report
def _md_table_cell(raw, limit: int = 80) -> str:
    """Return *raw* as a single-line, pipe-escaped Markdown table cell.

    Whitespace runs (including newlines) are collapsed to single spaces so
    the value cannot split a table row; the text is clipped to *limit*
    characters and "..." is appended only when something was clipped.
    """
    flat = " ".join(str(raw or "").split())
    suffix = "..." if len(flat) > limit else ""
    # Escape after clipping so a cut can never land between "\" and "|".
    return flat[:limit].replace("|", "\\|") + suffix


def generate_markdown_report(
    profile: SenatorProfile,
    output_path: Optional[str] = None,
) -> str:
    """Render a SenatorProfile as a Markdown report.

    Args:
        profile: Source of the senator identity, score, distribution and
            top-tweet fields shown in the report.
        output_path: Optional destination file. When non-empty, missing
            parent directories are created and the report is written as
            UTF-8 text.

    Returns:
        The full Markdown document as one newline-joined string. It is
        returned whether or not a file was written.
    """
    lines = [
        f"# X-Box Analysis: {profile.senator_name}",
        f"**@{profile.twitter_handle}** | {profile.party} | {profile.state}",
        "",
    ]

    # Summary
    lines.extend([
        "## Summary",
        f"- **Tweets analyzed**: {profile.n_tweets_analyzed:,}",
        f"- **Date range**: {profile.date_range}",
        f"- **Compulsion score**: {profile.compulsion_score}/100",
        f"- **Virulence score**: {profile.virulence_score}/100",
        f"- **Overall risk score**: {profile.overall_risk_score}/100",
        "",
    ])

    # Compulsion breakdown
    lines.extend([
        "## Compulsion-Like Behavior",
        "| Dimension | Score |",
        "| --- | ---: |",
    ])
    for k, v in profile.compulsion_subscores.items():
        lines.append(f"| {k.replace('_', ' ').title()} | {v} |")
    lines.append("")

    # Virulence breakdown
    lines.extend([
        "## Virulence Analysis",
        "| Dimension | Score |",
        "| --- | ---: |",
    ])
    for k, v in profile.virulence_subscores.items():
        lines.append(f"| {k.replace('_', ' ').title()} | {v} |")
    lines.append("")

    # Classification detail (each section is skipped when its data is empty)
    if profile.sentiment_distribution:
        lines.extend([
            "### Sentiment Distribution",
            "| Label | Share |",
            "| --- | ---: |",
        ])
        for k, v in sorted(profile.sentiment_distribution.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    if profile.emotion_distribution:
        lines.extend([
            "### Emotion Distribution",
            "| Emotion | Share |",
            "| --- | ---: |",
        ])
        for k, v in sorted(profile.emotion_distribution.items()):
            lines.append(f"| {k} | {v:.1%} |")
        lines.append("")

    if profile.toxicity_stats:
        tox = profile.toxicity_stats
        lines.extend([
            "### Toxicity",
            f"- Mean toxicity score: {tox.get('mean', 0):.4f}",
            f"- % classified toxic: {tox.get('pct_toxic', 0):.2f}%",
            f"- P90 toxicity: {tox.get('p90', 0):.4f}",
            "",
        ])

    # Top rage events (at most the first ten entries)
    if profile.top_rage_tweets:
        lines.extend([
            "## Top Rage Events",
            "| Date | Virulence | Outrage | Ad Hominem | Text |",
            "| --- | ---: | ---: | ---: | --- |",
        ])
        for evt in profile.top_rage_tweets[:10]:
            date = str(evt.get("created_at", ""))[:10]
            cell = _md_table_cell(evt.get("text"))
            # "or 0" lets a missing or None metric format as 0.000 instead
            # of raising inside the format spec.
            lines.append(
                f"| {date} | {evt.get('composite_virulence') or 0:.3f} "
                f"| {evt.get('outrage_intensity') or 0:.3f} "
                f"| {evt.get('ad_hominem') or 0:.3f} "
                f"| {cell} |"
            )
        lines.append("")

    # Methodology
    lines.extend([
        "## Methodology",
        "- **Embeddings**: Qwen/Qwen3-Embedding-0.6B (MTEB #1 under 1B params)",
        "- **Sentiment**: cardiffnlp/twitter-roberta-base-sentiment-latest",
        "- **Emotion**: cardiffnlp/twitter-roberta-base-emotion (anger/joy/optimism/sadness)",
        "- **Offensive**: cardiffnlp/twitter-roberta-base-offensive",
        "- **Irony**: cardiffnlp/twitter-roberta-base-irony",
        "- **Hate speech**: cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
        "- **Toxicity**: s-nlp/roberta_toxicity_classifier",
        "- **Behavioral**: Temporal/metadata features with sigmoid-scaled scoring",
        "",
    ])

    # Disclaimers
    lines.extend([
        "## Disclaimers",
        "- This analysis does not constitute a clinical diagnosis.",
        "- Classifier scores are probabilistic and subject to error.",
        "- UTC timestamps may not reflect the poster's local timezone.",
        "- Models trained on general Twitter data, not political speech specifically.",
        "",
    ])

    text = "\n".join(lines)

    if output_path:
        p = Path(output_path)
        p.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: tweet text may contain emoji or other non-ASCII
        # characters that a locale-default encoding could fail to encode.
        with open(p, "w", encoding="utf-8") as f:
            f.write(text)
        log.info("Markdown report saved to %s", p)

    return text