# jimnoneill's picture
# Upload folder using huggingface_hub
# 178b774 verified
"""
Pipeline configuration — model IDs, paths, thresholds.
"""
from pathlib import Path
# ── Paths ─────────────────────────────────────────────────────────
# Project root: two directory levels above this file's resolved location.
PROJECT_ROOT: Path = Path(__file__).resolve().parent.parent

# Standard sub-directories, all anchored at the project root.
DATA_DIR: Path = PROJECT_ROOT.joinpath("data")
MODELS_DIR: Path = PROJECT_ROOT.joinpath("models")
OUTPUT_DIR: Path = PROJECT_ROOT.joinpath("output")

# Location of the pre-existing "x_box" data on the Windows drive.
# When /mnt/c exists the POSIX-style mount path is used; otherwise the
# native drive-letter form is used.
# NOTE(review): /mnt/c is presumably the WSL mount of C: - confirm.
if Path("/mnt/c").exists():
    XBOX_DATA: Path = Path("/mnt/c/x_box")
else:
    XBOX_DATA = Path("C:/x_box")
# ── Embedding model (MTEB #1 under 1B) ────────────────────────────
# Primary embedding model ID, followed by the alternative to fall back on.
EMBEDDING_MODEL: str = "Qwen/Qwen3-Embedding-0.6B"
EMBEDDING_FALLBACK: str = "dunzhang/stella_en_400M_v5"

# Embedding vector width.
# NOTE(review): presumably the output size of EMBEDDING_MODEL; confirm
# that it also holds when EMBEDDING_FALLBACK is used.
EMBEDDING_DIM: int = 1024
# ── Classification models (CardiffNLP Twitter-RoBERTa suite) ──────
# Maps each classification task name to the ID of the model used for it.
# Insertion order is the order in which the tasks are listed here.
# NOTE(review): the IDs look like Hugging Face Hub repo names - confirm
# against the code that loads them.
CLASSIFIER_MODELS = {
    "sentiment": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "emotion": "cardiffnlp/twitter-roberta-base-emotion",
    "offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "irony": "cardiffnlp/twitter-roberta-base-irony",
    "hate": "cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
}
# ── Toxicity model ────────────────────────────────────────────────
# Model ID of the toxicity classifier, kept separate from CLASSIFIER_MODELS.
TOXICITY_MODEL: str = "s-nlp/roberta_toxicity_classifier"
# ── Senator data sources ──────────────────────────────────────────
# Raw-file URL of the legislators' social-media YAML in the
# unitedstates/congress-legislators repository (main branch).
CONGRESS_LEGISLATORS_URL: str = (
    "https://raw.githubusercontent.com/"
    "unitedstates/congress-legislators/main/"
    "legislators-social-media.yaml"
)

# Raw-file URL of the current-legislators YAML in the same repository.
CONGRESS_LEGISLATORS_CURRENT_URL: str = (
    "https://raw.githubusercontent.com/"
    "unitedstates/congress-legislators/main/"
    "legislators-current.yaml"
)

# Identifier of the senator tweets dataset.
# NOTE(review): presumably a Hugging Face Hub dataset ID - confirm.
SENATOR_TWEETS_DATASET: str = "m-newhauser/senator-tweets"
# ── Behavioral thresholds ─────────────────────────────────────────
# Gap, in minutes, after which a new session starts.
SESSION_GAP_MINUTES: int = 30

# Bounds of the "night" window, given as hours in UTC.
# NOTE(review): whether the end hour is inclusive depends on the consumer
# of these values - confirm there.
NIGHT_START_HOUR: int = 0
NIGHT_END_HOUR: int = 6

# Length, in minutes, of the window used for burst detection.
BURST_WINDOW_MINUTES: int = 60
# ── Virulence score weights ───────────────────────────────────────
# Weight applied to each component of the virulence score, keyed by
# component name. The eight weights add up to 1.0.
VIRULENCE_WEIGHTS = {
    "sentiment_negative": 0.15,
    "emotion_anger": 0.20,
    "offensive": 0.20,
    "toxicity": 0.15,
    "hate": 0.10,
    "irony": 0.05,
    "engagement_controversy": 0.10,
    "burst_bonus": 0.05,
}
# ── Compulsion score weights ──────────────────────────────────────
# Weight applied to each component of the compulsion score, keyed by
# component name. The seven weights add up to 1.0.
COMPULSION_WEIGHTS = {
    "activity": 0.20,
    "burstiness": 0.25,
    "night_activity": 0.15,
    "session_intensity": 0.15,
    "reply_reactivity": 0.10,
    "repetition": 0.10,
    "emoji_media_sparsity": 0.05,
}
# ── Batch sizes ───────────────────────────────────────────────────
# Number of items per batch for the embedding and classification steps.
EMBEDDING_BATCH_SIZE: int = 16
CLASSIFICATION_BATCH_SIZE: int = 32