jimnoneill
/

xbox-compulsion-classifier

Text Classification

social-media-analysis

compulsion-detection

political-tweets

bayesian-classifier

digital-phenotyping

Model card Files Files and versions

xbox-compulsion-classifier / xbox /config.py

jimnoneill's picture

Upload folder using huggingface_hub

178b774 verified 2 months ago

history blame contribute delete

3.29 kB

	"""
	Pipeline configuration — model IDs, paths, thresholds.
	"""
	from pathlib import Path

	# ── Paths ──────────────────────────────────────────────────────────
	# Project root: the parent of the directory that contains this file.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	DATA_DIR = PROJECT_ROOT.joinpath("data")
	MODELS_DIR = PROJECT_ROOT.joinpath("models")
	OUTPUT_DIR = PROJECT_ROOT.joinpath("output")

	# Existing data on the Windows C: drive. When /mnt/c exists (e.g. the drive
	# is mounted under a POSIX environment) use that view; otherwise fall back
	# to the native drive-letter path.
	if Path("/mnt/c").exists():
	    XBOX_DATA = Path("/mnt/c/x_box")
	else:
	    XBOX_DATA = Path("C:/x_box")

	# ── Embedding model (MTEB #1 under 1B) ────────────────────────────
	# Model ID of the primary text-embedding model.
	EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-0.6B"
	# Embedding vector size; presumably the output width of EMBEDDING_MODEL —
	# confirm it also holds for EMBEDDING_FALLBACK before swapping models.
	EMBEDDING_DIM = 1024
	# Alternative model ID. When the fallback is selected is decided by the
	# loading code, which is not part of this module.
	EMBEDDING_FALLBACK = "dunzhang/stella_en_400M_v5"

	# ── Classification models (CardiffNLP Twitter-RoBERTa suite) ──────
	# Task name → model ID. One entry per classification task; insertion order
	# is preserved for any code that iterates over the mapping.
	CLASSIFIER_MODELS = {
	    "sentiment": "cardiffnlp/twitter-roberta-base-sentiment-latest",
	    "emotion": "cardiffnlp/twitter-roberta-base-emotion",
	    "offensive": "cardiffnlp/twitter-roberta-base-offensive",
	    "irony": "cardiffnlp/twitter-roberta-base-irony",
	    "hate": "cardiffnlp/twitter-roberta-base-hate-multiclass-latest",
	}

	# ── Toxicity model ────────────────────────────────────────────────
	# Model ID of the toxicity classifier; configured separately from the
	# CLASSIFIER_MODELS mapping.
	TOXICITY_MODEL = "s-nlp/roberta_toxicity_classifier"

	# ── Senator data sources ──────────────────────────────────────────
	# Raw YAML files served from the unitedstates/congress-legislators
	# repository (main branch): social-media accounts and current legislators.
	CONGRESS_LEGISLATORS_URL = (
	    "https://raw.githubusercontent.com/"
	    "unitedstates/congress-legislators/main/"
	    "legislators-social-media.yaml"
	)
	CONGRESS_LEGISLATORS_CURRENT_URL = (
	    "https://raw.githubusercontent.com/"
	    "unitedstates/congress-legislators/main/"
	    "legislators-current.yaml"
	)
	# Identifier of the senator-tweets dataset.
	SENATOR_TWEETS_DATASET = "m-newhauser/senator-tweets"

	# ── Behavioral thresholds ─────────────────────────────────────────
	# Timing parameters for the behavioral features. How each bound is applied
	# (e.g. inclusive vs. exclusive) is decided by the consuming code, which is
	# not part of this module.
	SESSION_GAP_MINUTES = 30  # gap, in minutes, after which a new session begins
	NIGHT_START_HOUR = 0  # start of the night window, hour of day in UTC
	NIGHT_END_HOUR = 6  # end of the night window, hour of day in UTC
	BURST_WINDOW_MINUTES = 60  # length, in minutes, of the burst-detection window

	# ── Virulence score weights ───────────────────────────────────────
	# Per-component weights for the virulence score. The values sum to 1.0;
	# keep that invariant when retuning individual weights.
	VIRULENCE_WEIGHTS = {
	    "sentiment_negative": 0.15,
	    "emotion_anger": 0.20,
	    "offensive": 0.20,
	    "toxicity": 0.15,
	    "hate": 0.10,
	    "irony": 0.05,
	    "engagement_controversy": 0.10,
	    "burst_bonus": 0.05,
	}

	# ── Compulsion score weights ──────────────────────────────────────
	# Per-component weights for the compulsion score. The values sum to 1.0;
	# keep that invariant when retuning individual weights.
	COMPULSION_WEIGHTS = {
	    "activity": 0.20,
	    "burstiness": 0.25,
	    "night_activity": 0.15,
	    "session_intensity": 0.15,
	    "reply_reactivity": 0.10,
	    "repetition": 0.10,
	    "emoji_media_sparsity": 0.05,
	}

	# ── Batch sizes ───────────────────────────────────────────────────
	# Items per batch for the classification and embedding steps respectively.
	# NOTE(review): presumably tuned for available memory — confirm before raising.
	CLASSIFICATION_BATCH_SIZE = 32
	EMBEDDING_BATCH_SIZE = 16