| """ |
| DeepSeek-powered tweet labeler — generates training labels for custom |
| classification dimensions that off-the-shelf models don't cover well. |
| |
| Target dimensions (not well-served by CardiffNLP): |
| 1. Ragebait virulence (0-1) — is this tweet designed to provoke outrage? |
| 2. Compulsive engagement bait (0-1) — rhetorical questions, polls, "who agrees?" |
| 3. Tribal signaling (0-1) — in-group/out-group framing, us-vs-them |
| 4. Performative outrage (0-1) — outrage as spectacle vs genuine anger |
| 5. Epistemic manipulation (0-1) — misleading framing, cherry-picking, straw-manning |
| 6. Agency language (0-1) — active, agentic phrasing vs passive/victimhood framing |
| |
| Uses DeepSeek as annotator, then human-validates a sample, then trains |
| SetFit classifiers to distill into deterministic models. |
| """ |
import json
import logging
import os
import time
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm import tqdm

from .config import DATA_DIR
|
|
log = logging.getLogger(__name__)

# SECURITY: API credentials must never be committed to source control. The key
# is read from the DEEPSEEK_API_KEY environment variable; an empty string means
# "not configured". Any key that was ever committed to this file should be
# treated as compromised and revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL = "deepseek-chat"
|
|
| |
# Prompt template sent to DeepSeek for each tweet. It is filled in with
# str.format(), so its only brace-delimited fields are {tweet_text},
# {senator_name} and {party}; any literal brace added to this text must be
# doubled ("{{" / "}}") or formatting will raise.
LABELING_PROMPT = """You are an expert annotator for political social media analysis.
Rate this tweet from a US senator on the following dimensions.
Return ONLY a JSON object with scores from 0.0 to 1.0.

Dimensions:
- ragebait: Is this tweet designed to provoke outrage or anger in readers? (0=informational, 1=pure ragebait)
- engagement_bait: Does it use tricks to drive engagement — rhetorical questions, polls, "who agrees?", calls to action? (0=none, 1=heavy)
- tribal_signal: Does it use us-vs-them framing, in-group/out-group language, partisan tribalism? (0=none, 1=strong)
- performative_outrage: Is the outrage theatrical/performative rather than substantive? (0=genuine/none, 1=pure performance)
- epistemic_manipulation: Does it mislead through cherry-picking, straw-manning, false equivalence, or decontextualization? (0=fair, 1=heavily manipulative)
- agency_language: Does the author use active, agentic language (I did, we will) vs passive/victimhood framing (they did this to us)? (0=passive/victim, 1=agentic)

Tweet: "{tweet_text}"

Senator: {senator_name} ({party})

Return ONLY valid JSON, no explanation:"""
|
|
|
|
class DeepSeekLabeler:
    """
    Use the DeepSeek API to generate soft labels for custom classification dimensions.

    Each tweet is scored from 0.0 to 1.0 on every entry of ``DIMENSIONS``.
    These labels become training data for distilled SetFit classifiers.
    """

    # Every dimension requested by LABELING_PROMPT, in output order.
    DIMENSIONS = (
        "ragebait",
        "engagement_bait",
        "tribal_signal",
        "performative_outrage",
        "epistemic_manipulation",
        "agency_language",
    )
    # Neutral score used when the model omits a dimension or a call fails.
    DEFAULT_SCORE = 0.5
    # Tweets are cut to this many characters before being sent and stored.
    MAX_TWEET_CHARS = 500
    # Number of rows between intermediate checkpoint writes in label_batch.
    CHECKPOINT_EVERY = 100

    def __init__(self, api_key: Optional[str] = None):
        """
        Create the OpenAI-compatible client pointed at the DeepSeek endpoint.

        Args:
            api_key: DeepSeek API key. When omitted, the module-level
                DEEPSEEK_API_KEY is used (resolved at call time, not import time).

        Raises:
            ValueError: if no API key is available.
            ImportError: if the ``openai`` package is not installed.
        """
        if api_key is None:
            api_key = DEEPSEEK_API_KEY
        if not api_key:
            # An empty key would only fail later, on every single request, and
            # label_tweet would mask that behind fallback scores.
            raise ValueError(
                "No DeepSeek API key provided: pass api_key or set DEEPSEEK_API_KEY"
            )

        try:
            from openai import OpenAI
        except ImportError as exc:
            raise ImportError("Install openai: pip install openai") from exc

        self.client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
        self.model = DEEPSEEK_MODEL

    @classmethod
    def _parse_labels(cls, content: Optional[str]) -> dict:
        """
        Turn a raw model reply into a dict of clamped scores.

        Args:
            content: Text returned by the model (may be None or empty).

        Returns:
            Dict with one float in [0.0, 1.0] per entry of ``DIMENSIONS``;
            dimensions missing from the reply get ``DEFAULT_SCORE``.

        Raises:
            ValueError: if the reply holds no parsable JSON object
                (``json.JSONDecodeError`` is a ``ValueError``) or a score
                is not numeric text.
            TypeError: if a score has a non-numeric type (e.g. null or a list).
        """
        payload = (content or "").strip()

        # The reply may be wrapped in a ``` fence or surrounded by prose; keep
        # only the outermost {...} span before parsing.
        start, end = payload.find("{"), payload.rfind("}")
        if start != -1 and end > start:
            payload = payload[start:end + 1]

        labels = json.loads(payload)
        if not isinstance(labels, dict):
            raise ValueError("Expected a JSON object of dimension scores")

        scores = {}
        for key in cls.DIMENSIONS:
            value = float(labels.get(key, cls.DEFAULT_SCORE))
            if value != value:
                # NaN compares unequal to itself; min()/max() would silently
                # turn it into 1.0, so fall back to the neutral score instead.
                value = cls.DEFAULT_SCORE
            scores[key] = max(0.0, min(1.0, value))
        return scores

    def label_tweet(self, text: str, senator_name: str = "", party: str = "") -> dict:
        """
        Label a single tweet.

        Args:
            text: Tweet text; only the first ``MAX_TWEET_CHARS`` characters are sent.
            senator_name: Author name inserted into the prompt ("Unknown" if empty).
            party: Author party inserted into the prompt ("Unknown" if empty).

        Returns:
            Dict mapping each dimension to a score in [0.0, 1.0]. If the API
            call or the parsing fails, a warning is logged and every dimension
            is set to ``DEFAULT_SCORE`` (best-effort: one bad tweet must not
            abort a whole batch).
        """
        prompt = LABELING_PROMPT.format(
            tweet_text=text[:self.MAX_TWEET_CHARS],
            senator_name=senator_name or "Unknown",
            party=party or "Unknown",
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200,
                temperature=0.1,
            )
            return self._parse_labels(response.choices[0].message.content)
        except Exception as e:
            log.warning("DeepSeek labeling failed for tweet: %s — %s", text[:50], e)
            return dict.fromkeys(self.DIMENSIONS, self.DEFAULT_SCORE)

    def label_batch(
        self,
        df: pd.DataFrame,
        text_col: str = "text",
        senator_name: str = "",
        party: str = "",
        max_tweets: Optional[int] = None,
        delay: float = 0.1,
        save_path: Optional[str] = None,
    ) -> pd.DataFrame:
        """
        Label a batch of tweets and return one row of scores per tweet.

        Args:
            df: DataFrame with tweets.
            text_col: Name of the column holding the tweet text.
            senator_name: Author name passed to every label_tweet call.
            party: Author party passed to every label_tweet call.
            max_tweets: Cap on number to label (for cost control); None means
                no cap, 0 means label nothing.
            delay: Seconds to sleep between API calls (rate limiting).
            save_path: Parquet file for intermediate and final results. If it
                already exists, its rows are treated as the first rows of this
                batch and labeling resumes after them.

        Returns:
            DataFrame with one column per dimension plus ``_index`` (position
            in the batch), ``text`` (truncated tweet) and, when the input has
            it, ``tweet_id`` (as a string).
        """
        # "is None" rather than truthiness so that max_tweets=0 really means zero.
        subset = df if max_tweets is None else df.head(max_tweets)
        n = len(subset)
        log.info("Labeling %d tweets via DeepSeek...", n)

        checkpoint = Path(save_path) if save_path else None
        results = []
        start_idx = 0
        if checkpoint is not None:
            # Create the directory before the loop: otherwise the first
            # periodic checkpoint fails and the labels gathered so far are lost.
            checkpoint.parent.mkdir(parents=True, exist_ok=True)
            if checkpoint.exists():
                existing = pd.read_parquet(checkpoint)
                start_idx = len(existing)
                results = existing.to_dict("records")
                log.info("Resuming from checkpoint: %d already labeled", start_idx)

        # Iterate only over the rows still to do; the bar starts at start_idx,
        # so feeding it the already-labeled rows again would double-count them.
        remaining = subset.iloc[start_idx:]
        progress = tqdm(
            remaining.iterrows(), total=n, desc="DeepSeek labeling", initial=start_idx
        )
        for i, (_, row) in enumerate(progress, start=start_idx):
            text = str(row[text_col])
            labels = self.label_tweet(text, senator_name=senator_name, party=party)
            labels["_index"] = i
            labels["text"] = text[:self.MAX_TWEET_CHARS]
            if "tweet_id" in row:
                labels["tweet_id"] = str(row["tweet_id"])
            results.append(labels)

            if checkpoint is not None and (i + 1) % self.CHECKPOINT_EVERY == 0:
                pd.DataFrame(results).to_parquet(checkpoint, index=False)
                log.info("Checkpoint saved: %d/%d", i + 1, n)

            # The delay separates consecutive calls; nothing follows the last row.
            if delay > 0 and i + 1 < n:
                time.sleep(delay)

        result_df = pd.DataFrame(results)

        if checkpoint is not None:
            result_df.to_parquet(checkpoint, index=False)
            log.info("Labels saved to %s", save_path)

        log.info(
            "Labeling complete: %d tweets, %d dimensions",
            len(result_df), len(self.DIMENSIONS),
        )
        return result_df
|
|
|
|
def build_training_set(
    labeled_df: pd.DataFrame,
    dimension: str,
    threshold_positive: float = 0.7,
    threshold_negative: float = 0.3,
    max_per_class: int = 500,
) -> pd.DataFrame:
    """
    Convert soft DeepSeek labels into a binary training set for SetFit.

    Tweets scoring >= threshold_positive are positive examples (label 1).
    Tweets scoring <= threshold_negative are negative examples (label 0).
    Scores strictly in between, and missing scores, are excluded (ambiguous).
    At most max_per_class rows are kept per class, taken in input order.

    Args:
        labeled_df: Labeled tweets; must contain ``dimension`` and "text" columns.
        dimension: Name of the score column to binarize.
        threshold_positive: Inclusive lower bound for positive examples.
        threshold_negative: Inclusive upper bound for negative examples.
        max_per_class: Maximum number of rows kept for each class.

    Returns:
        Shuffled DataFrame (fixed seed) with exactly the columns "text" and "label".

    Raises:
        ValueError: if ``dimension`` is not a column of ``labeled_df``, or if
            threshold_negative is not strictly below threshold_positive.
    """
    if dimension not in labeled_df.columns:
        raise ValueError(f"Dimension '{dimension}' not in labeled data")
    if threshold_negative >= threshold_positive:
        # With overlapping (inclusive) thresholds the same tweet would be
        # emitted twice, once as label 1 and once as label 0.
        raise ValueError(
            "threshold_negative must be strictly less than threshold_positive "
            f"(got {threshold_negative} and {threshold_positive})"
        )

    scores = labeled_df[dimension]
    is_positive = scores >= threshold_positive
    is_negative = scores <= threshold_negative

    pos = labeled_df[is_positive].head(max_per_class).copy()
    neg = labeled_df[is_negative].head(max_per_class).copy()
    pos["label"] = 1
    neg["label"] = 0

    training = pd.concat([pos, neg], ignore_index=True).sample(frac=1, random_state=42)

    # Count ambiguity from the masks: rows dropped by the max_per_class cap
    # are confident examples, not ambiguous ones.
    ambiguous = int((~(is_positive | is_negative)).sum())
    log.info(
        "Training set for '%s': %d positive, %d negative (excluded %d ambiguous)",
        dimension, len(pos), len(neg), ambiguous,
    )
    return training[["text", "label"]]
|
|
|
|
def train_setfit_classifier(
    training_df: pd.DataFrame,
    dimension: str,
    base_model: str = "sentence-transformers/all-mpnet-base-v2",
    output_dir: Optional[str] = None,
) -> tuple:
    """
    Train a SetFit classifier on DeepSeek-generated labels.

    The data is split 80/20 (fixed seed) into train and evaluation sets, the
    model is trained, evaluated, and optionally saved.

    Args:
        training_df: DataFrame with "text" and "label" columns, e.g. the
            output of build_training_set.
        dimension: Name of the dimension being learned; used in log messages
            and in the saved model's directory name.
        base_model: Sentence-transformer checkpoint to start from.
        output_dir: If given, the model is saved to
            ``<output_dir>/setfit_<dimension>``.

    Returns:
        ``(model, metrics)``: the trained SetFit model and the result of
        ``trainer.evaluate()``.

    Raises:
        ValueError: if ``training_df`` lacks a required column or contains
            fewer than two distinct labels (which includes being empty).
    """
    # Validate before the heavy imports so bad input fails fast and clearly.
    missing = {"text", "label"} - set(training_df.columns)
    if missing:
        raise ValueError(
            f"training_df is missing required column(s): {sorted(missing)}"
        )
    if training_df["label"].nunique() < 2:
        raise ValueError(
            f"Training data for '{dimension}' must contain at least two classes"
        )

    from datasets import Dataset
    from setfit import SetFitModel, Trainer, TrainingArguments

    # preserve_index=False: a shuffled pandas index would otherwise be carried
    # into the dataset as an extra "__index_level_0__" column.
    ds = Dataset.from_pandas(training_df, preserve_index=False)
    train_test = ds.train_test_split(test_size=0.2, seed=42)

    model = SetFitModel.from_pretrained(base_model)

    args = TrainingArguments(
        batch_size=16,
        num_epochs=1,
        num_iterations=5,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_test["train"],
        eval_dataset=train_test["test"],
    )

    log.info("Training SetFit classifier for '%s'...", dimension)
    trainer.train()

    metrics = trainer.evaluate()
    log.info("Evaluation metrics for '%s': %s", dimension, metrics)

    if output_dir:
        save_path = Path(output_dir) / f"setfit_{dimension}"
        model.save_pretrained(str(save_path))
        log.info("Model saved to %s", save_path)

    return model, metrics
|
|