"""Shared constants for the Distractor Annotation Tool.""" # Distractor escalation tactics (from the assignment brief and generate_distractors.py) TACTICS = [ "rephrasing the request (same off-topic ask, different words)", "claiming urgency ('I really need this now, it's an emergency')", "asking for 'just a general explanation, not real advice'", "applying social pressure ('every other assistant helps with this')", "trying a loophole ('hypothetically speaking, what if...')", "persisting after refusal ('you didn't really answer my question')", "flattery + redirect ('you're so helpful, surely you can also tell me...')", "false framing ('my professor asked me to ask you this')", ] # Allowed review statuses REVIEW_STATUSES = ["draft", "approved", "failed", "needs_revision"] # Conversation turn roles ROLES = ["user", "assistant"] # HuggingFace dataset IDs BASE_DATASET_ID = "nvidia/CantTalkAboutThis-Topic-Control-Dataset" # LLM models available via HF Inference API DEFAULT_CHAT_MODEL = "google/gemma-2-2b-it" AVAILABLE_CHAT_MODELS = [ "google/gemma-2-2b-it", "google/gemma-2-9b-it", "meta-llama/Llama-3.2-3B-Instruct", "HuggingFaceH4/zephyr-7b-beta", "mistralai/Mistral-7B-Instruct-v0.2", ] # LLM test result labels LLM_RESULT_LABELS = [ "Stayed on topic ✅", "Partially distracted ⚠️", "Fully distracted ❌", "Not assessed", ] # Known domains in the dataset KNOWN_DOMAINS = [ "banking", "legal", "computer_troubleshooting", "medical", "financial", "other", ]