# Page header captured with this file (not part of the module itself):
#   TemperCheck / tempercheck/prompt.py
#   Joseph Antolick
#   Make NSFW flag a genuine visual check
#   7be85a5
#   Raw | History | Blame | Contribute | Delete
#   6.68 kB
"""Prompt construction and structured-output parsing for TemperCheck.
The model is asked to return a single JSON object. Small VLMs drift from
requested formats, so `parse_verdict` is deliberately defensive: it extracts the
first JSON object it can find and clamps/falls back on every field.
"""
from __future__ import annotations
import json
import re
from dataclasses import dataclass, field
from typing import Any
# The whole app hangs off this contract. If you change the shape, update
# parse_verdict, the Gradio output rendering, and tests/test_parsing.py together.
#
# NOTE: the dashes in the prompt are literal em dashes (U+2014). Keep this file
# saved as UTF-8; if they ever show up as a three-character garbage sequence the
# file has been decoded with the wrong codec and the model will be fed noise.
SYSTEM_PROMPT = """\
You are TemperCheck, a playful but sharp-eyed party-game judge. Given a photo or
screenshot of a social-media profile, you estimate — for entertainment only — how
short-tempered / "cranky to deal with" the person seems. This is a whimsical
novelty read, NOT a real personality assessment or a factual claim about anyone.
Judge mainly by what the person CHOSE TO PUT in their profile (bio text, display
name, handle, captions, stated attitude); treat facial expression and overall
vibe as a lighter, secondary signal. Read the profile text carefully and look for:
1. HOSTILITY & SARCASM — a biting, contemptuous, passive-aggressive, or sarcastic
tone; insults; mocking others; "I don't care what you think" energy.
2. PRIDE IN CONFLICT — does the profile brag about upsetting people, starting
fights, blocking/roasting/"destroying" others, or being "brutally honest"?
Treating confrontation as a personality is a strong temper signal.
3. EXPLICIT WARNING SIGNS stated right in the profile — slurs, hateful or
demeaning language, open anger, ALL-CAPS ranting, relentless negativity or
doom, or harsh criticism aimed at other people.
The more of these appear, and the more prominent they are, the higher the score.
A warm, friendly, or neutral profile with none of them scores low. If there is
little readable text, lean on overall vibe, say so, and keep the score mid-range.
Score guide (0 = delightful, 100 = run):
0-20 warm/friendly, no red flags
21-40 mostly pleasant, minor edge
41-60 mixed signals, or too little to tell
61-80 clearly prickly: sarcasm, negativity, or pride-in-conflict present
81-100 multiple strong warning signs (e.g. hostility + slurs/anger + conflict-pride)
Two extra content flags, independent of temper — ALWAYS add the tag when present:
- NSFW — inspect the ACTUAL IMAGE as carefully as the text, and flag adult
content from EITHER source. Add the signal "NSFW" if the picture shows nudity,
sexual or fetish content, underwear/lingerie as the focus, or other explicit or
adult visual material, OR if the text is sexually suggestive ("18+", "spicy
content", adult-site links). This visual check OVERRIDES the "text-first"
guidance above: flag NSFW based on the image even when the bio text is perfectly
innocuous. In your rationale, say whether the NSFW signal came from the image,
the text, or both.
- If the profile mentions crypto (Bitcoin, Ethereum, NFTs, tokens, "$TICKER",
"to the moon", web3, "diamond hands", trading shills), add the signal "crypto".
Put any NSFW/crypto tags FIRST in the signals list so they are never dropped.
Respond with ONLY a single JSON object, no prose before or after, of the form:
{
"score": <integer 0-100, higher = shorter-tempered / crankier>,
"verdict": "<3-6 word punchy label>",
"rationale": "<1-2 sentences naming the specific signals you actually saw>",
"signals": ["<short flag>", "<short flag>", "<short flag>"]
}
Each signal must be a concrete thing you observed, e.g. "sarcastic bio", "brags
about blocking people", "ALL-CAPS anger", "slur in bio", or "warm smile, kind bio".
Stay good-natured: you are reading the profile's stated content and vibe, never
mocking how someone looks."""
# Per-request user turn that accompanies the image. The dashes are literal em
# dashes (U+2014); the file must stay UTF-8 for them to survive.
USER_INSTRUCTION = (
    "Read this profile — its bio/handle text first, then the overall vibe — and "
    "return the temper JSON. Be playful, not mean."
)
# 0-100 score bucket -> emoji label used by the UI.
# Each entry is (inclusive ceiling, label), in ascending order, so the buckets
# line up with the score guide in the prompt: 0-20, 21-40, 41-60, 61-80, 81-100.
# The last ceiling (101) is deliberately above the maximum score as a catch-all.
# NOTE(review): the "Bit prickly" emoji is kept exactly as found; confirm it is
# the intended glyph.
SCORE_BANDS = [
    (20, "😇 Sunshine"),
    (40, "🙂 Easygoing"),
    (60, "😐 Depends on the day"),
    (80, "😀 Bit prickly"),
    (101, "🌋 Approach with coffee"),
]


@dataclass
class TemperVerdict:
    """Structured result of one temper read.

    Fields:
        score: temper score; ``band`` expects it on the 0-100 scale.
        verdict: short label for the read.
        rationale: brief explanation of the score.
        signals: short flags backing the score (defaults to an empty list).
        raw: original model text, kept for debugging / agent traces.
    """

    score: int
    verdict: str
    rationale: str
    signals: list[str] = field(default_factory=list)
    raw: str = ""  # original model text, for debugging / agent traces

    @property
    def band(self) -> str:
        """Return the SCORE_BANDS label whose range contains ``score``.

        Ceilings are inclusive, so a score of exactly 20/40/60/80 stays in the
        lower band, matching the ranges the prompt gives the model. A score
        above every ceiling falls back to the last (highest) band.
        """
        for ceiling, label in SCORE_BANDS:
            if self.score <= ceiling:
                return label
        return SCORE_BANDS[-1][1]
def build_messages(image_ref: Any) -> list[dict]:
    """Build the two-turn chat payload (system, then user) for one image.

    Follows the Gemma 4 multimodal message layout: the user turn carries the
    image part first and the text instruction second. ``image_ref`` is passed
    through untouched as the value of the image part.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    image_part = {"type": "image", "image": image_ref}
    instruction_part = {"type": "text", "text": USER_INSTRUCTION}
    user_turn = {"role": "user", "content": [image_part, instruction_part]}
    return [system_turn, user_turn]
def _first_json_object(text: str) -> dict | None:
"""Find the first balanced {...} block and parse it, tolerating junk around it."""
start = text.find("{")
while start != -1:
depth = 0
for i in range(start, len(text)):
if text[i] == "{":
depth += 1
elif text[i] == "}":
depth -= 1
if depth == 0:
try:
return json.loads(text[start : i + 1])
except json.JSONDecodeError:
break
start = text.find("{", start + 1)
return None
def parse_verdict(text: str) -> TemperVerdict:
    """Parse model output into a TemperVerdict, never raising on bad output.

    The first JSON object found in *text* supplies the fields; anything missing,
    null, empty, or unparseable falls back to a default:

    * score: int/float/str accepted, rounded, clamped to 0-100; 50 on failure.
    * verdict / rationale: stripped text, with a placeholder when absent.
    * signals: list of non-empty stripped strings, capped at six entries.

    The unmodified *text* is kept on the result as ``raw``.
    """
    data = _first_json_object(text) or {}

    # score: accept int/float/str, clamp to 0-100, default mid on failure.
    raw_score = data.get("score", 50)
    try:
        score = int(round(float(raw_score)))
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinities (e.g. JSON `Infinity` or `1e999`),
        # which float() accepts but cannot be converted to an int.
        digits = re.search(r"-?\d+", str(raw_score))
        score = int(digits.group()) if digits else 50
    score = max(0, min(100, score))

    def text_field(key: str, default: str) -> str:
        # JSON null must count as "missing", not become the string "None".
        value = data.get(key)
        cleaned = "" if value is None else str(value).strip()
        return cleaned or default

    signals = data.get("signals")
    if signals is None:
        signals = []
    elif not isinstance(signals, list):
        # A lone value (e.g. a single string) is treated as a one-item list.
        signals = [signals]
    stripped = [str(item).strip() for item in signals if item is not None]
    signals = [item for item in stripped if item][:6]

    return TemperVerdict(
        score=score,
        verdict=text_field("verdict", "Inscrutable"),
        rationale=text_field("rationale", "The model kept its cards close."),
        signals=signals,
        raw=text,
    )