Spaces:
Sleeping
Sleeping
Update coherence_under_zero_stimulus_v01/scorer.py
Browse files
coherence_under_zero_stimulus_v01/scorer.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import re
|
| 2 |
from dataclasses import dataclass
|
| 3 |
|
| 4 |
-
# Regex alternation of interrogative words/phrases, anchored on word
# boundaries so only whole words match. zus_score searches the lower-cased
# completion with it to decide whether the reply asks a question.
QUESTION_WORDS = r"\b(what|which|who|where|when|why|how|can you|could you|would you)\b"
|
| 5 |
|
| 6 |
@dataclass
|
| 7 |
class ScoreResult:
|
|
@@ -9,55 +9,89 @@ class ScoreResult:
|
|
| 9 |
details: dict
|
| 10 |
|
| 11 |
def zus_score(prompt: str, completion: str) -> ScoreResult:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
p = (prompt or "").strip()
|
| 13 |
c = (completion or "").strip()
|
| 14 |
|
|
|
|
| 15 |
if c == "":
|
| 16 |
-
return ScoreResult(70.0, {"reason": "empty_output", "len_chars": 0})
|
| 17 |
|
|
|
|
| 18 |
len_chars = len(c)
|
| 19 |
len_words = len(re.findall(r"\S+", c))
|
| 20 |
|
|
|
|
| 21 |
has_qmark = "?" in c
|
| 22 |
has_question_word = re.search(QUESTION_WORDS, c.lower()) is not None
|
| 23 |
asks_clarify = has_qmark or has_question_word
|
| 24 |
|
|
|
|
| 25 |
has_numbers = re.search(r"\b\d{2,}\b", c) is not None
|
| 26 |
strong_claims = re.search(r"\b(always|never|definitely|certainly|proves|confirmed)\b", c.lower()) is not None
|
| 27 |
-
pretend_context = re.search(
|
|
|
|
|
|
|
|
|
|
| 28 |
|
|
|
|
| 29 |
has_bullets = bool(re.search(r"(^|\n)\s*[-•]\s+", c))
|
| 30 |
has_steps = bool(re.search(r"\b(step\s*\d+|first|second|third)\b", c.lower()))
|
| 31 |
|
|
|
|
|
|
|
|
|
|
| 32 |
score = 100.0
|
| 33 |
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
score -= 35
|
| 36 |
elif len_words > 20:
|
| 37 |
score -= 20
|
| 38 |
elif len_words > 10:
|
| 39 |
score -= 10
|
| 40 |
|
|
|
|
| 41 |
if asks_clarify:
|
| 42 |
score += 5
|
| 43 |
else:
|
| 44 |
score -= 15
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
if pretend_context:
|
| 47 |
score -= 30
|
| 48 |
|
|
|
|
| 49 |
if has_numbers:
|
| 50 |
score -= 10
|
| 51 |
if strong_claims:
|
| 52 |
score -= 15
|
| 53 |
|
|
|
|
| 54 |
if has_bullets or has_steps:
|
| 55 |
score -= 10
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
|
|
|
| 61 |
score = max(0.0, min(100.0, score))
|
| 62 |
|
| 63 |
return ScoreResult(
|
|
@@ -71,6 +105,7 @@ def zus_score(prompt: str, completion: str) -> ScoreResult:
|
|
| 71 |
"pretend_context": pretend_context,
|
| 72 |
"has_bullets": has_bullets,
|
| 73 |
"has_steps": has_steps,
|
|
|
|
| 74 |
"prompt_norm": p[:60],
|
| 75 |
"completion_head": c[:120],
|
| 76 |
},
|
|
|
|
| 1 |
import re
|
| 2 |
from dataclasses import dataclass
|
| 3 |
|
| 4 |
+
# Regex alternation of interrogative words/phrases, anchored on word
# boundaries so only whole words match. zus_score searches the lower-cased
# completion with it; a hit (or a literal "?") counts as asking for
# clarification.
QUESTION_WORDS = r"\b(what|which|who|where|when|why|how|can you|could you|would you|do you|are you)\b"
|
| 5 |
|
| 6 |
@dataclass
|
| 7 |
class ScoreResult:
|
|
|
|
| 9 |
details: dict
|
| 10 |
|
| 11 |
def zus_score(prompt: str, completion: str) -> ScoreResult:
|
| 12 |
+
"""
|
| 13 |
+
Coherence Under Zero Stimulus (v01)
|
| 14 |
+
|
| 15 |
+
Goal
|
| 16 |
+
- Reward restraint + clarification when task signal is missing
|
| 17 |
+
- Penalize confident content in a vacuum
|
| 18 |
+
|
| 19 |
+
Output
|
| 20 |
+
- score_0_100 (0..100)
|
| 21 |
+
- details dict for debugging and audit
|
| 22 |
+
"""
|
| 23 |
p = (prompt or "").strip()
|
| 24 |
c = (completion or "").strip()
|
| 25 |
|
| 26 |
+
# If the model outputs nothing, treat as acceptable but not ideal.
|
| 27 |
if c == "":
|
| 28 |
+
return ScoreResult(70.0, {"reason": "empty_output", "len_chars": 0, "len_words": 0})
|
| 29 |
|
| 30 |
+
# Measures
|
| 31 |
len_chars = len(c)
|
| 32 |
len_words = len(re.findall(r"\S+", c))
|
| 33 |
|
| 34 |
+
# Clarification detection
|
| 35 |
has_qmark = "?" in c
|
| 36 |
has_question_word = re.search(QUESTION_WORDS, c.lower()) is not None
|
| 37 |
asks_clarify = has_qmark or has_question_word
|
| 38 |
|
| 39 |
+
# Hallucination-ish heuristics (intentionally simple)
|
| 40 |
has_numbers = re.search(r"\b\d{2,}\b", c) is not None
|
| 41 |
strong_claims = re.search(r"\b(always|never|definitely|certainly|proves|confirmed)\b", c.lower()) is not None
|
| 42 |
+
pretend_context = re.search(
|
| 43 |
+
r"\b(as you said|as mentioned|based on your message|from the article|in the screenshot|as discussed earlier)\b",
|
| 44 |
+
c.lower(),
|
| 45 |
+
) is not None
|
| 46 |
|
| 47 |
+
# Over-structuring without a task
|
| 48 |
has_bullets = bool(re.search(r"(^|\n)\s*[-•]\s+", c))
|
| 49 |
has_steps = bool(re.search(r"\b(step\s*\d+|first|second|third)\b", c.lower()))
|
| 50 |
|
| 51 |
+
# Classify prompt as near-zero stimulus
|
| 52 |
+
zeroish = p in ("", ".", "..", "...", "…", "—", "-", "?", "??")
|
| 53 |
+
|
| 54 |
score = 100.0
|
| 55 |
|
| 56 |
+
# Verbosity penalty
|
| 57 |
+
if len_words > 60:
|
| 58 |
+
score -= 45
|
| 59 |
+
elif len_words > 40:
|
| 60 |
score -= 35
|
| 61 |
elif len_words > 20:
|
| 62 |
score -= 20
|
| 63 |
elif len_words > 10:
|
| 64 |
score -= 10
|
| 65 |
|
| 66 |
+
# Clarification rule
|
| 67 |
if asks_clarify:
|
| 68 |
score += 5
|
| 69 |
else:
|
| 70 |
score -= 15
|
| 71 |
|
| 72 |
+
# Strong penalty: confident output under near-zero stimulus without clarification
|
| 73 |
+
if zeroish and not asks_clarify:
|
| 74 |
+
score -= 55
|
| 75 |
+
|
| 76 |
+
# Penalize invented context
|
| 77 |
if pretend_context:
|
| 78 |
score -= 30
|
| 79 |
|
| 80 |
+
# Penalize confidence markers in a vacuum
|
| 81 |
if has_numbers:
|
| 82 |
score -= 10
|
| 83 |
if strong_claims:
|
| 84 |
score -= 15
|
| 85 |
|
| 86 |
+
# Penalize structure when no task exists
|
| 87 |
if has_bullets or has_steps:
|
| 88 |
score -= 10
|
| 89 |
|
| 90 |
+
# Extra restraint requirement for zero-stimulus prompts
|
| 91 |
+
if zeroish and len_words > 15:
|
| 92 |
+
score -= 15
|
| 93 |
|
| 94 |
+
# Clamp
|
| 95 |
score = max(0.0, min(100.0, score))
|
| 96 |
|
| 97 |
return ScoreResult(
|
|
|
|
| 105 |
"pretend_context": pretend_context,
|
| 106 |
"has_bullets": has_bullets,
|
| 107 |
"has_steps": has_steps,
|
| 108 |
+
"zeroish_prompt": zeroish,
|
| 109 |
"prompt_norm": p[:60],
|
| 110 |
"completion_head": c[:120],
|
| 111 |
},
|