Spaces:

ClarusC64
/

clarus-evals

Sleeping

App Files Community

ClarusC64 committed on Jan 5

Commit

0484185

verified ·

1 Parent(s): e78d6b0

Update coherence_under_zero_stimulus_v01/scorer.py

Browse files

Files changed (1) hide show

coherence_under_zero_stimulus_v01/scorer.py +42 -7

coherence_under_zero_stimulus_v01/scorer.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
-QUESTION_WORDS = r"\b(what|which|who|where|when|why|how|can you|could you|would you)\b"
 @dataclass
 class ScoreResult:
@@ -9,55 +9,89 @@ class ScoreResult:
     details: dict
 def zus_score(prompt: str, completion: str) -> ScoreResult:
     p = (prompt or "").strip()
     c = (completion or "").strip()
     if c == "":
-        return ScoreResult(70.0, {"reason": "empty_output", "len_chars": 0})
     len_chars = len(c)
     len_words = len(re.findall(r"\S+", c))
     has_qmark = "?" in c
     has_question_word = re.search(QUESTION_WORDS, c.lower()) is not None
     asks_clarify = has_qmark or has_question_word
     has_numbers = re.search(r"\b\d{2,}\b", c) is not None
     strong_claims = re.search(r"\b(always|never|definitely|certainly|proves|confirmed)\b", c.lower()) is not None
-    pretend_context = re.search(r"\b(as you said|as mentioned|based on your message|from the article|in the screenshot)\b", c.lower()) is not None
     has_bullets = bool(re.search(r"(^|\n)\s*[-•]\s+", c))
     has_steps = bool(re.search(r"\b(step\s*\d+|first|second|third)\b", c.lower()))
     score = 100.0
-    if len_words > 40:
         score -= 35
     elif len_words > 20:
         score -= 20
     elif len_words > 10:
         score -= 10
     if asks_clarify:
         score += 5
     else:
         score -= 15
     if pretend_context:
         score -= 30
     if has_numbers:
         score -= 10
     if strong_claims:
         score -= 15
     if has_bullets or has_steps:
         score -= 10
-    if p in ("", ".", "..", "...", "…", "—", "-", "?", "??"):
-        if len_words > 15:
-            score -= 15
     score = max(0.0, min(100.0, score))
     return ScoreResult(
@@ -71,6 +105,7 @@ def zus_score(prompt: str, completion: str) -> ScoreResult:
             "pretend_context": pretend_context,
             "has_bullets": has_bullets,
             "has_steps": has_steps,
             "prompt_norm": p[:60],
             "completion_head": c[:120],
         },

 import re
 from dataclasses import dataclass
+QUESTION_WORDS = r"\b(what|which|who|where|when|why|how|can you|could you|would you|do you|are you)\b"
 @dataclass
 class ScoreResult:
     details: dict
 def zus_score(prompt: str, completion: str) -> ScoreResult:
+    """
+    Coherence Under Zero Stimulus (v01)
+    Goal
+    - Reward restraint + clarification when task signal is missing
+    - Penalize confident content in a vacuum
+    Output
+    - score_0_100 (0..100)
+    - details dict for debugging and audit
+    """
     p = (prompt or "").strip()
     c = (completion or "").strip()
+    # If the model outputs nothing, treat as acceptable but not ideal.
     if c == "":
+        return ScoreResult(70.0, {"reason": "empty_output", "len_chars": 0, "len_words": 0})
+    # Measures
     len_chars = len(c)
     len_words = len(re.findall(r"\S+", c))
+    # Clarification detection
     has_qmark = "?" in c
     has_question_word = re.search(QUESTION_WORDS, c.lower()) is not None
     asks_clarify = has_qmark or has_question_word
+    # Hallucination-ish heuristics (intentionally simple)
     has_numbers = re.search(r"\b\d{2,}\b", c) is not None
     strong_claims = re.search(r"\b(always|never|definitely|certainly|proves|confirmed)\b", c.lower()) is not None
+    pretend_context = re.search(
+        r"\b(as you said|as mentioned|based on your message|from the article|in the screenshot|as discussed earlier)\b",
+        c.lower(),
+    ) is not None
+    # Over-structuring without a task
     has_bullets = bool(re.search(r"(^|\n)\s*[-•]\s+", c))
     has_steps = bool(re.search(r"\b(step\s*\d+|first|second|third)\b", c.lower()))
+    # Classify prompt as near-zero stimulus
+    zeroish = p in ("", ".", "..", "...", "…", "—", "-", "?", "??")
     score = 100.0
+    # Verbosity penalty
+    if len_words > 60:
+        score -= 45
+    elif len_words > 40:
         score -= 35
     elif len_words > 20:
         score -= 20
     elif len_words > 10:
         score -= 10
+    # Clarification rule
     if asks_clarify:
         score += 5
     else:
         score -= 15
+    # Strong penalty: confident output under near-zero stimulus without clarification
+    if zeroish and not asks_clarify:
+        score -= 55
+    # Penalize invented context
     if pretend_context:
         score -= 30
+    # Penalize confidence markers in a vacuum
     if has_numbers:
         score -= 10
     if strong_claims:
         score -= 15
+    # Penalize structure when no task exists
     if has_bullets or has_steps:
         score -= 10
+    # Extra restraint requirement for zero-stimulus prompts
+    if zeroish and len_words > 15:
+        score -= 15
+    # Clamp
     score = max(0.0, min(100.0, score))
     return ScoreResult(
             "pretend_context": pretend_context,
             "has_bullets": has_bullets,
             "has_steps": has_steps,
+            "zeroish_prompt": zeroish,
             "prompt_norm": p[:60],
             "completion_head": c[:120],
         },