Spaces:
Sleeping
Sleeping
Commit ·
0823edb
1
Parent(s): 5a92002
ML_POSITIVE_LABELS
Browse files- checker.py +45 -10
checker.py
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
"""
|
| 2 |
-
checker.py — core logic for Image + Text Compliance Check
|
| 3 |
-
"""
|
| 4 |
from __future__ import annotations
|
| 5 |
from pathlib import Path
|
| 6 |
from typing import List, Optional, Dict, Any, Iterable, Union, Tuple
|
|
@@ -40,6 +37,12 @@ USE_TINY_ML = os.getenv("USE_TINY_ML", "1") == "1"
|
|
| 40 |
HF_REPO = os.getenv("HF_REPO", "tlogandesigns/fairhousing-bert-tiny")
|
| 41 |
HF_THRESH = float(os.getenv("HF_THRESH", "0.75"))
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
BASE_DIR = Path(__file__).parent
|
| 44 |
PHRASES_PATH = Path(os.getenv("PHRASES_PATH", str(BASE_DIR / "phrases.yaml")))
|
| 45 |
|
|
@@ -93,14 +96,12 @@ def contains_disclaimer(text: str, disclaimer: str) -> bool:
|
|
| 93 |
|
| 94 |
return squeeze(disclaimer) in squeeze(text)
|
| 95 |
|
| 96 |
-
|
| 97 |
@dataclass
|
| 98 |
class Rule:
|
| 99 |
regex: re.Pattern
|
| 100 |
category: str
|
| 101 |
suggests: list[str]
|
| 102 |
|
| 103 |
-
|
| 104 |
PHRASE_RULES: list[Rule] = []
|
| 105 |
PHRASES_ERROR: Optional[str] = None
|
| 106 |
|
|
@@ -173,6 +174,33 @@ if USE_TINY_ML:
|
|
| 173 |
)
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def fair_housing_flags(text: str) -> List[str]:
|
| 177 |
flags: List[str] = []
|
| 178 |
t = (text or "")[:1500]
|
|
@@ -185,11 +213,9 @@ def fair_housing_flags(text: str) -> List[str]:
|
|
| 185 |
flags.append(rule.category)
|
| 186 |
if hf_pipe:
|
| 187 |
try:
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
if (lbl in ("1", "LABEL_1", "violation", "POSITIVE")) and score >= HF_THRESH:
|
| 192 |
-
flags.append(f"MLFlag: model={HF_REPO} label={lbl} score={score:.2f}")
|
| 193 |
except Exception as e:
|
| 194 |
flags.append(f"MLFlag: inference error: {e}")
|
| 195 |
return flags
|
|
@@ -353,6 +379,13 @@ def run_check(
|
|
| 353 |
)
|
| 354 |
img_findings, img_spans = find_rule_matches(itxt)
|
| 355 |
ptxt_findings, ptxt_spans = find_rule_matches(ptxt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
results = {
|
| 357 |
"Fair_Housing": fair_housing_block,
|
| 358 |
"img": img_block,
|
|
@@ -371,6 +404,8 @@ def run_check(
|
|
| 371 |
"OCR": pytesseract is not None,
|
| 372 |
"Categories": sorted({r.category for r in PHRASE_RULES}),
|
| 373 |
"DisclaimerRequiredOnNonSocial": REQUIRE_DISCLAIMER_ON_NON_SOCIAL,
|
|
|
|
|
|
|
| 374 |
},
|
| 375 |
}
|
| 376 |
send_email_notification(results)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import List, Optional, Dict, Any, Iterable, Union, Tuple
|
|
|
|
| 37 |
HF_REPO = os.getenv("HF_REPO", "tlogandesigns/fairhousing-bert-tiny")
|
| 38 |
HF_THRESH = float(os.getenv("HF_THRESH", "0.75"))
|
| 39 |
|
| 40 |
+
ML_POSITIVE_LABELS = {
|
| 41 |
+
s.strip().lower()
|
| 42 |
+
for s in re.split(r"\s*,\s*", os.getenv("ML_POSITIVE_LABELS", "Potential Violation,violation,positive,LABEL_1,1"))
|
| 43 |
+
if s.strip()
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
BASE_DIR = Path(__file__).parent
|
| 47 |
PHRASES_PATH = Path(os.getenv("PHRASES_PATH", str(BASE_DIR / "phrases.yaml")))
|
| 48 |
|
|
|
|
| 96 |
|
| 97 |
return squeeze(disclaimer) in squeeze(text)
|
| 98 |
|
|
|
|
| 99 |
@dataclass
|
| 100 |
class Rule:
|
| 101 |
regex: re.Pattern
|
| 102 |
category: str
|
| 103 |
suggests: list[str]
|
| 104 |
|
|
|
|
| 105 |
PHRASE_RULES: list[Rule] = []
|
| 106 |
PHRASES_ERROR: Optional[str] = None
|
| 107 |
|
|
|
|
| 174 |
)
|
| 175 |
|
| 176 |
|
| 177 |
+
def _violation_score(pipe, text: str) -> float:
|
| 178 |
+
try:
|
| 179 |
+
preds = pipe(text, return_all_scores=True)
|
| 180 |
+
scores = {str(d["label"]).lower(): float(d["score"]) for d in preds[0]}
|
| 181 |
+
except TypeError:
|
| 182 |
+
preds = pipe(text)
|
| 183 |
+
if isinstance(preds, list) and preds:
|
| 184 |
+
p = preds[0]
|
| 185 |
+
label = str(p.get("label", "")).lower()
|
| 186 |
+
score = float(p.get("score", 0.0))
|
| 187 |
+
if label in ML_POSITIVE_LABELS:
|
| 188 |
+
return score
|
| 189 |
+
return score
|
| 190 |
+
return 0.0
|
| 191 |
+
except Exception:
|
| 192 |
+
return 0.0
|
| 193 |
+
for name in ML_POSITIVE_LABELS:
|
| 194 |
+
if name in scores:
|
| 195 |
+
return scores[name]
|
| 196 |
+
if "non-violation" in scores:
|
| 197 |
+
return 1.0 - scores["non-violation"]
|
| 198 |
+
candidates = {k: v for k, v in scores.items() if any(tok in k for tok in ("violat", "posit", "flag", "risk", "unsafe", "toxic"))}
|
| 199 |
+
if candidates:
|
| 200 |
+
return max(candidates.values())
|
| 201 |
+
return max(scores.values()) if scores else 0.0
|
| 202 |
+
|
| 203 |
+
|
| 204 |
def fair_housing_flags(text: str) -> List[str]:
|
| 205 |
flags: List[str] = []
|
| 206 |
t = (text or "")[:1500]
|
|
|
|
| 213 |
flags.append(rule.category)
|
| 214 |
if hf_pipe:
|
| 215 |
try:
|
| 216 |
+
score = _violation_score(hf_pipe, t)
|
| 217 |
+
if score >= HF_THRESH:
|
| 218 |
+
flags.append(f"MLFlag: model={HF_REPO} score={score:.2f}")
|
|
|
|
|
|
|
| 219 |
except Exception as e:
|
| 220 |
flags.append(f"MLFlag: inference error: {e}")
|
| 221 |
return flags
|
|
|
|
| 379 |
)
|
| 380 |
img_findings, img_spans = find_rule_matches(itxt)
|
| 381 |
ptxt_findings, ptxt_spans = find_rule_matches(ptxt)
|
| 382 |
+
model_labels = []
|
| 383 |
+
try:
|
| 384 |
+
if hf_pipe is not None and hasattr(hf_pipe, "model") and hasattr(hf_pipe.model, "config"):
|
| 385 |
+
labels_map = getattr(hf_pipe.model.config, "id2label", {}) or {}
|
| 386 |
+
model_labels = list(labels_map.values())
|
| 387 |
+
except Exception:
|
| 388 |
+
model_labels = []
|
| 389 |
results = {
|
| 390 |
"Fair_Housing": fair_housing_block,
|
| 391 |
"img": img_block,
|
|
|
|
| 404 |
"OCR": pytesseract is not None,
|
| 405 |
"Categories": sorted({r.category for r in PHRASE_RULES}),
|
| 406 |
"DisclaimerRequiredOnNonSocial": REQUIRE_DISCLAIMER_ON_NON_SOCIAL,
|
| 407 |
+
"ModelLabels": model_labels,
|
| 408 |
+
"MLPositiveLabels": sorted(list(ML_POSITIVE_LABELS)),
|
| 409 |
},
|
| 410 |
}
|
| 411 |
send_email_notification(results)
|