# Menon-nb-bert-base-v2 / inference_rules.py
# RozaA: "Upload folder using huggingface_hub" (commit bc25b1d, verified)
"""
Inference-time gate for the relevance scorer.
PURPOSE
-------
Some procurement notices have descriptions like "Se konkurransegrunnlag" or are
empty entirely. The model can't classify what isn't there. Instead of returning
a (low-confidence) prediction, the pipeline returns "needs_review" so a human
can fetch the missing content from the linked documents/website.
USE
---
The same gate must be applied in both places:
  1. Training-data filtering — drop rows where needs_review() returns flag=True.
  2. Inference time — when flag is True, skip the model call and return "needs_review".
This keeps training and serving aligned.
USAGE
-----
    from inference_rules import needs_review

    flag, reason = needs_review(kort_beskrivelse)
    if flag:
        return {"label": "needs_review", "reason": reason}
    # else: run the model
"""
import re
from langdetect import DetectorFactory, detect
# Pin langdetect's seed so detect() gives the same answer for the same text on
# every run (the library's README describes detection as non-deterministic
# unless DetectorFactory.seed is set before the first call).
DetectorFactory.seed = 0
# Descriptions with fewer characters than this are routed to human review.
MIN_LEN = 30

# langdetect codes considered close enough to Norwegian Bokmål for the model:
#   "no" - Norwegian itself.
#   "da" - Danish; mutually intelligible with Norwegian, nb-bert-base handles it.
#   "sv" - Swedish; close enough that langdetect often confuses it with Norwegian.
# Any other detected language is routed to human review (production
# assumption: the scraper has already translated foreign-language leads).
NORWEGIAN_READABLE = {"sv", "da", "no"}

# Placeholder descriptions that are listed both with and without a final dot.
_PHRASES_WITH_DOT_VARIANT = (
    "se tittel",
    "tittelen sier vel alt",
    "se konkurransegrunnlag",
    "se vedlegg",
    "se dokumentene",
    "se nettside",
    "se utlysning",
    "se anbudsdokumenter",
    "rammeavtale",
)

# Placeholder descriptions that are listed without a final dot only.
_PHRASES_WITHOUT_DOT_VARIANT = (
    "se dokumentene som ble sendt på mail",
    "se henvendelse på e-post",
    "se henvendelse på epost",
)

# Lowercased placeholder phrases (text == one of these, after strip+lower).
PLACEHOLDER_PHRASES = {
    phrase + ending
    for phrase in _PHRASES_WITH_DOT_VARIANT
    for ending in ("", ".")
} | set(_PHRASES_WITHOUT_DOT_VARIANT)
# Regex patterns for placeholder wording. A description that matches one of
# these is treated as a placeholder only when it is also short (needs_review
# applies the patterns to texts under 80 characters).
#
# The alternatives in the first pattern are word *stems*: "dokument" covers
# "dokumenter", "dokumentene" and "dokumentet", the same way "anbud" covers
# "anbudsdokumenter". The pattern is anchored at the start of the text, so a
# sentence that merely mentions "se vedlegg" later on does not match it.
PLACEHOLDER_PATTERNS = [
    re.compile(
        r"^se\s+(tittel|konkurransegrunnlag|vedlegg|dokument|nettside"
        r"|utlysning|anbud|henvendelse)",
        re.IGNORECASE,
    ),
    re.compile(r"tittelen sier vel alt", re.IGNORECASE),
    re.compile(r"sjekk (dokumentene|vedlegg|nettsiden|websiden)", re.IGNORECASE),
    re.compile(r"check (the doc|website|attachment)", re.IGNORECASE),
    re.compile(r"read (website|the website|the doc)", re.IGNORECASE),
]
def needs_review(text):
    """Decide whether a description must go to human review instead of the model.

    The checks run in this order and the first one that fires wins:

    1. ``"empty"`` - ``text`` is None, blank after stripping, or the literal
       string "nan" in any casing.
    2. ``"too_short(len=N)"`` - fewer than ``MIN_LEN`` characters after
       stripping.
    3. ``"placeholder_phrase"`` - equal to an entry of ``PLACEHOLDER_PHRASES``
       once the text is lowercased and trailing dots are dropped on both sides.
    4. ``"placeholder_match(...)"`` - shorter than 80 characters and matched by
       one of ``PLACEHOLDER_PATTERNS``; the reason carries the first 30
       characters of the pattern that matched.
    5. ``"non_norwegian(xx)"`` - ``detect`` reports a language outside
       ``NORWEGIAN_READABLE`` for the first 500 characters.

    Args:
        text: The description to inspect. Values that are not strings are
            converted with ``str()``.

    Returns:
        A ``(flag, reason)`` tuple: ``(True, reason)`` when the description
        should NOT be sent to the model, ``(False, "ok")`` otherwise.
    """
    candidate = "" if text is None else str(text).strip()
    if not candidate or candidate.lower() == "nan":
        return True, "empty"

    size = len(candidate)
    if size < MIN_LEN:
        return True, f"too_short(len={size})"

    # NOTE: texts shorter than MIN_LEN never get this far, so a short
    # placeholder is reported as too_short rather than placeholder_phrase.
    normalized = candidate.lower().strip().rstrip(".").strip()
    if any(normalized == phrase.rstrip(".") for phrase in PLACEHOLDER_PHRASES):
        return True, "placeholder_phrase"

    # Patterns only count for short descriptions: a long description that
    # *mentions* "se vedlegg" inside a real sentence is fine.
    if size < 80:
        matched = next(
            (pat for pat in PLACEHOLDER_PATTERNS if pat.search(candidate)),
            None,
        )
        if matched is not None:
            return True, f"placeholder_match({matched.pattern[:30]})"

    # Final check: language. The production assumption is that the scraper has
    # already translated foreign-language leads into Norwegian, so any other
    # language arriving here is unexpected and goes to human review.
    verdict = (False, "ok")
    try:
        language = detect(candidate[:500])
        if language not in NORWEGIAN_READABLE:
            verdict = (True, f"non_norwegian({language})")
    except Exception:
        # Deliberate best effort: a detection failure must not block the
        # description, so it is treated as Norwegian.
        pass
    return verdict
if __name__ == "__main__":
    # Smoke run: print the gate's verdict for a handful of representative
    # inputs, one line per input as "<flag> <reason> | <repr of input>".
    samples = (
        "",
        None,
        " ",
        "Se konkurransegrunnlag",
        "Se tittel.",
        "Tittelen sier vel alt.",
        "Rammeavtale",
        "Sjekk dokumentene",
        "Anskaffelse av samfunnsøkonomisk analyse for transportforskning innen evaluering.",
        "Short text",
        # Starts with "Se vedlegg" but is long, so it is expected to pass.
        "Se vedlegg for full beskrivelse av kontraktens innhold inkludert alle leveranser.",
        # English, expected to be flagged by the language check.
        "TRANSQ is a joint qualification system for Scandinavian transport suppliers.",
        # Finnish, expected to be flagged by the language check.
        "Hilma on Suomen julkisten hankintojen ilmoituskanava ja keskitetty palvelu.",
    )
    for sample in samples:
        flag, reason = needs_review(sample)
        print(f"{flag!s:<6} {reason:<35} | {sample!r}")