File size: 4,641 Bytes
"""
Inference-time gate for the relevance scorer.
PURPOSE
-------
Some procurement notices have descriptions like "Se konkurransegrunnlag" or are
empty entirely. The model can't classify what isn't there. Instead of returning
a (low-confidence) prediction, the pipeline returns "needs_review" so a human
can fetch the missing content from the linked documents/website.
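WHERE TO APPLY
--------------
The same gate must be applied in both places:
  1. Training-data filtering — drop rows where the flag returned by
     needs_review() is True.
  2. Inference time — skip the model call and return "needs_review".
Note that needs_review() returns a (flag, reason) tuple, which is always
truthy; test the flag, not the tuple.
This keeps training and serving aligned.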
USAGE
-----
    from inference_rules import needs_review

    flag, reason = needs_review(kort_beskrivelse)
    if flag:
        return {"label": "needs_review", "reason": reason}
    # else: run the model
"""
import re
from langdetect import DetectorFactory, detect
DetectorFactory.seed = 0
# Minimum description length (in characters, after stripping whitespace).
# Anything shorter is routed to human review.
MIN_LEN = 30

# Language codes accepted as "close enough to Norwegian Bokmål" for the model:
#   no — Norwegian itself.
#   da — Danish; mutually intelligible with Norwegian, nb-bert-base handles it.
#   sv — Swedish; close enough that langdetect often confuses it with Norwegian.
# Every other detected language is routed to human review (production
# assumption: the scraper has already translated foreign-language leads).
NORWEGIAN_READABLE = {"no", "da", "sv"}

# Whole-text placeholders, stored lowercased. needs_review() compares the
# stripped, lowercased description against these with the trailing period
# removed on both sides, so the dotted variants are kept only for readability.
PLACEHOLDER_PHRASES = {
    # "see the title"
    "se tittel", "se tittel.",
    "tittelen sier vel alt", "tittelen sier vel alt.",
    # "see the tender documents / attachments"
    "se konkurransegrunnlag", "se konkurransegrunnlag.",
    "se vedlegg", "se vedlegg.",
    "se dokumentene", "se dokumentene.",
    "se dokumentene som ble sendt på mail",
    "se anbudsdokumenter", "se anbudsdokumenter.",
    # "see the e-mail / website / announcement"
    "se henvendelse på e-post",
    "se henvendelse på epost",
    "se nettside", "se nettside.",
    "se utlysning", "se utlysning.",
    # bare contract-type word with no actual description
    "rammeavtale", "rammeavtale.",
}

# Case-insensitive regexes for placeholder wording. The first one is anchored
# to the start of the text; the others may match anywhere. needs_review()
# only treats a match as a placeholder when the description is also short.
PLACEHOLDER_PATTERNS = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r"^se\s+(tittel|konkurransegrunnlag|vedlegg|dokumenter|nettside|utlysning|anbud|henvendelse)",
        r"tittelen sier vel alt",
        r"sjekk (dokumentene|vedlegg|nettsiden|websiden)",
        r"check (the doc|website|attachment)",
        r"read (website|the website|the doc)",
    )
]
def needs_review(text):
    """Decide whether a description must go to human review instead of the model.

    Checks run in this order; the first one that fires determines the reason:

      1. "empty"              — None, blank/whitespace-only, or the text "nan"
                                (case-insensitive).
      2. "placeholder_phrase" — the whole text, lowercased and with a trailing
                                period removed, equals an entry of
                                PLACEHOLDER_PHRASES.
      3. "placeholder_match(...)" — a PLACEHOLDER_PATTERNS regex matches and
                                the text is shorter than 80 characters.
      4. "too_short(len=N)"   — the stripped text is shorter than MIN_LEN.
      5. "non_norwegian(xx)"  — langdetect reports a language outside
                                NORWEGIAN_READABLE.

    The placeholder checks deliberately run before the length check. Nearly
    all placeholder phrases are shorter than MIN_LEN, so with the length check
    first they would be reported as "too_short" and the placeholder reasons
    would almost never be produced. The ordering only affects the reason
    string: any text shorter than MIN_LEN is flagged either way.

    Args:
        text: The description to check. Any object is accepted; it is
            converted with str() before inspection.

    Returns:
        A (flag, reason) tuple: (True, reason) if the description should NOT
        be sent to the model, otherwise (False, "ok").
    """
    if text is None:
        return True, "empty"

    s = str(text).strip()
    # "nan" is what str() produces for a float NaN — presumably a missing
    # value coming from a dataframe; verify against callers.
    if not s or s.lower() == "nan":
        return True, "empty"

    # Exact placeholder: compare with the trailing period ignored on both sides.
    normalized = s.lower().rstrip(".").strip()
    if normalized in {phrase.rstrip(".") for phrase in PLACEHOLDER_PHRASES}:
        return True, "placeholder_phrase"

    # Pattern placeholder: only for short texts — a long description that
    # merely *mentions* "se vedlegg" inside a real sentence is fine.
    if len(s) < 80:
        for pat in PLACEHOLDER_PATTERNS:
            if pat.search(s):
                return True, f"placeholder_match({pat.pattern[:30]})"

    if len(s) < MIN_LEN:
        return True, f"too_short(len={len(s)})"

    # Last check: language. Production assumption is that the scraper has
    # already translated foreign-language leads into Norwegian, so anything
    # else arriving here is unexpected and goes to human review.
    try:
        lang = detect(s[:500])
    except Exception:
        # Deliberate fail-open: if detection itself fails, assume Norwegian
        # rather than block the description.
        return False, "ok"

    if lang not in NORWEGIAN_READABLE:
        return True, f"non_norwegian({lang})"
    return False, "ok"
if __name__ == "__main__":
    # Manual smoke run: print the gate's verdict for a handful of
    # representative inputs, one per line, as "flag reason | repr(input)".
    samples = (
        # missing / blank input
        "",
        None,
        " ",
        # placeholder wording
        "Se konkurransegrunnlag",
        "Se tittel.",
        "Tittelen sier vel alt.",
        "Rammeavtale",
        "Sjekk dokumentene",
        # a genuine Norwegian description
        "Anskaffelse av samfunnsøkonomisk analyse for transportforskning innen evaluering.",
        # below the minimum length
        "Short text",
        # starts like a placeholder but is long → ok
        "Se vedlegg for full beskrivelse av kontraktens innhold inkludert alle leveranser.",
        # English → flagged
        "TRANSQ is a joint qualification system for Scandinavian transport suppliers.",
        # Finnish → flagged
        "Hilma on Suomen julkisten hankintojen ilmoituskanava ja keskitetty palvelu.",
    )
    for sample in samples:
        flagged, why = needs_review(sample)
        print(f"{flagged!s:<6} {why:<35} | {sample!r}")
|