| """ |
| Inference-time gate for the relevance scorer. |
| |
| PURPOSE |
| ------- |
| Some procurement notices have descriptions like "Se konkurransegrunnlag" or are |
| empty entirely. The model can't classify what isn't there. Instead of returning |
| a (low-confidence) prediction, the pipeline returns "needs_review" so a human |
| can fetch the missing content from the linked documents/website. |
| |
| USE |
| --- |
| Same gate must be applied in: |
| 1. Training-data filtering — drop rows where needs_review() is True. |
| 2. Inference time — skip the model call, return "needs_review". |
| |
| This keeps training and serving aligned. |
| |
| USAGE |
| ----- |
| from inference_rules import needs_review |
| flag, reason = needs_review(kort_beskrivelse) |
| if flag: |
| return {"label": "needs_review", "reason": reason} |
| # else: run the model |
| """ |
|
|
| import re |
|
|
| from langdetect import DetectorFactory, detect |
|
|
| DetectorFactory.seed = 0 |
|
|
| MIN_LEN = 30 |
|
|
| |
| |
| |
| |
| |
| NORWEGIAN_READABLE = {"no", "da", "sv"} |
|
|
| |
| PLACEHOLDER_PHRASES = { |
| "se tittel", |
| "se tittel.", |
| "tittelen sier vel alt", |
| "tittelen sier vel alt.", |
| "se konkurransegrunnlag", |
| "se konkurransegrunnlag.", |
| "se vedlegg", |
| "se vedlegg.", |
| "se dokumentene", |
| "se dokumentene.", |
| "se dokumentene som ble sendt på mail", |
| "se henvendelse på e-post", |
| "se henvendelse på epost", |
| "se nettside", |
| "se nettside.", |
| "se utlysning", |
| "se utlysning.", |
| "se anbudsdokumenter", |
| "se anbudsdokumenter.", |
| "rammeavtale", |
| "rammeavtale.", |
| } |
|
|
| |
| PLACEHOLDER_PATTERNS = [ |
| re.compile(r"^se\s+(tittel|konkurransegrunnlag|vedlegg|dokumenter|nettside|utlysning|anbud|henvendelse)", re.IGNORECASE), |
| re.compile(r"tittelen sier vel alt", re.IGNORECASE), |
| re.compile(r"sjekk (dokumentene|vedlegg|nettsiden|websiden)", re.IGNORECASE), |
| re.compile(r"check (the doc|website|attachment)", re.IGNORECASE), |
| re.compile(r"read (website|the website|the doc)", re.IGNORECASE), |
| ] |
|
|
|
|
# Placeholder regexes only flag descriptions shorter than this. Presumably a
# longer text that merely opens with e.g. "Se vedlegg for ..." still carries
# enough real content for the model.
_PLACEHOLDER_MATCH_MAX_LEN = 80


def needs_review(text):
    """Decide whether a description must go to a human instead of the model.

    Checks run from most to least specific so the reason names the most
    informative cause: empty -> known placeholder phrase -> placeholder
    pattern -> too short -> unreadable language.

    Args:
        text: Raw description. ``None``, blank strings and the literal
            ``"nan"`` (any casing) count as empty. Any other object is
            converted with ``str()`` first.

    Returns:
        A ``(flag, reason)`` tuple. ``flag`` is True when the model call
        should be skipped. ``reason`` is one of ``"empty"``,
        ``"placeholder_phrase"``, ``"placeholder_match(<regex prefix>)"``,
        ``"too_short(len=N)"``, ``"non_norwegian(<lang>)"``, or ``"ok"``
        when ``flag`` is False.
    """
    if text is None:
        return True, "empty"

    s = str(text).strip()
    if not s or s.lower() == "nan":
        return True, "empty"

    # The placeholder checks come before the length gate: most known
    # placeholder phrases are shorter than MIN_LEN, so the opposite order
    # would report nearly all of them as "too_short". The flag itself is
    # unaffected, because a too-short text is flagged either way.
    normalized = s.lower().rstrip(".").strip()
    if any(normalized == phrase.rstrip(".") for phrase in PLACEHOLDER_PHRASES):
        return True, "placeholder_phrase"

    if len(s) < _PLACEHOLDER_MATCH_MAX_LEN:
        for pat in PLACEHOLDER_PATTERNS:
            if pat.search(s):
                # Only a prefix of the regex is reported to keep the reason short.
                return True, f"placeholder_match({pat.pattern[:30]})"

    if len(s) < MIN_LEN:
        return True, f"too_short(len={len(s)})"

    # Language detection is best-effort and only looks at the first 500
    # characters. If the detector fails (langdetect raises when it finds no
    # usable features in the text), the description is deliberately let
    # through to the model rather than flagged.
    try:
        lang = detect(s[:500])
    except Exception:
        return False, "ok"

    if lang not in NORWEGIAN_READABLE:
        return True, f"non_norwegian({lang})"

    return False, "ok"
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: print the gate's verdict for a handful of
    # representative descriptions (empty, placeholder, short, real Norwegian,
    # English and Finnish text).
    samples = (
        "",
        None,
        "   ",
        "Se konkurransegrunnlag",
        "Se tittel.",
        "Tittelen sier vel alt.",
        "Rammeavtale",
        "Sjekk dokumentene",
        "Anskaffelse av samfunnsøkonomisk analyse for transportforskning innen evaluering.",
        "Short text",
        "Se vedlegg for full beskrivelse av kontraktens innhold inkludert alle leveranser.",
        "TRANSQ is a joint qualification system for Scandinavian transport suppliers.",
        "Hilma on Suomen julkisten hankintojen ilmoituskanava ja keskitetty palvelu.",
    )
    for t in samples:
        flag, reason = needs_review(t)
        # Columns: flag, reason (padded), then the repr of the input.
        print(f"{flag!s:<6} {reason:<35} | {t!r}")
|
|