File size: 4,641 Bytes
bc25b1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Inference-time gate for the relevance scorer.

PURPOSE
-------
Some procurement notices have descriptions like "Se konkurransegrunnlag" or are
empty entirely. The model can't classify what isn't there. Instead of returning
a (low-confidence) prediction, the pipeline returns "needs_review" so a human
can fetch the missing content from the linked documents/website.

USE
---
Same gate must be applied in:
  1. Training-data filtering — drop rows where needs_review() is True.
  2. Inference time         — skip the model call, return "needs_review".

This keeps training and serving aligned.

USAGE
-----
    from inference_rules import needs_review
    flag, reason = needs_review(kort_beskrivelse)
    if flag:
        return {"label": "needs_review", "reason": reason}
    # else: run the model
"""

import re

from langdetect import DetectorFactory, detect

# Pin langdetect's random seed. Per the langdetect README, detection is
# non-deterministic unless DetectorFactory.seed is set before the first
# detect() call; a fixed seed keeps the gate's verdict for a given text
# stable between runs.
DetectorFactory.seed = 0

# Descriptions with fewer characters than this are routed to needs_review.
MIN_LEN = 30

# langdetect codes the model is trusted to read as (near-)Norwegian Bokmål:
#   no - Norwegian, the obvious one.
#   da - Danish, mutually intelligible with Norwegian; nb-bert-base handles it.
#   sv - Swedish, close enough that langdetect often confuses it with Norwegian.
# Any other code is routed to human review (production assumption: the scraper
# has already translated foreign-language leads).
NORWEGIAN_READABLE = {"sv", "da", "no"}

# Placeholders that are listed both bare and with a trailing full stop.
_PLACEHOLDERS_WITH_DOT_VARIANT = (
    "se tittel",
    "tittelen sier vel alt",
    "se konkurransegrunnlag",
    "se vedlegg",
    "se dokumentene",
    "se nettside",
    "se utlysning",
    "se anbudsdokumenter",
    "rammeavtale",
)

# Placeholders that are listed in their bare form only.
_PLACEHOLDERS_BARE_ONLY = (
    "se dokumentene som ble sendt på mail",
    "se henvendelse på e-post",
    "se henvendelse på epost",
)

# Lowercased placeholder phrases: a description is a placeholder when, after
# strip + lower, it equals one of these.
PLACEHOLDER_PHRASES = {
    *_PLACEHOLDERS_WITH_DOT_VARIANT,
    *(f"{phrase}." for phrase in _PLACEHOLDERS_WITH_DOT_VARIANT),
    *_PLACEHOLDERS_BARE_ONLY,
}

# Placeholder patterns; needs_review() only treats a hit as a placeholder when
# the description is short, so a real sentence that merely mentions
# "se vedlegg" is not rejected.
# - The first pattern is anchored to the start of the text ("Se <something>").
#   Its alternatives are prefixes: "dokument" covers dokumenter / dokumentene /
#   dokumentasjon and "anbud" covers anbudsdokumenter. (The previous
#   "dokumenter" alternative could never match "se dokumentene ...".)
# - The remaining patterns match anywhere in the text.
# All patterns are case-insensitive.
PLACEHOLDER_PATTERNS = [
    re.compile(
        r"^se\s+(tittel|konkurransegrunnlag|vedlegg|dokument|nettside|utlysning|anbud|henvendelse)",
        re.IGNORECASE,
    ),
    re.compile(r"tittelen sier vel alt", re.IGNORECASE),
    re.compile(r"sjekk (dokumentene|vedlegg|nettsiden|websiden)", re.IGNORECASE),
    re.compile(r"check (the doc|website|attachment)", re.IGNORECASE),
    re.compile(r"read (website|the website|the doc)", re.IGNORECASE),
]


def needs_review(text):
    """Decide whether a notice description must bypass the model.

    Checks run from most to least specific, so the returned reason is the most
    informative one available to the human reviewer:

    1. ``None``, blank, or the literal "nan" (any case)   -> "empty"
    2. equals a placeholder phrase (trailing "." ignored) -> "placeholder_phrase"
    3. shorter than 80 characters and matches a pattern   -> "placeholder_match(...)"
    4. shorter than ``MIN_LEN`` characters                -> "too_short(len=N)"
    5. detected language not in ``NORWEGIAN_READABLE``    -> "non_norwegian(<code>)"

    Args:
        text: The raw description. Non-string values are converted with
            ``str()`` before any check.

    Returns:
        ``(True, reason)`` if the description should NOT be sent to the model,
        otherwise ``(False, "ok")``.
    """
    if text is None:
        return True, "empty"

    s = str(text).strip()
    # "nan" is what str() yields for a float NaN cell.
    if not s or s.lower() == "nan":
        return True, "empty"

    # Placeholder checks come BEFORE the length check. Almost every placeholder
    # is shorter than MIN_LEN, so testing the length first would label them all
    # "too_short" and the placeholder reasons would practically never be
    # reported. The boolean verdict is the same in either order.
    normalized = s.lower().rstrip(".").strip()
    if normalized in {phrase.rstrip(".") for phrase in PLACEHOLDER_PHRASES}:
        return True, "placeholder_phrase"

    # A pattern hit only counts when the description is short — a long
    # description that *mentions* "se vedlegg" inside a real sentence is fine.
    max_placeholder_len = 80
    if len(s) < max_placeholder_len:
        for pat in PLACEHOLDER_PATTERNS:
            if pat.search(s):
                return True, f"placeholder_match({pat.pattern[:30]})"

    if len(s) < MIN_LEN:
        return True, f"too_short(len={len(s)})"

    # Last check: language. Production assumption is that the scraper has
    # already translated foreign-language leads into Norwegian, so any other
    # language arriving here is unexpected and goes to human review.
    try:
        lang = detect(s[:500])
    except Exception:
        # Deliberate best-effort: if detection fails, assume Norwegian and do
        # not block the description.
        return False, "ok"

    if lang not in NORWEGIAN_READABLE:
        return True, f"non_norwegian({lang})"

    return False, "ok"


if __name__ == "__main__":
    # Manual smoke test: print the gate's verdict for a few representative inputs.
    samples = (
        "",
        None,
        "   ",
        "Se konkurransegrunnlag",
        "Se tittel.",
        "Tittelen sier vel alt.",
        "Rammeavtale",
        "Sjekk dokumentene",
        "Anskaffelse av samfunnsøkonomisk analyse for transportforskning innen evaluering.",
        "Short text",
        # Long description that merely opens with "Se vedlegg": expected ok.
        "Se vedlegg for full beskrivelse av kontraktens innhold inkludert alle leveranser.",
        # English: expected to be flagged.
        "TRANSQ is a joint qualification system for Scandinavian transport suppliers.",
        # Finnish: expected to be flagged.
        "Hilma on Suomen julkisten hankintojen ilmoituskanava ja keskitetty palvelu.",
    )
    for sample in samples:
        flagged, why = needs_review(sample)
        print(f"{flagged!s:<6} {why:<35} | {sample!r}")