temsa's picture
Publish rc7 with spec-driven scanner release
32bcb86 verified
#!/usr/bin/env python3
import argparse
import random
import re
from typing import Iterator, Tuple, Dict, Any
# PPSN format: 7 digits + check letter + optional suffix letter.
# Check letter uses weighted sum of digits (8..2) plus suffix value (A=1..Z=26, W/blank=0) * 9.
# Mod 23, remainder 0 => W, remainder 1..22 => A..V.
# Publicly documented common suffixes are blank / W for legacy numbers, A or B for
# post-2013 individual numbers, and H for certain non-individual registrations.
# Runs of characters treated as visual separators in a written PPSN
# (whitespace, hyphen, dot, slash, no-break space).
SEPARATORS_RE = re.compile(r"[\s\-\./\u00A0]+")

# Strict mode keeps the commonly documented suffixes seen in public PPSN guidance.
COMMON_SUFFIXES = ["", "W", "A", "B", "H"]
# Blank and "W" first, then every other capital letter A..Z exactly once.
EXTENDED_PUBLIC_SUFFIXES = ["", "W"] + [
    letter for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ" if letter != "W"
]
DEFAULT_SUFFIX_SET_STRICT = {*COMMON_SUFFIXES}
def normalize(value: str) -> str:
    """Return *value* in compact form.

    Leading/trailing whitespace is trimmed, letters are upper-cased, and every
    run matched by ``SEPARATORS_RE`` is removed.
    """
    trimmed_upper = value.strip().upper()
    return SEPARATORS_RE.sub("", trimmed_upper)
def _is_ascii_digit(ch: str) -> bool:
return "0" <= ch <= "9"
def _is_ascii_letter(ch: str) -> bool:
ch = ch.upper()
return "A" <= ch <= "Z"
def _is_word_boundary(text: str, index: int) -> bool:
if index < 0 or index >= len(text):
return True
return not text[index].isalnum()
def _is_separator(ch: str) -> bool:
return ch in " -./\u00A0\t\r\n"
def _suffix_value(letter: str) -> int:
if not letter or letter == "W":
return 0
return ord(letter) - ord("A") + 1
def checksum_letter(digits: str, suffix: str = "") -> str:
    """Compute the PPSN check letter for seven digits and an optional suffix.

    The digits are weighted 8, 7, 6, 5, 4, 3, 2 from left to right and summed;
    nine times the suffix value (from ``_suffix_value``) is added. The total
    modulo 23 selects the letter: 0 gives ``W``, 1..22 give ``A``..``V``.

    Args:
        digits: Exactly seven ASCII digits.
        suffix: Optional suffix letter; empty string for none.

    Returns:
        The single upper-case check letter.

    Raises:
        ValueError: If *digits* is not exactly seven ASCII digits.
    """
    # str.isdigit() on its own also accepts non-ASCII digit characters
    # (e.g. Arabic-Indic numerals, superscript "²"); requiring ASCII keeps
    # those from being silently computed or failing inside int().
    if len(digits) != 7 or not (digits.isascii() and digits.isdigit()):
        raise ValueError("digits must be 7 numeric characters")
    weights = (8, 7, 6, 5, 4, 3, 2)
    total = sum(int(d) * w for d, w in zip(digits, weights))
    total += _suffix_value(suffix) * 9
    remainder = total % 23
    if remainder == 0:
        return "W"
    return chr(ord("A") + remainder - 1)
def is_valid_ppsn(value: str, strict_suffix: bool = False) -> bool:
    """Return True when *value* is a well-formed PPSN with a matching check letter.

    The value is normalized first (trimmed, upper-cased, separators removed).
    It must then be 8 or 9 characters: seven ASCII digits, a check letter in
    ``A``..``W``, and optionally one suffix letter.

    Args:
        value: Candidate PPSN, possibly containing separators or lower case.
        strict_suffix: When True, the suffix must additionally be a member of
            ``DEFAULT_SUFFIX_SET_STRICT``.

    Returns:
        True if the shape checks pass and the check letter equals the one
        computed by ``checksum_letter``; otherwise False.
    """
    compact = normalize(value)
    if len(compact) not in {8, 9}:
        return False

    digits, check_letter, suffix = compact[:7], compact[7], compact[8:]

    # ASCII-only digit test: bare str.isdigit() also accepts characters such
    # as "²" that int() cannot parse, which would make the checksum step raise
    # instead of this function returning False.
    if not (digits.isascii() and digits.isdigit()):
        return False
    if not ("A" <= check_letter <= "W"):
        return False
    if suffix and not _is_ascii_letter(suffix):
        return False
    if strict_suffix and suffix not in DEFAULT_SUFFIX_SET_STRICT:
        return False
    return checksum_letter(digits, suffix) == check_letter
def is_plausible_ppsn(value: str) -> bool:
    """Return True when *value* has the shape of a PPSN; the checksum is not verified.

    After normalization the value must be 8 or 9 characters long: seven ASCII
    digits, a letter in ``A``..``W``, and optionally one more ASCII letter.
    """
    compact = normalize(value)
    if len(compact) not in {8, 9}:
        return False
    digits = compact[:7]
    # Require ASCII digits; bare str.isdigit() would also accept non-ASCII
    # digit characters, which the candidate scanner never matches.
    if not (digits.isascii() and digits.isdigit()):
        return False
    if not ("A" <= compact[7] <= "W"):
        return False
    return len(compact) == 8 or _is_ascii_letter(compact[8])
def iter_ppsn_candidates(text: str) -> Iterator[Dict[str, Any]]:
    """Scan *text* left to right and yield spans that look like a PPSN.

    A candidate is: exactly seven ASCII digits that start on a word boundary,
    an optional run of separator characters, a check letter no greater than
    ``W`` (case-insensitive), and optionally one directly following suffix
    letter. The span must end on a word boundary. No checksum is verified.

    Yields:
        Dicts with the keys ``"start"`` and ``"end"`` (slice offsets into
        *text*), ``"text"`` (the raw matched slice) and ``"normalized"``
        (the slice passed through ``normalize``).
    """
    length = len(text)
    pos = 0
    while pos < length:
        # A candidate opens on a digit that is not glued to a preceding
        # alphanumeric character.
        opens_here = _is_ascii_digit(text[pos]) and _is_word_boundary(text, pos - 1)
        if not opens_here:
            pos += 1
            continue

        # Exactly seven digits must be available starting at pos.
        digit_run = text[pos:pos + 7]
        if len(digit_run) != 7 or not all(_is_ascii_digit(c) for c in digit_run):
            pos += 1
            continue

        # Skip any separators between the digits and the check letter.
        check_at = pos + 7
        while check_at < length and _is_separator(text[check_at]):
            check_at += 1

        has_check_letter = (
            check_at < length
            and _is_ascii_letter(text[check_at])
            and text[check_at].upper() <= "W"
        )
        if not has_check_letter:
            pos += 1
            continue

        # Take one more letter as the suffix only if the match would still
        # end on a word boundary afterwards.
        stop = check_at + 1
        if (
            stop < length
            and _is_ascii_letter(text[stop])
            and _is_word_boundary(text, stop + 1)
        ):
            stop += 1

        if not _is_word_boundary(text, stop):
            pos += 1
            continue

        matched = text[pos:stop]
        yield {
            "start": pos,
            "end": stop,
            "text": matched,
            "normalized": normalize(matched),
        }
        # Resume scanning right after the match.
        pos = stop
def generate_ppsn(suffix_policy: str = "mixed") -> str:
    """Build a random PPSN string whose check letter matches its digits and suffix.

    Seven random digits are drawn first, then a suffix is picked according to
    *suffix_policy*:

    * ``"none"`` - never a suffix
    * ``"legacy"`` - blank or ``W``
    * ``"modern"`` - ``A``, ``B`` or ``H``
    * ``"mixed"`` / ``"official_common"`` / ``"spec"`` - blank, ``A``, ``B``,
      ``H`` or ``W``
    * ``"broad"`` / ``"official_extended"`` - the common pool plus every entry
      of ``EXTENDED_PUBLIC_SUFFIXES``

    Repeated entries in a pool make that suffix proportionally more likely.

    Raises:
        ValueError: If *suffix_policy* is not one of the names above.
    """
    digits = "".join([str(random.randint(0, 9)) for _ in range(7)])

    common_pool = ["", "", "", "A", "A", "B", "H", "W"]
    fixed_pools = {
        "legacy": ["", "", "W"],
        "modern": ["A", "A", "B", "H"],
        "mixed": common_pool,
        "official_common": common_pool,
        "spec": common_pool,
    }

    if suffix_policy == "none":
        suffix = ""
    elif suffix_policy in fixed_pools:
        suffix = random.choice(fixed_pools[suffix_policy])
    elif suffix_policy in ("broad", "official_extended"):
        suffix = random.choice(
            ["", "", "W", "A", "A", "B", "H"] + EXTENDED_PUBLIC_SUFFIXES
        )
    else:
        raise ValueError("invalid suffix_policy")

    return digits + checksum_letter(digits, suffix) + suffix
def corrupt_ppsn(value: str) -> str:
    """Create an invalid near-miss of a PPSN.

    When the normalized value starts with seven ASCII digits and has at least
    one more character, the seventh digit is replaced by a different digit and
    everything else is kept. That digit carries weight 2 in
    ``checksum_letter``, so the weighted sum moves by a non-multiple of 23 and
    a check letter that matched before no longer matches.

    Otherwise a fallback of six random digits followed by ``"ZZ"`` is
    returned, which cannot be valid because its seventh character is not a
    digit.
    """
    compact = normalize(value)
    head = compact[:7]
    # ASCII-only test: bare str.isdigit() also accepts characters such as "²"
    # that int() below cannot parse, which would raise instead of falling back.
    if len(compact) >= 8 and head.isascii() and head.isdigit():
        chars = list(compact)
        # Adding 1..9 modulo 10 always lands on a different digit.
        chars[6] = str((int(chars[6]) + random.randint(1, 9)) % 10)
        return "".join(chars)
    # Fallback: random invalid value.
    return "".join(str(random.randint(0, 9)) for _ in range(6)) + "ZZ"
def main() -> None:
    """Command-line entry point with ``validate`` and ``generate`` subcommands.

    ``validate --value V [--strict-suffix]`` prints ``valid`` or ``invalid``.
    ``generate [--count N] [--suffix-policy P]`` prints N generated PPSNs, one
    per line. An unknown suffix policy is reported as a usage error through
    argparse instead of surfacing as an uncaught ``ValueError`` traceback.
    """
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    validate_cmd = sub.add_parser("validate")
    validate_cmd.add_argument("--value", required=True)
    validate_cmd.add_argument("--strict-suffix", action="store_true")

    generate_cmd = sub.add_parser("generate")
    generate_cmd.add_argument("--count", type=int, default=10)
    generate_cmd.add_argument("--suffix-policy", default="mixed")

    args = parser.parse_args()
    if args.cmd == "validate":
        ok = is_valid_ppsn(args.value, strict_suffix=args.strict_suffix)
        print("valid" if ok else "invalid")
    elif args.cmd == "generate":
        for _ in range(args.count):
            try:
                ppsn = generate_ppsn(args.suffix_policy)
            except ValueError as exc:
                # error() prints the usage line plus this message and exits.
                generate_cmd.error(f"{exc}: {args.suffix_policy!r}")
            print(ppsn)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()