| |
| import argparse |
| import random |
| import re |
| from typing import Iterator, Tuple, Dict, Any |
|
|
| |
| |
| |
| |
| |
|
|
# One or more separator characters that may appear inside a written PPSN:
# any whitespace, hyphen, dot, slash or the no-break space.
SEPARATORS_RE = re.compile(r"[\s\-\./\u00A0]+")


# Suffix values (the optional character after the check letter); the empty
# string stands for "no suffix".
COMMON_SUFFIXES = ["", "W", "A", "B", "H"]

# No suffix, then "W", then every other letter A-Z in alphabetical order.
EXTENDED_PUBLIC_SUFFIXES = ["", "W"]
EXTENDED_PUBLIC_SUFFIXES.extend(
    chr(code) for code in range(ord("A"), ord("Z") + 1) if code != ord("W")
)

# Suffixes accepted when validation is run with strict_suffix=True.
DEFAULT_SUFFIX_SET_STRICT = {*COMMON_SUFFIXES}
|
|
|
|
def normalize(value: str) -> str:
    """Return *value* in compact form.

    Leading and trailing whitespace is stripped, the text is upper-cased,
    and every run of separator characters matched by ``SEPARATORS_RE`` is
    removed.
    """
    return SEPARATORS_RE.sub("", value.strip().upper())
|
|
|
|
def _is_ascii_digit(ch: str) -> bool:
    """Return True when *ch* sorts within the range ``"0"``..``"9"``."""
    return not (ch < "0" or ch > "9")
|
|
|
|
def _is_ascii_letter(ch: str) -> bool:
    """Return True when *ch* is exactly one ASCII letter (A-Z or a-z).

    The ranges are compared directly instead of upper-casing first:
    ``str.upper()`` maps some non-ASCII characters onto ASCII text
    (``"ß"`` -> ``"SS"``, ``"ı"`` -> ``"I"``, ``"ſ"`` -> ``"S"``), which
    would make them pass an upper-case range test.
    """
    return len(ch) == 1 and ("A" <= ch <= "Z" or "a" <= ch <= "z")
|
|
|
|
def _is_word_boundary(text: str, index: int) -> bool:
    """Return True when position *index* does not hold an alphanumeric char.

    Positions outside the string (negative, or at/after ``len(text)``)
    count as boundaries.
    """
    if 0 <= index < len(text):
        return not text[index].isalnum()
    return True
|
|
|
|
def _is_separator(ch: str) -> bool:
    """Return True when *ch* is a single separator character.

    Separators are any whitespace character, hyphen, dot, slash and the
    no-break space -- the same character class that ``SEPARATORS_RE``
    removes, so scanning and normalisation agree on what may sit between
    the parts of a number.
    """
    # The length guard matters: ``"" in some_string`` is always True, so a
    # bare substring test would report the empty string as a separator.
    return len(ch) == 1 and (ch in "-./\u00A0" or ch.isspace())
|
|
|
|
def _suffix_value(letter: str) -> int:
    """Return the numeric value a suffix letter contributes to the checksum.

    An empty suffix and ``"W"`` are worth 0; any other letter is worth its
    1-based position in the alphabet (``"A"`` -> 1 ... ``"Z"`` -> 26).
    Lower-case letters are treated like their upper-case form.

    Raises:
        ValueError: if *letter* is non-empty and not a single ASCII letter.
    """
    if not letter:
        return 0
    # Compare ranges before upper-casing: str.upper() can turn non-ASCII
    # characters into ASCII text (e.g. "ß" -> "SS").
    if len(letter) != 1 or not ("A" <= letter <= "Z" or "a" <= letter <= "z"):
        raise ValueError("suffix must be empty or a single letter A-Z")
    upper = letter.upper()
    if upper == "W":
        return 0
    return ord(upper) - ord("A") + 1
|
|
|
|
def checksum_letter(digits: str, suffix: str = "") -> str:
    """Compute the check letter for a 7-digit PPSN body and optional suffix.

    The digits are weighted 8, 7, 6, 5, 4, 3, 2 from left to right and the
    suffix value (see ``_suffix_value``) is weighted 9.  The weighted sum
    modulo 23 selects the letter: remainder 0 -> ``"W"``, 1 -> ``"A"``,
    2 -> ``"B"`` and so on.

    Args:
        digits: exactly seven ASCII digits.
        suffix: the optional suffix letter, or ``""`` for none.

    Returns:
        The single upper-case check letter.

    Raises:
        ValueError: if *digits* is not exactly seven ASCII digits.
    """
    # str.isdigit() alone also accepts non-ASCII digits (Arabic-Indic
    # numerals, superscripts such as "²"); some of those are silently
    # converted by int() and others make it fail, so require ASCII as well.
    if len(digits) != 7 or not (digits.isascii() and digits.isdigit()):
        raise ValueError("digits must be 7 numeric characters")
    weights = (8, 7, 6, 5, 4, 3, 2)
    total = sum(int(d) * w for d, w in zip(digits, weights))
    total += _suffix_value(suffix) * 9
    remainder = total % 23
    if remainder == 0:
        return "W"
    return chr(ord("A") + remainder - 1)
|
|
|
|
def is_valid_ppsn(value: str, strict_suffix: bool = False) -> bool:
    """Return True when *value* is a structurally valid PPSN with a matching check letter.

    The value is normalised first (upper-cased, separators removed).  It
    must then consist of seven ASCII digits, a check letter in ``A``..``W``
    and an optional single suffix letter, and the check letter must equal
    ``checksum_letter(digits, suffix)``.

    Args:
        value: the text to validate; separators and case are ignored.
        strict_suffix: when True, the suffix must additionally be a member
            of ``DEFAULT_SUFFIX_SET_STRICT``.

    Returns:
        True if every check passes, otherwise False.  Malformed input never
        raises; it simply yields False.
    """
    compact = normalize(value)
    if len(compact) not in (8, 9):
        return False

    digits, check_letter, suffix = compact[:7], compact[7], compact[8:]

    # str.isdigit() alone accepts non-ASCII digits; those would either be
    # validated as if they were ASCII or make the checksum step raise, so
    # insist on ASCII here and keep this predicate exception-free.
    if not (digits.isascii() and digits.isdigit()):
        return False
    if not ("A" <= check_letter <= "W"):
        return False
    if suffix and not _is_ascii_letter(suffix):
        return False
    if strict_suffix and suffix not in DEFAULT_SUFFIX_SET_STRICT:
        return False
    return checksum_letter(digits, suffix) == check_letter
|
|
|
|
def is_plausible_ppsn(value: str) -> bool:
    """Return True when *value* has the shape of a PPSN, ignoring the checksum.

    After normalisation the value must be seven ASCII digits, a check
    letter in ``A``..``W`` and, optionally, one trailing ASCII letter.  The
    check letter is not verified against the digits.
    """
    compact = normalize(value)
    if len(compact) not in (8, 9):
        return False
    digits = compact[:7]
    # str.isdigit() alone would also accept non-ASCII digits such as
    # Arabic-Indic numerals or "²"; only ASCII digits are plausible here.
    if not (digits.isascii() and digits.isdigit()):
        return False
    if not ("A" <= compact[7] <= "W"):
        return False
    return len(compact) == 8 or _is_ascii_letter(compact[8])
|
|
|
|
def _candidate_end(text: str, start: int) -> int:
    """Return the exclusive end of a PPSN-shaped token at *start*, or -1."""
    size = len(text)

    # The token must open with a digit that is not glued to a preceding
    # alphanumeric character.
    if not (_is_ascii_digit(text[start]) and _is_word_boundary(text, start - 1)):
        return -1

    # Exactly seven digits are consumed; a longer digit run fails below
    # because the next character is then not a letter.
    digits_end = start + 7
    if digits_end > size:
        return -1
    if not all(_is_ascii_digit(c) for c in text[start:digits_end]):
        return -1

    # Optional separators between the digits and the check letter.
    cursor = digits_end
    while cursor < size and _is_separator(text[cursor]):
        cursor += 1
    if cursor >= size:
        return -1

    check = text[cursor]
    if not _is_ascii_letter(check) or check.upper() > "W":
        return -1

    # An optional suffix letter is taken only when it ends the word.
    stop = cursor + 1
    if stop < size and _is_ascii_letter(text[stop]) and _is_word_boundary(text, stop + 1):
        stop += 1

    return stop if _is_word_boundary(text, stop) else -1


def iter_ppsn_candidates(text: str) -> Iterator[Dict[str, Any]]:
    """Yield PPSN-shaped substrings of *text*, scanning left to right.

    Each yielded dict has the keys ``start`` and ``end`` (slice offsets
    into *text*), ``text`` (the raw matched slice) and ``normalized``
    (the slice passed through ``normalize``).  Matches never overlap:
    after a hit the scan resumes at its end offset.  Candidates are only
    shape-checked here; the check letter is not verified.
    """
    pos = 0
    size = len(text)
    while pos < size:
        stop = _candidate_end(text, pos)
        if stop < 0:
            pos += 1
            continue
        snippet = text[pos:stop]
        yield {
            "start": pos,
            "end": stop,
            "text": snippet,
            "normalized": normalize(snippet),
        }
        pos = stop
| i = end |
|
|
|
|
def generate_ppsn(suffix_policy: str = "mixed") -> str:
    """Return a random PPSN string with a correct check letter.

    Seven random digits are drawn first, then a suffix is chosen according
    to *suffix_policy*:

    * ``"none"`` -- never a suffix.
    * ``"legacy"`` -- no suffix or ``"W"``.
    * ``"modern"`` -- one of ``"A"``, ``"B"``, ``"H"``.
    * ``"mixed"`` / ``"official_common"`` / ``"spec"`` -- no suffix or one
      of ``"A"``, ``"B"``, ``"H"``, ``"W"``.
    * ``"broad"`` / ``"official_extended"`` -- the common pool plus every
      entry of ``EXTENDED_PUBLIC_SUFFIXES``.

    Repeated entries in a pool make that suffix proportionally more likely.

    Raises:
        ValueError: if *suffix_policy* is not one of the names above.
    """
    serial = "".join(str(random.randint(0, 9)) for _ in range(7))

    common_pool = ["", "", "", "A", "A", "B", "H", "W"]
    fixed_pools = {
        "legacy": ["", "", "W"],
        "modern": ["A", "A", "B", "H"],
        "mixed": common_pool,
        "official_common": common_pool,
        "spec": common_pool,
    }

    if suffix_policy == "none":
        tail = ""
    elif suffix_policy in fixed_pools:
        tail = random.choice(fixed_pools[suffix_policy])
    elif suffix_policy in ("broad", "official_extended"):
        tail = random.choice(["", "", "W", "A", "A", "B", "H"] + EXTENDED_PUBLIC_SUFFIXES)
    else:
        raise ValueError("invalid suffix_policy")

    return f"{serial}{checksum_letter(serial, tail)}{tail}"
|
|
|
|
def corrupt_ppsn(value: str) -> str:
    """Create an invalid near-miss PPSN (wrong checksum or length).

    When the normalised *value* starts with seven ASCII digits followed by
    at least one more character, its seventh digit is replaced by a
    different digit and everything else is kept.  Otherwise a fresh string
    of six random digits followed by ``"ZZ"`` is returned.
    """
    compact = normalize(value)
    head = compact[:7]
    # str.isdigit() alone also accepts characters such as "²" that int()
    # rejects, which would make the digit bump below raise ValueError.
    if len(compact) >= 8 and head.isascii() and head.isdigit():
        # Adding 1..9 modulo 10 always lands on a different digit.  That
        # digit has weight 2 in checksum_letter, so the weighted sum moves
        # by at most 18 in either direction -- never a multiple of 23 --
        # and the existing check letter can no longer match.
        bumped = (int(compact[6]) + random.randint(1, 9)) % 10
        return f"{compact[:6]}{bumped}{compact[7:]}"

    # Fallback: only six digits, so the seven-digit shape check fails.
    return "".join(str(random.randint(0, 9)) for _ in range(6)) + "ZZ"
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Sub-commands:

    * ``validate --value TEXT [--strict-suffix]`` prints ``valid`` or
      ``invalid`` for the given text.
    * ``generate [--count N] [--suffix-policy NAME]`` prints N generated
      numbers, one per line (default 10, policy ``mixed``).
    """
    # Listing the accepted policies lets argparse reject a typo with a
    # usage message instead of a ValueError traceback from generate_ppsn.
    suffix_policies = (
        "none",
        "legacy",
        "modern",
        "mixed",
        "official_common",
        "spec",
        "broad",
        "official_extended",
    )

    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    v = sub.add_parser("validate")
    v.add_argument("--value", required=True)
    v.add_argument("--strict-suffix", action="store_true")

    g = sub.add_parser("generate")
    g.add_argument("--count", type=int, default=10)
    g.add_argument("--suffix-policy", default="mixed", choices=suffix_policies)

    args = parser.parse_args()

    if args.cmd == "validate":
        ok = is_valid_ppsn(args.value, strict_suffix=args.strict_suffix)
        print("valid" if ok else "invalid")
    elif args.cmd == "generate":
        for _ in range(args.count):
            print(generate_ppsn(args.suffix_policy))


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|