temsa's picture
Add rc6 release with decoder repair improvements
c487a4b verified
#!/usr/bin/env python3
import argparse
import random
import re
from typing import Iterator, Tuple, Dict, Any
# PPSN format: 7 digits + check letter + optional suffix letter.
# Check letter uses weighted sum of digits (8..2) plus suffix value (A=1..Z=26, W/blank=0) * 9.
# Mod 23, remainder 0 => W, remainder 1..22 => A..V.
# Publicly documented common suffixes are blank / W for legacy numbers, A or B for
# post-2013 individual numbers, and H for certain non-individual registrations.
# Whole-string pattern for an already-normalized PPSN: 7 digits, a check letter
# in A..W, and an optional single suffix letter.
# re.ASCII is required alongside re.IGNORECASE: without it, \d accepts any
# Unicode decimal digit and the case-insensitive [A-Z] class also matches a few
# non-ASCII letters (e.g. U+212A KELVIN SIGN), which would then be fed to the
# ord()-based checksum arithmetic as if they were real suffix letters.
COMPACT_RE = re.compile(r"^\d{7}[A-W][A-Z]?$", re.IGNORECASE | re.ASCII)

# Runs of characters treated as separators inside a PPSN: whitespace, hyphen,
# dot, slash and the no-break space.
SEPARATORS_RE = re.compile(r"[\s\-\./\u00A0]+")

# Free-text search pattern: 7 digits, then a check letter and an optional
# suffix letter, each of which may be preceded by separator characters.
CANDIDATE_RE = re.compile(
    r"\b\d{7}(?:[\s\-\./\u00A0]*[A-Wa-w])(?:[\s\-\./\u00A0]*[A-Za-z])?\b"
)

# Strict mode keeps the commonly documented suffixes seen in public PPSN guidance.
COMMON_SUFFIXES = ["", "W", "A", "B", "H"]

# Blank and "W" first, followed by every other letter A..Z (W is not repeated).
EXTENDED_PUBLIC_SUFFIXES = ["", "W"] + [
    chr(c) for c in range(ord("A"), ord("Z") + 1) if chr(c) != "W"
]

# Set form of COMMON_SUFFIXES for membership tests.
DEFAULT_SUFFIX_SET_STRICT = set(COMMON_SUFFIXES)
def normalize(value: str) -> str:
    """Return *value* in compact form.

    Surrounding whitespace is trimmed, the text is upper-cased, and every run
    of separator characters matched by ``SEPARATORS_RE`` is removed.
    """
    return SEPARATORS_RE.sub("", value.strip().upper())
def _suffix_value(letter: str) -> int:
if not letter or letter == "W":
return 0
return ord(letter) - ord("A") + 1
def checksum_letter(digits: str, suffix: str = "") -> str:
if len(digits) != 7 or not digits.isdigit():
raise ValueError("digits must be 7 numeric characters")
weights = [8, 7, 6, 5, 4, 3, 2]
total = sum(int(d) * w for d, w in zip(digits, weights))
total += _suffix_value(suffix) * 9
remainder = total % 23
if remainder == 0:
return "W"
return chr(ord("A") + remainder - 1)
def is_valid_ppsn(value: str, strict_suffix: bool = False) -> bool:
    """Report whether *value* is a well-formed PPSN with a correct check letter.

    The input is normalized first, so case and separator characters are
    ignored. With ``strict_suffix`` set, the suffix must additionally be one of
    ``DEFAULT_SUFFIX_SET_STRICT``.
    """
    candidate = normalize(value)
    if COMPACT_RE.match(candidate) is None:
        return False

    # Layout after a successful match: 7 digits, check letter, optional suffix.
    body, check, tail = candidate[:7], candidate[7], candidate[8:]
    if strict_suffix and tail not in DEFAULT_SUFFIX_SET_STRICT:
        return False
    return checksum_letter(body, tail) == check
def is_plausible_ppsn(value: str) -> bool:
    """Report whether *value* has the PPSN shape, without verifying the checksum."""
    compact = normalize(value)
    return bool(COMPACT_RE.match(compact))
def iter_ppsn_candidates(text: str) -> Iterator[Dict[str, Any]]:
    """Yield one record per ``CANDIDATE_RE`` match found in *text*.

    Each record holds the match offsets (``start``, ``end``), the matched
    ``text`` as it appears in the input, and its ``normalized`` compact form.
    Candidates are not checksum-validated here.
    """
    for match in CANDIDATE_RE.finditer(text):
        begin, stop = match.span()
        snippet = text[begin:stop]
        yield {
            "start": begin,
            "end": stop,
            "text": snippet,
            "normalized": normalize(snippet),
        }
def generate_ppsn(suffix_policy: str = "mixed") -> str:
    """Return a random PPSN with a correct check letter.

    ``suffix_policy`` selects the pool the suffix is drawn from:

    * ``"none"`` - always blank.
    * ``"legacy"`` - blank (weighted double) or ``W``.
    * ``"modern"`` - ``A`` (weighted double), ``B`` or ``H``.
    * ``"mixed"`` / ``"official_common"`` / ``"spec"`` - blank, ``A``, ``B``,
      ``H`` and ``W``, weighted towards blank and ``A``.
    * ``"broad"`` / ``"official_extended"`` - the common suffixes plus every
      entry of ``EXTENDED_PUBLIC_SUFFIXES``.

    Raises:
        ValueError: If ``suffix_policy`` is not one of the names above.
    """
    # Digits are drawn before the policy is examined, one randint per digit.
    seven = "".join([str(random.randint(0, 9)) for _ in range(7)])

    if suffix_policy == "none":
        # No random draw at all for this policy.
        tail = ""
    else:
        if suffix_policy == "legacy":
            pool = ["", "", "W"]
        elif suffix_policy == "modern":
            pool = ["A", "A", "B", "H"]
        elif suffix_policy in {"mixed", "official_common", "spec"}:
            pool = ["", "", "", "A", "A", "B", "H", "W"]
        elif suffix_policy in {"broad", "official_extended"}:
            pool = ["", "", "W", "A", "A", "B", "H"] + EXTENDED_PUBLIC_SUFFIXES
        else:
            raise ValueError("invalid suffix_policy")
        tail = random.choice(pool)

    return seven + checksum_letter(seven, tail) + tail
def corrupt_ppsn(value: str) -> str:
    """Create an invalid near-miss PPSN (wrong checksum or length).

    When the normalized input starts with seven digits and has at least one
    more character, the seventh digit is replaced by a different digit and the
    rest is kept. Otherwise a fixed-shape invalid string (six random digits
    followed by ``ZZ``) is returned.

    The result is guaranteed not to pass ``is_valid_ppsn``, even when *value*
    was itself invalid to begin with.
    """
    compact = normalize(value)
    if len(compact) >= 8 and compact[:7].isdigit():
        head, rest = compact[:6], compact[7:]
        last_digit = int(compact[6])
        # Shift the last digit by 1..9 (mod 10) so it always changes.
        offset = random.randint(1, 9)
        candidate = f"{head}{(last_digit + offset) % 10}{rest}"
        if is_valid_ppsn(candidate):
            # Only reachable when the input was already invalid and the change
            # happened to repair it. The seventh digit has weight 2 in a mod-23
            # sum, so at most one digit value can validate; any other
            # replacement digit is therefore guaranteed to be invalid.
            offset = offset % 9 + 1
            candidate = f"{head}{(last_digit + offset) % 10}{rest}"
        return candidate
    # fallback: random invalid
    return "".join(str(random.randint(0, 9)) for _ in range(6)) + "ZZ"
def main() -> None:
    """Command-line entry point.

    Sub-commands:

    * ``validate --value V [--strict-suffix]`` prints ``valid`` or ``invalid``.
    * ``generate [--count N] [--suffix-policy P]`` prints N generated PPSNs,
      one per line.
    """
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    v = sub.add_parser("validate")
    v.add_argument("--value", required=True)
    v.add_argument("--strict-suffix", action="store_true")

    g = sub.add_parser("generate")
    g.add_argument("--count", type=int, default=10)
    g.add_argument("--suffix-policy", default="mixed")

    args = parser.parse_args()
    if args.cmd == "validate":
        ok = is_valid_ppsn(args.value, strict_suffix=args.strict_suffix)
        print("valid" if ok else "invalid")
    elif args.cmd == "generate":
        try:
            for _ in range(args.count):
                print(generate_ppsn(args.suffix_policy))
        except ValueError as exc:
            # An unknown policy name is a usage error: report it through
            # argparse (usage text, exit status 2) instead of a traceback.
            parser.error(f"{exc}: {args.suffix_policy!r}")


if __name__ == "__main__":
    main()