# rabukasim/tools/verify/verify_id_normalization.py
# trioskosmos's picture
# chore: remove large files for HF Space
# 9bd4ce5
import json
import os
from collections import defaultdict
# Look-alike characters that are folded onto the ASCII symbols used in card
# codes.  Written as escapes (not literals) so the table survives editors and
# pipelines that silently fold fullwidth forms to ASCII, which would turn the
# mapping into a no-op.
_CODE_CHAR_MAP = str.maketrans(
    {
        "\uff0b": "+",  # FULLWIDTH PLUS SIGN
        "\uff0d": "-",  # FULLWIDTH HYPHEN-MINUS
        "\u30fc": "-",  # KATAKANA-HIRAGANA PROLONGED SOUND MARK
    }
)


def normalize_code(code: str) -> str:
    """Return the canonical form of a card code for comparison.

    The code is stripped of surrounding whitespace, the fullwidth plus sign,
    fullwidth hyphen-minus and katakana prolonged sound mark are mapped to
    ASCII ``+`` / ``-``, and the result is uppercased.

    NOTE(review): this is meant to mirror the Rust/JS normalization used
    elsewhere in the project; that code is not visible here, so keep the
    character table in sync with it.

    Args:
        code: Raw card code. Any falsy value (``""``, ``None``) is accepted.

    Returns:
        The normalized code, or ``""`` when ``code`` is falsy.
    """
    if not code:
        return ""
    # One translate pass replaces the chained ``replace`` calls; the order
    # (strip -> map characters -> uppercase) is unchanged.
    return code.strip().translate(_CODE_CHAR_MAP).upper()
def run_audit():
    """Check the card IDs in ``data/cards.json`` for normalization problems.

    Loads the card mapping, compares every key with its entry's ``card_no``
    after normalization, looks for distinct keys that normalize to the same
    string, and writes the findings to ``reports/id_normalization_audit.md``.
    A short summary is printed to stdout.  When the input file is missing an
    error is printed and no report is written.
    """
    source_file = "data/cards.json"
    if not os.path.exists(source_file):
        print(f"Error: {source_file} not found.")
        return

    with open(source_file, "r", encoding="utf-8") as src:
        cards = json.load(src)

    total = len(cards)
    print(f"Auditing {total} card entries...")

    keys_by_norm = defaultdict(list)
    mismatches = []

    # Pass 1: group keys by their normalized form and record every entry
    # whose key and card_no disagree once both are normalized.
    for key, entry in cards.items():
        declared_no = entry.get("card_no", key)
        key_norm = normalize_code(key)
        no_norm = normalize_code(declared_no)
        keys_by_norm[key_norm].append(key)
        if key_norm == no_norm:
            continue
        mismatches.append(
            f"Mismatch: key '{key}' vs card_no '{declared_no}' (Normalized into '{key_norm}' and '{no_norm}')"
        )

    # Pass 2: a normalized form shared by more than one key is a collision.
    # Every such group is reported; nothing is filtered out.
    clashes = [(norm, keys) for norm, keys in keys_by_norm.items() if len(keys) > 1]

    # Pass 3: write the Markdown report.
    out_file = "reports/id_normalization_audit.md"
    os.makedirs("reports", exist_ok=True)
    with open(out_file, "w", encoding="utf-8") as out:
        out.write("# Card ID Normalization Audit Report\n\n")
        out.write(f"- **Total Cards Audited**: {total}\n")
        out.write("- **Normalization Logic**: `Uppercase + Trim + [+, -, ー] to [+, -, -]`\n\n")

        if mismatches or clashes:
            if mismatches:
                out.write("## Inconsistencies (Key vs Card No)\n")
                out.write(
                    "> [!WARNING]\n> These cards have a internal key that doesn't match their `card_no` when normalized.\n\n"
                )
                # Only the first 50 are listed; the rest are summarized.
                out.writelines(f"- {item}\n" for item in mismatches[:50])
                hidden = len(mismatches) - 50
                if hidden > 0:
                    out.write(f"\n... and {hidden} more.\n")

            if clashes:
                out.write("\n## Collisions\n")
                out.write(
                    "> [!CAUTION]\n> Multiple different IDs map to the same normalized string. This will cause matching ambiguity!\n\n"
                )
                out.write("| Normalized ID | Original IDs |\n")
                out.write("| :--- | :--- |\n")
                out.writelines(
                    f"| `{norm}` | `{', '.join(keys)}` |\n" for norm, keys in clashes
                )
        else:
            out.write("> [!NOTE]\n> **ALL CLEAR**: No collisions or inconsistencies found. Normalization is safe.\n")

    print(f"Audit complete. Report written to {out_file}")
    if clashes:
        print(f"WARNING: Found {len(clashes)} collisions!")
    if mismatches:
        print(f"WARNING: Found {len(mismatches)} inconsistencies!")
# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_audit()