| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | import json |
| | import os |
| | from collections import defaultdict |
| | |
| | from engine.parser_rules import parse_text_rules |
| | from engine.parser_ext import parse_text_extended |
| | |
| | |
# JSON file of gold test cases; learn_aliases() reads it and returns an
# error dict if the file does not exist.
GOLD_PATH = "training/gold_tests.json"
# JSON file of per-field alias maps; learn_aliases() loads it when present
# and rewrites it on every successful run.
ALIAS_PATH = "data/alias_maps.json"
| | |
| | |
def normalise(s):
    """Return ``s`` as a trimmed, lower-cased string.

    ``None`` is mapped to the empty string; any other value is converted
    with ``str()`` before surrounding whitespace is removed and the text is
    lower-cased.
    """
    text = "" if s is None else str(s)
    return text.strip().lower()
| | |
| | |
def learn_aliases():
    """
    Learn alias (synonym) mappings from the gold tests and persist them.

    Each gold test is run through ``parse_text_rules`` and
    ``parse_text_extended``; the extended parser's fields override the
    rule-based ones unless the extended value is ``"Unknown"``. Whenever
    the normalised parsed value of a field differs from the normalised
    expected value (and neither is empty or ``"unknown"``), the pair
    (parsed value, expected value) is counted.

    A parsed value that was wrong at least twice for a field becomes an
    alias of the expected value it was most often observed against. Ties
    are resolved in favour of the expected value seen first, so results
    are reproducible between runs.

    The alias map at ``ALIAS_PATH`` is loaded if present, updated in
    memory, and always written back (its parent directory is created when
    missing).

    Returns:
        ``{"error": ...}`` when the gold test file does not exist,
        otherwise a dict with ``"ok"``, ``"updated_aliases"`` (keys of the
        form ``"<field>:<wrong value>"`` mapped to the canonical value),
        ``"total_updates"`` and ``"alias_map_path"``.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold tests missing: {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    if os.path.exists(ALIAS_PATH):
        with open(ALIAS_PATH, "r", encoding="utf-8") as f:
            alias_maps = json.load(f)
    else:
        alias_maps = {}

    placeholders = ("", "unknown")

    # field -> wrong parsed value -> expected value -> number of occurrences.
    # Keeping the expected value alongside the wrong one lets each alias
    # point at the value it was actually confused with, instead of at the
    # most frequent expected value of the whole field.
    suggestions = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for test in gold:
        text = test.get("input", "")
        expected = test.get("expected", {})

        rules = parse_text_rules(text).get("parsed_fields", {})
        ext = parse_text_extended(text).get("parsed_fields", {})

        # Extended parser wins unless it has no answer for the field.
        merged = dict(rules)
        for key, value in ext.items():
            if value != "Unknown":
                merged[key] = value

        for field, exp_val in expected.items():
            exp_norm = normalise(exp_val)
            got_norm = normalise(merged.get(field, "Unknown"))

            if exp_norm == got_norm:
                continue
            # Nothing can be learnt when either side carries no real value.
            if exp_norm in placeholders or got_norm in placeholders:
                continue

            suggestions[field][got_norm][exp_norm] += 1

    alias_updates = {}

    for field, wrong_values in suggestions.items():
        for wrong_value, expected_counts in wrong_values.items():
            # Require at least two mismatches before trusting the pattern.
            if sum(expected_counts.values()) < 2:
                continue

            # max() over a dict returns the first maximal key in insertion
            # order, which makes tie-breaking deterministic.
            canonical = max(expected_counts, key=expected_counts.get)

            alias_maps.setdefault(field, {})[wrong_value] = canonical
            alias_updates[f"{field}:{wrong_value}"] = canonical

    # Make sure the target directory exists so a first run can create the file.
    alias_dir = os.path.dirname(ALIAS_PATH)
    if alias_dir:
        os.makedirs(alias_dir, exist_ok=True)

    with open(ALIAS_PATH, "w", encoding="utf-8") as f:
        json.dump(alias_maps, f, indent=2)

    return {
        "ok": True,
        "updated_aliases": alias_updates,
        "total_updates": len(alias_updates),
        "alias_map_path": ALIAS_PATH,
    }
| |
|