| |
| |
| |
| |
| |
| |
| |
|
|
| import spacy, subprocess, sys, json |
| from pathlib import Path |
|
|
# Location of the trained pipeline loaded by the script entry point.
MODEL_PATH = "./models/civil_registry_model/model-best"


# Hand-written smoke-test cases, one per civil registry form.
#   form     - human-readable title printed in the report
#   text     - form text fed to the model
#   expected - entity labels the model should emit for that text
TEST = [
    {
        "form": "Form 102 — Birth Certificate",
        "text": (
            "Registry No.: 2024-001\n"
            "1. NAME (First): Ana (Middle): Garcia (Last): Reyes\n"
            "2. SEX: Female\n"
            "3. DATE OF BIRTH: August 21, 1995\n"
            "4. PLACE OF BIRTH: Pasig City\n"
            "7. MAIDEN NAME (First): Gloria (Middle): Santos (Last): Garcia\n"
            "8. CITIZENSHIP: Filipino\n"
            "14. NAME (First): Ramon (Middle): Cruz (Last): Reyes\n"
            "15. CITIZENSHIP: Filipino"
        ),
        "expected": [
            "F102_CHILD_FIRST", "F102_CHILD_MIDDLE", "F102_CHILD_LAST",
            "F102_SEX", "F102_DATE_OF_BIRTH", "F102_PLACE_OF_BIRTH",
            "F102_MOTHER_FIRST", "F102_FATHER_FIRST",
        ],
    },
    {
        "form": "Form 103 — Death Certificate",
        "text": (
            "1. NAME (First): Fernando (Middle): Santos (Last): Cruz\n"
            "2. SEX: Male\n"
            "4. AGE: 70\n"
            "5. PLACE OF DEATH: PGH Manila\n"
            "6. DATE OF DEATH: March 3, 2023\n"
            "Immediate cause: Renal Failure"
        ),
        "expected": [
            "F103_DECEASED_FIRST", "F103_DECEASED_MIDDLE", "F103_DECEASED_LAST",
            "F103_SEX", "F103_AGE", "F103_PLACE_OF_DEATH",
            "F103_DATE_OF_DEATH", "F103_CAUSE_IMMEDIATE",
        ],
    },
    {
        "form": "Form 97 — Marriage Certificate",
        "text": (
            "Registry No.: 2021-MC-088\n"
            "MC Date of Registration: December 15, 2021\n"
            "Husband (First): Miguel\n"
            "Husband (Middle): Santos\n"
            "Husband (Last): dela Cruz\n"
            "Husband Age: 30\n"
            "Husband Citizenship: Filipino\n"
            "Husband Father (First): Fernando\n"
            "Husband Father (Middle): Reyes\n"
            "Husband Father (Last): Mendoza\n"
            "Husband Father Citizenship: Filipino\n"
            "Husband Mother (First): Rosario\n"
            "Husband Mother (Middle): Lim\n"
            "Husband Mother (Last): Santos\n"
            "Husband Mother Citizenship: Filipino\n"
            "Wife (First): Sofia\n"
            "Wife (Middle): Tan\n"
            "Wife (Last): Lim\n"
            "Wife Age: 27\n"
            "Wife Citizenship: Filipino\n"
            "Wife Father (First): Antonio\n"
            "Wife Father (Middle): Cruz\n"
            "Wife Father (Last): Tan\n"
            "Wife Father Citizenship: Filipino\n"
            "Wife Mother (First): Shirley\n"
            "Wife Mother (Middle): Go\n"
            "Wife Mother (Last): Reyes\n"
            "Wife Mother Citizenship: Filipino\n"
            "MC Date of Marriage: December 12, 2021\n"
            "MC Place of Marriage: Taguig City"
        ),
        "expected": [
            "F97_REGISTRY_NO", "F97_DATE_OF_REGISTRATION",
            "F97_HUSBAND_FIRST", "F97_HUSBAND_MIDDLE", "F97_HUSBAND_LAST",
            "F97_HUSBAND_AGE", "F97_HUSBAND_CITIZENSHIP",
            "F97_HUSBAND_FATHER_FIRST", "F97_HUSBAND_MOTHER_FIRST",
            "F97_WIFE_FIRST", "F97_WIFE_MIDDLE", "F97_WIFE_LAST",
            "F97_WIFE_AGE", "F97_WIFE_CITIZENSHIP",
            "F97_WIFE_FATHER_FIRST", "F97_WIFE_MOTHER_FIRST",
            "F97_DATE_OF_MARRIAGE", "F97_PLACE_OF_MARRIAGE",
        ],
    },
    {
        "form": "Form 90 — Marriage License (Groom + Bride)",
        "text": (
            "Registry No.: 2024-BC-001\n"
            "ML Date of Registration: January 10, 2024\n"
            "GROOM\n"
            "Groom (First): Jose\n"
            "Groom (Middle): Santos\n"
            "Groom (Last): Ramos\n"
            "Groom Date of Birth: March 15, 1995\n"
            "Groom Age: 39\n"
            "Groom Place of Birth: Manila\n"
            "Groom Sex: Male\n"
            "Groom Citizenship: Filipino\n"
            "Groom Father (First): Pedro\n"
            "Groom Father (Middle): dela Cruz\n"
            "Groom Father (Last): Villanueva\n"
            "Groom Father Citizenship: Filipino\n"
            "Groom Mother (First): Lourdes\n"
            "Groom Mother (Middle): Reyes\n"
            "Groom Mother (Last): Bautista\n"
            "Groom Mother Citizenship: Filipino\n"
            "BRIDE\n"
            "Bride (First): Maria\n"
            "Bride (Middle): Garcia\n"
            "Bride (Last): Torres\n"
            "Bride Date of Birth: August 3, 1995\n"
            "Bride Age: 35\n"
            "Bride Place of Birth: Quezon City\n"
            "Bride Sex: Female\n"
            "Bride Citizenship: Filipino\n"
            "Bride Father (First): Eduardo\n"
            "Bride Father (Middle): Mendoza\n"
            "Bride Father (Last): Aquino\n"
            "Bride Father Citizenship: Filipino\n"
            "Bride Mother (First): Gloria\n"
            "Bride Mother (Middle): Santos\n"
            "Bride Mother (Last): Lopez\n"
            "Bride Mother Citizenship: Filipino"
        ),
        "expected": [
            "F90_REGISTRY_NO", "F90_DATE_OF_REGISTRATION",
            "F90_GROOM_FIRST", "F90_GROOM_LAST",
            "F90_GROOM_DATE_OF_BIRTH", "F90_GROOM_AGE",
            "F90_GROOM_PLACE_OF_BIRTH", "F90_GROOM_SEX", "F90_GROOM_CITIZENSHIP",
            "F90_GROOM_FATHER_FIRST", "F90_GROOM_MOTHER_FIRST",
            "F90_BRIDE_FIRST", "F90_BRIDE_LAST",
            "F90_BRIDE_DATE_OF_BIRTH", "F90_BRIDE_AGE",
            "F90_BRIDE_PLACE_OF_BIRTH", "F90_BRIDE_SEX", "F90_BRIDE_CITIZENSHIP",
            "F90_BRIDE_FATHER_FIRST", "F90_BRIDE_MOTHER_FIRST",
        ],
    },
]
|
|
|
|
def _percent(hits, total):
    """Return hits/total as a percentage, or 0.0 when total is zero."""
    return hits / total * 100 if total else 0.0


def _score_bar(pct):
    """Render a 0-100 percentage as a 20-cell bar (5 points per cell)."""
    filled = int(pct / 5)
    return "█" * filled + "░" * (20 - filled)


def _grade(pct):
    """Map a percentage to the GOOD / PARTIAL / POOR bucket used in the report."""
    if pct >= 70:
        return "GOOD"
    if pct >= 40:
        return "PARTIAL"
    return "POOR"


def visual_test(nlp, cases=None):
    """Print a per-form report of which expected entity labels the model finds.

    Each case's ``text`` is passed to ``nlp`` and the labels of the returned
    ``doc.ents`` are compared with the case's ``expected`` list.  Found
    labels, missing labels and up to five unexpected ("extra") labels are
    printed, followed by a score bar per form and an overall score.

    Args:
        nlp: Callable taking a string and returning an object whose ``ents``
            are items with ``label_`` and ``text`` attributes (the script
            passes the pipeline returned by ``spacy.load``).
        cases: Optional iterable of dicts with ``form``, ``text`` and
            ``expected`` keys.  Defaults to the module-level ``TEST`` list.

    Returns:
        Percentage (0-100) of expected labels found across all cases, or
        ``0.0`` when no labels are expected at all.
    """
    if cases is None:
        cases = TEST

    rule = "=" * 62
    print(rule)
    print("  VISUAL TEST — Does the model find the right labels?")
    print(rule)

    total_correct = 0
    total_expected = 0

    for case in cases:
        expected = case["expected"]
        expected_set = set(expected)

        doc = nlp(case["text"])
        # One entry per label: if a label occurs several times in the
        # document, the text of its last occurrence is the one reported.
        found = {ent.label_: ent.text for ent in doc.ents}
        extra = {
            label: text
            for label, text in found.items()
            if label not in expected_set
        }

        print(f"\n  {case['form']}")
        print(f"  {'─' * 56}")

        correct = 0
        for label in expected:
            if label in found:
                print(f"    ✅ {label:<35} = '{found[label]}'")
                correct += 1
            else:
                print(f"    ❌ {label:<35} ← NOT FOUND")

        if extra:
            print(f"  {'·' * 56}")
            # Only the first five extras are listed to keep the report short.
            for label, text in list(extra.items())[:5]:
                print(f"    ⚠️  {label:<35} = '{text}'  (extra)")

        # _percent() guards against a case with an empty "expected" list.
        pct = _percent(correct, len(expected))
        print(
            f"\n  [{_score_bar(pct)}] {pct:.0f}%  {_grade(pct)}"
            f"  ({correct}/{len(expected)})"
        )

        total_correct += correct
        total_expected += len(expected)

    overall = _percent(total_correct, total_expected)

    print(f"\n{rule}")
    print(
        f"  OVERALL: [{_score_bar(overall)}] {overall:.0f}%"
        f"  ({total_correct}/{total_expected})"
    )
    if overall >= 70:
        print("  Grade: ✅ GOOD — model is working well")
    elif overall >= 40:
        print("  Grade: ⚠️  PARTIAL — needs more training examples")
    else:
        print("  Grade: ❌ POOR — check training pipeline")
    print(rule)

    return overall
|
|
|
|
def _as_percent(scores, key):
    """Return ``scores[key]`` scaled to a percentage (missing/None counts as 0)."""
    # spaCy's evaluate JSON stores precision/recall/F as fractions in [0, 1].
    return (scores.get(key) or 0) * 100


def _print_per_type_report(per_type):
    """Print P/R/F per civil registry label plus the FORM_* average F-score."""
    # Labels prefixed "FORM_" are reported separately as FUNSD labels.
    civil = {k: v for k, v in per_type.items() if not k.startswith("FORM_")}
    funsd = {k: v for k, v in per_type.items() if k.startswith("FORM_")}

    if civil:
        print("\n  CIVIL REGISTRY LABELS (what matters):")
        print(f"  {'Label':<35} {'P':>6} {'R':>6} {'F':>6}")
        print(f"  {'─' * 57}")
        any_nonzero = False
        for label, scores in sorted(civil.items()):
            p = _as_percent(scores, "p")
            r = _as_percent(scores, "r")
            f = _as_percent(scores, "f")
            if f > 0:
                any_nonzero = True
            flag = "" if f > 0 else "  ← ❌ 0%"
            print(f"  {label:<35} {p:>6.1f} {r:>6.1f} {f:>6.1f}{flag}")

        if not any_nonzero:
            print("\n  ❌ ALL civil labels are 0% — Phase 2 fine-tuning needed")
            print("  → Run: python training/train.py  (two-phase training)")

    if funsd:
        avg_f = sum(_as_percent(v, "f") for v in funsd.values()) / len(funsd)
        print(f"\n  FUNSD LABELS (background learning): avg F={avg_f:.1f}%")


def spacy_eval(model_path):
    """Run ``spacy evaluate`` on data/training/dev.spacy and print per-label scores.

    The evaluation runs as a subprocess (``python -m spacy evaluate``) and
    writes its metrics to ``data/training/eval_results.json``; the
    ``ents_per_type`` section of that file is then printed as a table.

    Args:
        model_path: Path (or string) of the trained pipeline to evaluate.

    Returns:
        None.  The function only prints; it returns early when the dev file
        is missing, when the subprocess fails, or when the results file
        cannot be parsed.
    """
    dev = Path("data/training/dev.spacy")
    if not dev.exists():
        print("\n  ⚠️  dev.spacy not found — skipping spaCy eval")
        print("  → Run: python training/prepare_data.py")
        return

    print(f"\n{'=' * 62}")
    print("  spaCy OFFICIAL EVAL — civil registry labels only")
    print(f"  Dev file: {dev}  (civil registry, NOT merged)")
    print(f"{'=' * 62}\n")

    results_file = Path("data/training/eval_results.json")
    completed = subprocess.run([
        sys.executable, "-m", "spacy", "evaluate",
        str(model_path), str(dev),
        "--output", str(results_file),
    ])

    # Without this check a failed run would fall through and report a stale
    # results file left behind by an earlier, successful evaluation.
    if completed.returncode != 0:
        print(f"\n  ❌ spaCy evaluate failed (exit code {completed.returncode})")
        return

    if not results_file.exists():
        return

    try:
        data = json.loads(results_file.read_text(encoding="utf-8"))
    except json.JSONDecodeError as err:
        print(f"\n  ❌ Could not parse {results_file}: {err}")
        return

    # "ents_per_type" may be absent or null; treat both as "nothing to show".
    _print_per_type_report(data.get("ents_per_type") or {})
|
|
|
|
# Script entry point: load the trained pipeline, run the visual smoke test on
# the hand-written cases, then run the official spaCy evaluation.
if __name__ == "__main__":
    model_path = Path(MODEL_PATH)

    # Fail fast with a hint when the model has not been trained yet.
    if not model_path.exists():
        print(f"❌ Model not found: {model_path}")
        print("   Run: python training/train.py")
        sys.exit(1)

    print(f"\n  Loading model: {model_path}\n")
    nlp = spacy.load(str(model_path))

    visual_test(nlp)
    spacy_eval(model_path)

    # spacy_eval() skips the evaluation when dev.spacy is missing, so only
    # announce the results file when one is actually on disk.
    if Path("data/training/eval_results.json").exists():
        print("\n  Results saved → data/training/eval_results.json")
|
|