|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| import spacy, subprocess, sys, json
|
| from pathlib import Path
|
|
|
| MODEL_PATH = "./models/civil_registry_model/model-best"
|
|
|
|
|
| TEST = [
|
| {
|
| "form": "Form 102 β Birth Certificate",
|
| "text": (
|
| "Registry No.: 2024-001\n"
|
| "1. NAME (First): Ana (Middle): Garcia (Last): Reyes\n"
|
| "2. SEX: Female\n"
|
| "3. DATE OF BIRTH: August 21, 1995\n"
|
| "4. PLACE OF BIRTH: Pasig City\n"
|
| "7. MAIDEN NAME (First): Gloria (Middle): Santos (Last): Garcia\n"
|
| "8. CITIZENSHIP: Filipino\n"
|
| "14. NAME (First): Ramon (Middle): Cruz (Last): Reyes\n"
|
| "15. CITIZENSHIP: Filipino"
|
| ),
|
| "expected": [
|
| "F102_CHILD_FIRST", "F102_CHILD_MIDDLE", "F102_CHILD_LAST",
|
| "F102_SEX", "F102_DATE_OF_BIRTH", "F102_PLACE_OF_BIRTH",
|
| "F102_MOTHER_FIRST", "F102_FATHER_FIRST",
|
| ],
|
| },
|
| {
|
| "form": "Form 103 β Death Certificate",
|
| "text": (
|
| "1. NAME (First): Fernando (Middle): Santos (Last): Cruz\n"
|
| "2. SEX: Male\n"
|
| "4. AGE: 70\n"
|
| "5. PLACE OF DEATH: PGH Manila\n"
|
| "6. DATE OF DEATH: March 3, 2023\n"
|
| "Immediate cause: Renal Failure"
|
| ),
|
| "expected": [
|
| "F103_DECEASED_FIRST", "F103_DECEASED_MIDDLE", "F103_DECEASED_LAST",
|
| "F103_SEX", "F103_AGE", "F103_PLACE_OF_DEATH",
|
| "F103_DATE_OF_DEATH", "F103_CAUSE_IMMEDIATE",
|
| ],
|
| },
|
| {
|
| "form": "Form 97 β Marriage Certificate",
|
| "text": (
|
| "Registry No.: 2021-MC-088\n"
|
| "MC Date of Registration: December 15, 2021\n"
|
| "Husband (First): Miguel\n"
|
| "Husband (Middle): Santos\n"
|
| "Husband (Last): dela Cruz\n"
|
| "Husband Age: 30\n"
|
| "Husband Citizenship: Filipino\n"
|
| "Husband Father (First): Fernando\n"
|
| "Husband Father (Middle): Reyes\n"
|
| "Husband Father (Last): Mendoza\n"
|
| "Husband Father Citizenship: Filipino\n"
|
| "Husband Mother (First): Rosario\n"
|
| "Husband Mother (Middle): Lim\n"
|
| "Husband Mother (Last): Santos\n"
|
| "Husband Mother Citizenship: Filipino\n"
|
| "Wife (First): Sofia\n"
|
| "Wife (Middle): Tan\n"
|
| "Wife (Last): Lim\n"
|
| "Wife Age: 27\n"
|
| "Wife Citizenship: Filipino\n"
|
| "Wife Father (First): Antonio\n"
|
| "Wife Father (Middle): Cruz\n"
|
| "Wife Father (Last): Tan\n"
|
| "Wife Father Citizenship: Filipino\n"
|
| "Wife Mother (First): Shirley\n"
|
| "Wife Mother (Middle): Go\n"
|
| "Wife Mother (Last): Reyes\n"
|
| "Wife Mother Citizenship: Filipino\n"
|
| "MC Date of Marriage: December 12, 2021\n"
|
| "MC Place of Marriage: Taguig City"
|
| ),
|
| "expected": [
|
| "F97_REGISTRY_NO", "F97_DATE_OF_REGISTRATION",
|
| "F97_HUSBAND_FIRST", "F97_HUSBAND_MIDDLE", "F97_HUSBAND_LAST",
|
| "F97_HUSBAND_AGE", "F97_HUSBAND_CITIZENSHIP",
|
| "F97_HUSBAND_FATHER_FIRST", "F97_HUSBAND_MOTHER_FIRST",
|
| "F97_WIFE_FIRST", "F97_WIFE_MIDDLE", "F97_WIFE_LAST",
|
| "F97_WIFE_AGE", "F97_WIFE_CITIZENSHIP",
|
| "F97_WIFE_FATHER_FIRST", "F97_WIFE_MOTHER_FIRST",
|
| "F97_DATE_OF_MARRIAGE", "F97_PLACE_OF_MARRIAGE",
|
| ],
|
| },
|
| {
|
| "form": "Form 90 β Marriage License (Groom + Bride)",
|
| "text": (
|
| "Registry No.: 2024-BC-001\n"
|
| "ML Date of Registration: January 10, 2024\n"
|
| "GROOM\n"
|
| "Groom (First): Jose\n"
|
| "Groom (Middle): Santos\n"
|
| "Groom (Last): Ramos\n"
|
| "Groom Date of Birth: March 15, 1995\n"
|
| "Groom Age: 39\n"
|
| "Groom Place of Birth: Manila\n"
|
| "Groom Sex: Male\n"
|
| "Groom Citizenship: Filipino\n"
|
| "Groom Father (First): Pedro\n"
|
| "Groom Father (Middle): dela Cruz\n"
|
| "Groom Father (Last): Villanueva\n"
|
| "Groom Father Citizenship: Filipino\n"
|
| "Groom Mother (First): Lourdes\n"
|
| "Groom Mother (Middle): Reyes\n"
|
| "Groom Mother (Last): Bautista\n"
|
| "Groom Mother Citizenship: Filipino\n"
|
| "BRIDE\n"
|
| "Bride (First): Maria\n"
|
| "Bride (Middle): Garcia\n"
|
| "Bride (Last): Torres\n"
|
| "Bride Date of Birth: August 3, 1995\n"
|
| "Bride Age: 35\n"
|
| "Bride Place of Birth: Quezon City\n"
|
| "Bride Sex: Female\n"
|
| "Bride Citizenship: Filipino\n"
|
| "Bride Father (First): Eduardo\n"
|
| "Bride Father (Middle): Mendoza\n"
|
| "Bride Father (Last): Aquino\n"
|
| "Bride Father Citizenship: Filipino\n"
|
| "Bride Mother (First): Gloria\n"
|
| "Bride Mother (Middle): Santos\n"
|
| "Bride Mother (Last): Lopez\n"
|
| "Bride Mother Citizenship: Filipino"
|
| ),
|
| "expected": [
|
| "F90_REGISTRY_NO", "F90_DATE_OF_REGISTRATION",
|
| "F90_GROOM_FIRST", "F90_GROOM_LAST",
|
| "F90_GROOM_DATE_OF_BIRTH", "F90_GROOM_AGE",
|
| "F90_GROOM_PLACE_OF_BIRTH", "F90_GROOM_SEX", "F90_GROOM_CITIZENSHIP",
|
| "F90_GROOM_FATHER_FIRST", "F90_GROOM_MOTHER_FIRST",
|
| "F90_BRIDE_FIRST", "F90_BRIDE_LAST",
|
| "F90_BRIDE_DATE_OF_BIRTH", "F90_BRIDE_AGE",
|
| "F90_BRIDE_PLACE_OF_BIRTH", "F90_BRIDE_SEX", "F90_BRIDE_CITIZENSHIP",
|
| "F90_BRIDE_FATHER_FIRST", "F90_BRIDE_MOTHER_FIRST",
|
| ],
|
| },
|
| ]
|
|
|
|
|
| def visual_test(nlp):
|
| print("=" * 62)
|
| print(" VISUAL TEST β Does the model find the right labels?")
|
| print("=" * 62)
|
|
|
| total_correct = 0
|
| total_expected = 0
|
|
|
| for case in TEST:
|
| doc = nlp(case["text"])
|
| found = {ent.label_: ent.text for ent in doc.ents}
|
| extra = {l: t for l, t in found.items()
|
| if l not in case["expected"]}
|
|
|
| print(f"\n {case['form']}")
|
| print(f" {'β'*56}")
|
|
|
| correct = 0
|
| for label in case["expected"]:
|
| if label in found:
|
| print(f" β
{label:<35} = '{found[label]}'")
|
| correct += 1
|
| else:
|
| print(f" β {label:<35} β NOT FOUND")
|
|
|
| if extra:
|
| print(f" {'Β·'*56}")
|
| for label, text in list(extra.items())[:5]:
|
| print(f" β οΈ {label:<35} = '{text}' (extra)")
|
|
|
| pct = correct / len(case["expected"]) * 100
|
| bar = "β" * int(pct / 5) + "β" * (20 - int(pct / 5))
|
| grade = "GOOD" if pct >= 70 else "PARTIAL" if pct >= 40 else "POOR"
|
| print(f"\n [{bar}] {pct:.0f}% {grade} ({correct}/{len(case['expected'])})")
|
|
|
| total_correct += correct
|
| total_expected += len(case["expected"])
|
|
|
| overall = total_correct / total_expected * 100
|
| bar = "β" * int(overall / 5) + "β" * (20 - int(overall / 5))
|
|
|
| print(f"\n{'=' * 62}")
|
| print(f" OVERALL: [{bar}] {overall:.0f}% ({total_correct}/{total_expected})")
|
| if overall >= 70:
|
| print(f" Grade: β
GOOD β model is working well")
|
| elif overall >= 40:
|
| print(f" Grade: β οΈ PARTIAL β needs more training examples")
|
| else:
|
| print(f" Grade: β POOR β check training pipeline")
|
| print(f"{'=' * 62}")
|
|
|
| return overall
|
|
|
|
|
| def spacy_eval(model_path):
|
| """Run official spaCy evaluate on civil-only dev.spacy."""
|
| dev = Path("data/training/dev.spacy")
|
| if not dev.exists():
|
| print(f"\n β οΈ dev.spacy not found β skipping spaCy eval")
|
| print(f" β Run: python training/prepare_data.py")
|
| return
|
|
|
| print(f"\n{'=' * 62}")
|
| print(f" spaCy OFFICIAL EVAL β civil registry labels only")
|
| print(f" Dev file: {dev} (civil registry, NOT merged)")
|
| print(f"{'=' * 62}\n")
|
|
|
| result = subprocess.run([
|
| sys.executable, "-m", "spacy", "evaluate",
|
| str(model_path), str(dev),
|
| "--output", "data/training/eval_results.json",
|
| ])
|
|
|
|
|
| results_file = Path("data/training/eval_results.json")
|
| if results_file.exists():
|
| data = json.loads(results_file.read_text())
|
| per_type = data.get("ents_per_type", {})
|
|
|
| civil = {k: v for k, v in per_type.items()
|
| if not k.startswith("FORM_")}
|
| funsd = {k: v for k, v in per_type.items()
|
| if k.startswith("FORM_")}
|
|
|
| if civil:
|
| print(f"\n CIVIL REGISTRY LABELS (what matters):")
|
| print(f" {'Label':<35} {'P':>6} {'R':>6} {'F':>6}")
|
| print(f" {'β'*57}")
|
| any_nonzero = False
|
| for label, scores in sorted(civil.items()):
|
| f = scores.get("f", 0)
|
| p = scores.get("p", 0)
|
| r = scores.get("r", 0)
|
| flag = "" if f > 0 else " β β 0%"
|
| if f > 0:
|
| any_nonzero = True
|
| print(f" {label:<35} {p:>6.1f} {r:>6.1f} {f:>6.1f}{flag}")
|
|
|
| if not any_nonzero:
|
| print(f"\n β ALL civil labels are 0% β Phase 2 fine-tuning needed")
|
| print(f" β Run: python training/train.py (two-phase training)")
|
|
|
| if funsd:
|
| avg_f = sum(v.get("f", 0) for v in funsd.values()) / len(funsd)
|
| print(f"\n FUNSD LABELS (background learning): avg F={avg_f:.1f}%")
|
|
|
|
|
| if __name__ == "__main__":
|
| model_path = Path(MODEL_PATH)
|
|
|
| if not model_path.exists():
|
| print(f"β Model not found: {model_path}")
|
| print(" Run: python training/train.py")
|
| sys.exit(1)
|
|
|
| print(f"\n Loading model: {model_path}\n")
|
| nlp = spacy.load(str(model_path))
|
|
|
| overall = visual_test(nlp)
|
| spacy_eval(model_path)
|
|
|
| print(f"\n Results saved β data/training/eval_results.json")
|
|
|