ocr / spacyNER /main.py
ifgr003's picture
Upload 124 files
4bfc055 verified
# ============================================================
# main.py
# ------------------------------------------------------------
# Entry point — run the full pipeline with sample form texts.
#
# How to run:
# python main.py
#
# For real scanned forms:
# from spacyNER.ocr import scan_form
# text = scan_form("path/to/form_102.jpg")
# ============================================================
from spacyNER.extractor import CivilRegistryNER
from spacyNER.autofill import AutoFillEngine
# Model identifier handed to CivilRegistryNER. "en_core_web_sm" is the name of
# spaCy's stock small English pipeline; swap in the fine-tuned path below once
# training has produced one.
MODEL_PATH = "en_core_web_sm"
# After fine-tuning:
# MODEL_PATH = "./models/civil_registry_model/model-best"
# Shared pipeline objects. print_form() further down reads `filler` as a
# module-level global, so these must exist before it is called.
extractor = CivilRegistryNER(model_path=MODEL_PATH)
filler = AutoFillEngine(extractor)
# Startup banner. The title uses a real em dash; the previous text was the
# mis-decoded byte sequence of that character.
print("=" * 65)
print(" CIVIL REGISTRY NER — AUTO-FILL PIPELINE")
print("=" * 65)
# ──────────────────────────────────────────────────────────
# CERTIFICATIONS PAGE — Form 102 (Certificate of Live Birth)
# ──────────────────────────────────────────────────────────
# Sample form text (stand-in for OCR output); passed to
# filler.fill_form_1a() in the pipeline run below.
FORM_102_OCR = """
Registry No.: 2024-001
Province: Metro Manila
City/Municipality: Makati City
1. NAME (First): Juan (Middle): dela Cruz (Last): Santos
2. SEX: Male
3. DATE OF BIRTH: March 15, 1990
4. PLACE OF BIRTH: Ospital ng Makati, Makati City, Metro Manila
5a. TYPE OF BIRTH: Single
5c. BIRTH ORDER: First
MOTHER:
7. MAIDEN NAME (First): Maria (Middle): Reyes (Last): dela Cruz
8. CITIZENSHIP: Filipino
9. RELIGION/RELIGIOUS SECT: Roman Catholic
13. RESIDENCE: 123 Rizal Street, Barangay San Antonio, Makati City
FATHER:
14. NAME (First): Pedro (Middle): Cruz (Last): Santos
15. CITIZENSHIP: Filipino
16. RELIGION/RELIGIOUS SECT: Roman Catholic
19. RESIDENCE: 123 Rizal Street, Barangay San Antonio, Makati City
MARRIAGE OF PARENTS:
20a. DATE: June 10, 1985
20b. PLACE: Manila City, Metro Manila, Philippines
"""
# ──────────────────────────────────────────────────────────
# CERTIFICATIONS PAGE — Form 103 (Certificate of Death)
# ──────────────────────────────────────────────────────────
# Sample form text (stand-in for OCR output); passed to
# filler.fill_form_2a() in the pipeline run below.
FORM_103_OCR = """
Registry No.: 2024-045
Province: Metro Manila
City/Municipality: Quezon City
1. NAME (First): Carlos (Middle): Reyes (Last): Mendoza
2. SEX: Male
3. RELIGION: Roman Catholic
4. AGE: 65
5. PLACE OF DEATH: Quezon City Medical Center, Quezon City, Metro Manila
6. DATE OF DEATH: September 2, 2020
7. CITIZENSHIP: Filipino
8. RESIDENCE: 45 Mabini Street, Quezon City, Metro Manila
9. CIVIL STATUS: Married
10. OCCUPATION: Retired Teacher
17. CAUSES OF DEATH:
Immediate cause: Cardiac Arrest
Antecedent cause: Hypertensive Cardiovascular Disease
Underlying cause: Hypertension
"""
# ──────────────────────────────────────────────────────────
# CERTIFICATIONS PAGE — Form 97 (Certificate of Marriage)
# ──────────────────────────────────────────────────────────
# Sample form text (stand-in for OCR output); passed to
# filler.fill_form_3a() in the pipeline run below. The HUSBAND and WIFE
# sections reuse the same item numbers.
FORM_97_OCR = """
Registry No.: 2024-088
Province: Metro Manila
City/Municipality: Makati City
HUSBAND:
1. NAME (First): Jose (Middle): Cruz (Last): Ramos
2a. DATE OF BIRTH: June 10, 1994
2b. AGE: 28
3. PLACE OF BIRTH: Manila, Metro Manila, Philippines
4a. SEX: Male
4b. CITIZENSHIP: Filipino
5. RESIDENCE: 123 Rizal Street, Makati City, Metro Manila
6. RELIGION/RELIGIOUS SECT: Roman Catholic
7. CIVIL STATUS: Single
8. NAME OF FATHER (First): Roberto (Middle): Santos (Last): Ramos
9. CITIZENSHIP: Filipino
10. NAME OF MOTHER (First): Conchita (Middle): Dela (Last): Rosa
11. CITIZENSHIP: Filipino
WIFE:
1. NAME (First): Elena (Middle): Bautista (Last): Torres
2a. DATE OF BIRTH: April 20, 1997
2b. AGE: 25
3. PLACE OF BIRTH: Cebu City, Cebu, Philippines
4a. SEX: Female
4b. CITIZENSHIP: Filipino
5. RESIDENCE: 456 Mabini Avenue, Cebu City, Cebu
6. RELIGION/RELIGIOUS SECT: Roman Catholic
7. CIVIL STATUS: Single
8. NAME OF FATHER (First): Ernesto (Middle): Lim (Last): Torres
9. CITIZENSHIP: Filipino
10. NAME OF MOTHER (First): Felicitas (Middle): Cruz (Last): Bautista
11. CITIZENSHIP: Filipino
15. PLACE OF MARRIAGE: Makati City Hall, Makati City, Metro Manila
16. DATE OF MARRIAGE: February 14, 2022
"""
# ──────────────────────────────────────────────────────────
# MARRIAGE LICENSE PAGE — Groom's Birth Certificate (Male/PSA)
# MNB classify_sex() → "GROOM" (SEX: Male)
# Uses F90_GROOM_* labels
# ──────────────────────────────────────────────────────────
# Sample form text (stand-in for OCR output); passed as the first
# argument to filler.fill_form_90() in the pipeline run below.
FORM_90_GROOM_OCR = """
Municipal Form No. 102 Certificate of Live Birth
PSA Registry No.: 2024-BC-001
Date of Registration: January 10, 2024
GROOM
Groom (First): Jose Groom (Middle): Santos Groom (Last): Ramos
Groom Date of Birth: March 15, 1995
Groom Age: 29
Groom Place of Birth: Manila
Groom Sex: Male
Groom Citizenship: Filipino
Groom Residence: 123 Rizal Street Makati City
Groom Father (First): Pedro Groom Father (Middle): dela Cruz Groom Father (Last): Villanueva
Groom Father Citizenship: Filipino
Groom Mother (First): Lourdes Groom Mother (Middle): Reyes Groom Mother (Last): Bautista
Groom Mother Citizenship: Filipino
"""
# ──────────────────────────────────────────────────────────
# MARRIAGE LICENSE PAGE — Bride's Birth Certificate (Female/NSO)
# MNB classify_sex() → "BRIDE" (SEX: Female)
# Uses F90_BRIDE_* labels
# ──────────────────────────────────────────────────────────
# Sample form text (stand-in for OCR output); passed as the second
# argument to filler.fill_form_90() in the pipeline run below.
FORM_90_BRIDE_OCR = """
Municipal Form No. 102 Certificate of Live Birth
NSO Registry No.: 2024-BC-002
Date of Registration: January 10, 2024
BRIDE
Bride (First): Maria Bride (Middle): Garcia Bride (Last): Torres
Bride Date of Birth: August 3, 1998
Bride Age: 26
Bride Place of Birth: Quezon City
Bride Sex: Female
Bride Citizenship: Filipino
Bride Residence: 456 Mabini Avenue Quezon City
Bride Father (First): Eduardo Bride Father (Middle): Mendoza Bride Father (Last): Aquino
Bride Father Citizenship: Filipino
Bride Mother (First): Gloria Bride Mother (Middle): Santos Bride Mother (Last): Lopez
Bride Mother Citizenship: Filipino
"""
# ──────────────────────────────────────────────────────────
# HELPER
# ──────────────────────────────────────────────────────────
def print_form(title: str, source: str, form_object) -> None:
    """Print the extracted fields of a filled form as an aligned listing.

    Args:
        title: Heading printed for the form.
        source: Description of the document the values were taken from.
        form_object: Filled form object. It is converted with the
            module-level ``filler.to_dict``; the result is expected to be
            a mapping of field name to value (it is iterated with
            ``.items()``).

    Returns:
        None. Everything is written to standard output.
    """
    result = filler.to_dict(form_object)
    rule = "═" * 65
    print(f"\n{rule}")
    print(f" 📋 {title}")
    print(f" Source: {source}")
    print(rule)
    # An empty result means nothing was extracted; print a hint and stop.
    if not result:
        print(" ⚠️ No fields extracted.")
        print(" → Add annotated training data and fine-tune the model.")
        return
    for field_name, value in result.items():
        # Turn snake_case field names into a readable, title-cased label.
        label = field_name.replace("_", " ").title()
        print(f" {label:<45} {value}")
# ──────────────────────────────────────────────────────────
# RUN PIPELINE
# ──────────────────────────────────────────────────────────
# ── Certifications Page ────────────────────────────────────
# Form 1A: birth certificate fields filled from the Form 102 sample text.
form_1a = filler.fill_form_1a(FORM_102_OCR)
print_form("FORM 1A — Birth Certificate", "Form 102 (Certificate of Live Birth)", form_1a)
print("\n ✏️ NAME ASSEMBLY:")
print(f" Name of Child → {form_1a.name_of_child!r}")
print(f" Name of Mother → {form_1a.name_of_mother!r}")
print(f" Name of Father → {form_1a.name_of_father!r}")

# Form 2A: death certificate fields filled from the Form 103 sample text.
form_2a = filler.fill_form_2a(FORM_103_OCR)
print_form("FORM 2A — Death Certificate", "Form 103 (Certificate of Death)", form_2a)
print("\n ✏️ NAME ASSEMBLY:")
print(f" Name of Deceased → {form_2a.name_of_deceased!r}")

# Form 3A: marriage certificate fields filled from the Form 97 sample text.
form_3a = filler.fill_form_3a(FORM_97_OCR)
print_form("FORM 3A — Marriage Certificate", "Form 97 (Certificate of Marriage)", form_3a)
print("\n ✏️ NAME ASSEMBLY:")
print(f" Husband Name → {form_3a.husband.name!r}")
print(f" Husband Father Name → {form_3a.husband.name_of_father!r}")
print(f" Husband Mother Name → {form_3a.husband.name_of_mother!r}")
print(f" Wife Name → {form_3a.wife.name!r}")
print(f" Wife Father Name → {form_3a.wife.name_of_father!r}")
print(f" Wife Mother Name → {form_3a.wife.name_of_mother!r}")
# ── Marriage License Page ──────────────────────────────────
# MNB has already classified:
# FORM_90_GROOM_OCR → classify_sex() → "GROOM" (Male)
# FORM_90_BRIDE_OCR → classify_sex() → "BRIDE" (Female)
# Form 90: one application built from the groom's and the bride's sample
# texts (groom first, bride second).
form_90 = filler.fill_form_90(FORM_90_GROOM_OCR, FORM_90_BRIDE_OCR)
print_form(
    "FORM 90 — Application for Marriage License",
    "Marriage License Page (Groom + Bride Birth Certs)",
    form_90,
)
print("\n ✏️ NAME ASSEMBLY:")
print(f" Groom Name → {form_90.groom.name_of_applicant!r}")
print(f" Groom Father Name → {form_90.groom.name_of_father!r}")
print(f" Groom Mother Name → {form_90.groom.maiden_name_of_mother!r}")
print(f" Bride Name → {form_90.bride.name_of_applicant!r}")
print(f" Bride Father Name → {form_90.bride.name_of_father!r}")
print(f" Bride Mother Name → {form_90.bride.maiden_name_of_mother!r}")
# Closing banner, a recap of how each page is routed, and the follow-up
# steps for fine-tuning. The check mark and arrows are real Unicode
# characters; the previous text was their mis-decoded byte sequences.
print("\n" + "=" * 65)
print(" ✅ Pipeline complete!")
print("=" * 65)
print()
print(" CERTIFICATIONS PAGE → MNB classifies form type → SpaCy NER")
print(" MARRIAGE LICENSE PAGE → MNB classify_sex() routes groom/bride → SpaCy NER")
print()
print(" NEXT STEPS:")
print(" 1. Add annotated examples → training/prepare_data.py")
print(" 2. Run: python training/prepare_data.py")
print(" 3. Run: python training/train.py")
print(" 4. Set MODEL_PATH = './models/civil_registry_model/model-best'")