NMP-MOM_Vocab / etl_processor.py
NavyDevilDoc's picture
Update etl_processor.py
815f21f verified
import pdfplumber
import json
import os
from tqdm import tqdm
# --- CONFIGURATION ---
# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
# Path of the PDF to process. PDF_PATH must be defined: extract_acronyms() and
# main() both read it. Set the ETL_PDF_PATH environment variable to point at
# another file without editing this script, e.g.
#   r"C:\Users\646099\Documents\NMP-MOM_Glossary_and_Acrynoms.pdf"
PDF_PATH = os.environ.get("ETL_PDF_PATH", "NMP-MOM_Glossary_and_Acrynoms.pdf")

# Page numbers are 1-based and inclusive.
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10

# Set glossary pages to 0 if you want to skip them for now
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0

# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
MASTER_DB_FILE = "manual_data.json"
def load_master_db():
    """
    Loads the master database from MASTER_DB_FILE.

    Returns:
        dict: The parsed JSON content, or a fresh empty database when the
        file does not exist. The "acronyms" and "glossary" sections are
        always present in the result (missing ones are added as empty dicts).

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # Explicit UTF-8 so the result does not depend on the OS locale.
        with open(MASTER_DB_FILE, "r", encoding="utf-8") as f:
            print(f"Loading existing Master DB: {MASTER_DB_FILE}...")
            master_db = json.load(f)
    except FileNotFoundError:
        print("No Master DB found. Creating a new one.")
        master_db = {}

    # An older or hand-edited file may lack a section; callers index both.
    master_db.setdefault("acronyms", {})
    master_db.setdefault("glossary", {})
    return master_db
def extract_acronyms(pdf, start_page, end_page):
    """
    Collects acronym -> definition pairs from the tables of a page range.

    Args:
        pdf: An opened PDF object exposing ``pages``, where each page offers
            ``extract_tables()`` (as returned by ``pdfplumber.open``).
        start_page: First page to read, 1-based.
        end_page: Last page to read, 1-based and inclusive. Values beyond the
            end of the document are clamped to the last page.

    Returns:
        dict: Upper-cased acronym (first table column) mapped to its
        definition (second column). Empty when either bound is below 1.
        If an acronym occurs more than once, the last occurrence wins.
    """
    # Label the run from the PDF object itself rather than a module global.
    # NOTE(review): pdfplumber appears to keep the opened path on `pdf.path`;
    # a generic label is used when that attribute is absent or empty.
    source = getattr(pdf, "path", None) or "PDF"
    print(f"--- Extracting Acronyms from {source} ---")

    acronym_data = {}
    # 0 means "skip this section"; negative numbers would otherwise index
    # pages from the end of the document.
    if start_page < 1 or end_page < 1:
        return acronym_data

    page_count = len(pdf.pages)
    if end_page > page_count:
        print(f"Warning: end page {end_page} is beyond the last page ({page_count}); stopping there.")
    pages_to_process = range(start_page - 1, min(end_page, page_count))

    header_acronyms = {"ACRONYM", "TERM"}
    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        for table in pdf.pages[i].extract_tables():
            for row in table:
                if len(row) < 2:
                    continue
                # Empty cells come back as None; treat them as blank.
                # FORCE UPPERCASE KEY for better matching
                acronym = (row[0] or "").strip().upper()
                definition = (row[1] or "").strip()
                if not acronym or not definition:
                    continue
                # Skip table header rows, whatever their letter case.
                if acronym in header_acronyms or definition.upper() == "DEFINITION":
                    continue
                acronym_data[acronym] = definition
    return acronym_data
def merge_data(master, new_data, section_name):
    """
    Merges new_data into master.
    Strategy: If key exists and definition is different, append it.

    Args:
        master: dict of key -> definition string; updated in place.
        new_data: dict of key -> definition string to merge in.
        section_name: Label used in the summary line that is printed.

    Returns:
        dict: The same ``master`` object, after merging.
    """
    separator = " | "
    added_count = 0
    updated_count = 0
    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = master[key]
        # Compare against each stored variant exactly. A substring test would
        # drop e.g. "Program" because it occurs inside "Program Manager".
        known_variants = {part.strip() for part in existing_def.split(separator)}
        if new_def == existing_def or new_def in known_variants:
            continue

        # Append the new definition nicely
        master[key] = f"{existing_def}{separator}{new_def}"
        updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master
def main():
    """
    Runs one ETL pass: reads the acronym pages of PDF_PATH, merges them into
    the master database and writes the database back to MASTER_DB_FILE.

    Prints an error and returns without changes when the PDF is not found.
    """
    try:
        pdf = pdfplumber.open(PDF_PATH)
    except FileNotFoundError:
        print(f"Error: Could not find file {PDF_PATH}")
        return

    # The context manager closes the PDF even if loading or extraction fails.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()

        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms (tolerate a master DB that has no acronym section yet)
    master_db["acronyms"] = merge_data(master_db.get("acronyms", {}), new_acronyms, "Acronyms")

    # 4. Save Updated Master DB
    # Write to a sibling temp file first and swap it in, so a failure while
    # dumping cannot truncate the accumulated database.
    temp_file = f"{MASTER_DB_FILE}.tmp"
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)
    os.replace(temp_file, MASTER_DB_FILE)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")
# Run the ETL pass only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()