Spaces:

NavyDevilDoc
/

NMP-MOM_Vocab

Sleeping

File size: 3,391 Bytes

d9afdba
 
815f21f
d9afdba
 
 
815f21f
 
 
 
 
 
 
d9afdba
815f21f
 
 
 
 
 
 
 
 
 
 
d9afdba
 
815f21f
d9afdba
 
815f21f
 
 
d9afdba
 
 
 
 
 
 
 
 
 
 
815f21f
 
 
d9afdba
815f21f
d9afdba
 
 
 
 
 
 
815f21f
d9afdba
815f21f
 
d9afdba
815f21f
 
d9afdba
815f21f
 
 
 
 
 
 
 
 
 
 
d9afdba
815f21f
 
d9afdba
 
 
 
 
 
 
 
815f21f
 
 
 
 
d9afdba
815f21f
 
d9afdba
815f21f
 
 
d9afdba
815f21f
 
d9afdba

import pdfplumber
import json
import os
from tqdm import tqdm

# --- CONFIGURATION ---
# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
# Path to the PDF being imported (absolute, or relative to the working
# directory). It must be defined: extract_acronyms() and main() both read it.
PDF_PATH = "NMP-MOM_Glossary_and_Acrynoms.pdf"
# Page numbers are 1-based and inclusive (page 5 is the fifth page of the PDF).
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10
# Set glossary pages to 0 if you want to skip them for now
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0

# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
MASTER_DB_FILE = "manual_data.json"

def load_master_db():
    """Load the master database from MASTER_DB_FILE, or start a fresh one.

    Returns:
        dict: The parsed database. Both the "acronyms" and "glossary"
        sections are guaranteed to exist; a missing section is created as an
        empty dict.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(MASTER_DB_FILE):
        print("No Master DB found. Creating a new one.")
        return {"acronyms": {}, "glossary": {}}

    print(f"Loading existing Master DB: {MASTER_DB_FILE}...")
    # Explicit encoding so the result does not depend on the platform locale.
    with open(MASTER_DB_FILE, "r", encoding="utf-8") as f:
        master_db = json.load(f)

    # A hand-edited or partially written file may lack a section; fill it in
    # so callers can rely on both sections being present.
    for section in ("acronyms", "glossary"):
        master_db.setdefault(section, {})
    return master_db

def extract_acronyms(pdf, start_page, end_page):
    """Collect acronym/definition pairs from the tables on a range of pages.

    Args:
        pdf: An open PDF object exposing ``pages``, whose items provide
            ``extract_tables()`` (as returned by ``pdfplumber.open``).
        start_page: First page to scan, 1-based.
        end_page: Last page to scan, 1-based and inclusive. Values past the
            end of the document are clamped to the last page.

    Returns:
        dict: Upper-cased acronym -> definition. Empty when either page
        number is below 1 (the "skip this section" setting). If an acronym
        appears more than once in the range, the last occurrence wins.
    """
    print(f"--- Extracting Acronyms from {PDF_PATH} ---")
    acronym_data = {}

    # Page numbers are 1-based; 0 (or anything lower) means "skip".
    if start_page < 1 or end_page < 1:
        return acronym_data

    # Clamp to the real page count so an over-long range cannot raise
    # IndexError part-way through the run.
    total_pages = len(pdf.pages)
    last_page = min(end_page, total_pages)
    if last_page < end_page:
        print(
            f"Warning: end page {end_page} exceeds the PDF's {total_pages} "
            f"pages; stopping at page {last_page}."
        )

    pages_to_process = range(start_page - 1, last_page)

    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        page = pdf.pages[i]

        for table in page.extract_tables():
            for row in table:
                # Empty cells come back falsy (e.g. None); normalise to "".
                clean_row = [cell.strip() if cell else "" for cell in row]

                if len(clean_row) < 2:
                    continue

                # FORCE UPPERCASE KEY for better matching
                acronym = clean_row[0].upper()
                definition = clean_row[1]

                # Skip table header rows. Both columns are compared
                # case-insensitively so "Definition" is caught as well as
                # "DEFINITION".
                if acronym in ("ACRONYM", "TERM") or definition.upper() == "DEFINITION":
                    continue

                if acronym and definition:
                    acronym_data[acronym] = definition

    return acronym_data

def merge_data(master, new_data, section_name):
    """Merge new_data into master in place and return master.

    Strategy: a key that is not yet in master is added. For a key that
    already exists, the new definition is appended after " | " unless it is
    already one of the stored definitions. The comparison is against whole
    stored entries, not substrings, so "Naval" is still recorded when
    "Naval Air Station" is already present.

    Args:
        master: Existing section of the database (key -> definition string).
        new_data: Freshly extracted entries (key -> definition string).
        section_name: Label used in the printed summary line.

    Returns:
        dict: The same ``master`` object, updated.
    """
    separator = " | "
    added_count = 0
    updated_count = 0

    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = master[key]
        # Previously appended definitions are stored joined by the separator.
        known_defs = existing_def.split(separator)
        # The equality check covers a definition that itself contains " | ".
        if new_def != existing_def and new_def not in known_defs:
            master[key] = f"{existing_def}{separator}{new_def}"
            updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master

def main():
    """Extract acronyms from the configured PDF and merge them into the master DB.

    Opens PDF_PATH, loads the existing master database, extracts the acronym
    tables from the configured page range, merges them into the "acronyms"
    section, and writes the database back to MASTER_DB_FILE. Prints an error
    and returns without writing anything if the PDF cannot be found.
    """
    try:
        pdf = pdfplumber.open(PDF_PATH)
    except FileNotFoundError:
        print(f"Error: Could not find file {PDF_PATH}")
        return

    # Use the PDF as a context manager so its file handle is closed even if
    # loading or extraction raises.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()

        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms (tolerate a DB file that has no "acronyms" section yet)
    master_db["acronyms"] = merge_data(
        master_db.setdefault("acronyms", {}), new_acronyms, "Acronyms"
    )

    # 4. Save Updated Master DB
    with open(MASTER_DB_FILE, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")

# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()