"""Extract acronym tables from a PDF and merge them into a persistent JSON database.

Each run reads the configured page range of ``PDF_PATH``, collects
acronym/definition pairs from every table found on those pages, and merges
them into ``MASTER_DB_FILE`` so the database grows across runs.
"""

import json
import os

import pdfplumber
from tqdm import tqdm

# --- CONFIGURATION ---
# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
# PDF_PATH must be defined: extract_acronyms() and main() both read it.
PDF_PATH = r"C:\Users\646099\Documents\NMP-MOM_Glossary_and_Acrynoms.pdf"
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10
# Set glossary pages to 0 if you want to skip them for now
# (no code in this file consumes the glossary range yet).
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0

# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
MASTER_DB_FILE = "manual_data.json"

# Separator placed between alternative definitions of the same key.
_DEFINITION_SEPARATOR = " | "

# Table header cells (compared upper-cased) that mark a row as a header.
_HEADER_ACRONYMS = frozenset({"ACRONYM", "TERM"})
_HEADER_DEFINITIONS = frozenset({"DEFINITION"})


def load_master_db() -> dict:
    """Load the master database from ``MASTER_DB_FILE``.

    Returns:
        The parsed JSON object, or a fresh empty database when the file does
        not exist. The ``"acronyms"`` and ``"glossary"`` sections are always
        present in the result, even if the file on disk lacks one of them.
    """
    if os.path.exists(MASTER_DB_FILE):
        print(f"Loading existing Master DB: {MASTER_DB_FILE}...")
        with open(MASTER_DB_FILE, "r", encoding="utf-8") as f:
            master_db = json.load(f)
    else:
        print("No Master DB found. Creating a new one.")
        master_db = {}

    # Guarantee both sections so callers can index them unconditionally.
    master_db.setdefault("acronyms", {})
    master_db.setdefault("glossary", {})
    return master_db


def _parse_acronym_row(row):
    """Return ``(ACRONYM, definition)`` for a table row, or ``None`` to skip it.

    A row is skipped when it has fewer than two cells, when it is a header
    row, or when either of the first two cells is empty.
    """
    cells = [cell.strip() if cell else "" for cell in row]
    if len(cells) < 2:
        return None

    # FORCE UPPERCASE KEY for better matching
    acronym = cells[0].upper()
    definition = cells[1]

    # Header detection is case-insensitive in both columns.
    if acronym in _HEADER_ACRONYMS or definition.upper() in _HEADER_DEFINITIONS:
        return None
    if not acronym or not definition:
        return None
    return acronym, definition


def extract_acronyms(pdf, start_page: int, end_page: int) -> dict:
    """Collect acronym definitions from the tables on a range of pages.

    Args:
        pdf: An open pdfplumber document (anything exposing ``.pages`` whose
            items provide ``extract_tables()``).
        start_page: First page to read, 1-based and inclusive.
        end_page: Last page to read, 1-based and inclusive. Values beyond the
            end of the document are clamped to the last page.

    Returns:
        A dict mapping upper-cased acronyms to definitions. Empty when either
        page number is below 1 (extraction disabled). If an acronym occurs
        more than once, the last occurrence wins.
    """
    print(f"--- Extracting Acronyms from {PDF_PATH} ---")
    acronym_data = {}

    if start_page < 1 or end_page < 1:
        return acronym_data

    # Clamp to the document length so a too-large end page cannot raise
    # IndexError part-way through the run.
    page_count = len(pdf.pages)
    if end_page > page_count:
        print(f"Warning: PDF has only {page_count} pages; stopping at page {page_count}.")
        end_page = page_count

    pages_to_process = range(start_page - 1, end_page)
    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        for table in pdf.pages[i].extract_tables():
            for row in table:
                entry = _parse_acronym_row(row)
                if entry is not None:
                    acronym, definition = entry
                    acronym_data[acronym] = definition

    return acronym_data


def _has_definition(existing_def: str, new_def: str) -> bool:
    """Return True if ``new_def`` is already one of the merged definitions.

    The comparison is aligned to separator boundaries, so a definition that
    is merely a substring of a stored one (e.g. "Network" inside
    "Network Management") is not mistaken for a duplicate.
    """
    sep = _DEFINITION_SEPARATOR
    return f"{sep}{new_def}{sep}" in f"{sep}{existing_def}{sep}"


def merge_data(master: dict, new_data: dict, section_name: str) -> dict:
    """Merge ``new_data`` into ``master`` in place and return ``master``.

    Strategy: a new key is added as-is. If the key exists and the new
    definition is not already among its stored definitions, the new
    definition is appended after " | ".

    Args:
        master: Existing key -> definition mapping; modified in place.
        new_data: Freshly extracted key -> definition mapping.
        section_name: Label used in the summary message.

    Returns:
        The same ``master`` dict, after merging.
    """
    added_count = 0
    updated_count = 0

    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = master[key]
        # Only update if the definition is actually different
        if not _has_definition(existing_def, new_def):
            master[key] = f"{existing_def}{_DEFINITION_SEPARATOR}{new_def}"
            updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master


def main() -> None:
    """Run one extraction pass and persist the merged master database."""
    try:
        pdf = pdfplumber.open(PDF_PATH)
    except FileNotFoundError:
        print(f"Error: Could not find file {PDF_PATH}")
        return

    # The context manager closes the PDF even if loading or extraction fails.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()

        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms
    master_db["acronyms"] = merge_data(master_db["acronyms"], new_acronyms, "Acronyms")

    # 4. Save Updated Master DB
    with open(MASTER_DB_FILE, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")


if __name__ == "__main__":
    main()