# Spaces: Sleeping  (appears to be page-status text captured along with the script; not part of the program)
| import pdfplumber | |
| import json | |
| import os | |
| from tqdm import tqdm | |
# --- CONFIGURATION ---
# Change these for each new PDF you process.
# PDF_PATH must be defined: extract_acronyms() and main() both read it, and
# leaving it commented out makes the script fail with NameError.
PDF_PATH = r"C:\Users\646099\Documents\NMP-MOM_Glossary_and_Acrynoms.pdf"
# Page numbers are 1-based and the range is inclusive
# (extract_acronyms iterates range(start_page - 1, end_page)).
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10
# Set glossary pages to 0 if you want to skip them for now.
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0
# This file name stays constant across runs; the file grows with every run.
MASTER_DB_FILE = "manual_data.json"
def load_master_db(db_path=None):
    """Load the master database from disk, or start an empty one.

    Args:
        db_path: Path of the JSON database file. When omitted, the
            module-level ``MASTER_DB_FILE`` is used.

    Returns:
        A dict that always contains the "acronyms" and "glossary" keys.
        Sections missing from an existing file are added as empty dicts.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON. This
            is deliberately not swallowed: falling back to an empty database
            would let the next save overwrite the accumulated data.
        ValueError: If the file holds valid JSON that is not an object.
    """
    if db_path is None:
        db_path = MASTER_DB_FILE

    if os.path.exists(db_path):
        print(f"Loading existing Master DB: {db_path}...")
        with open(db_path, "r", encoding="utf-8") as f:
            master_db = json.load(f)
        if not isinstance(master_db, dict):
            raise ValueError(
                f"Master DB {db_path} must contain a JSON object, "
                f"got {type(master_db).__name__}"
            )
    else:
        print("No Master DB found. Creating a new one.")
        master_db = {}

    # main() indexes master_db["acronyms"] directly, so a file that lacks a
    # section must not surface later as a KeyError.
    master_db.setdefault("acronyms", {})
    master_db.setdefault("glossary", {})
    return master_db
def extract_acronyms(pdf, start_page, end_page):
    """Collect acronym/definition pairs from the tables on a range of pages.

    Args:
        pdf: An open PDF object exposing ``pages``, where each page provides
            ``extract_tables()`` (as returned by ``pdfplumber.open``).
        start_page: First page to read, 1-based. A value below 1 skips
            extraction.
        end_page: Last page to read, 1-based and inclusive. A value below 1
            skips extraction; a value past the end of the document is clamped
            to the last page.

    Returns:
        A dict mapping the upper-cased acronym (first column) to its
        definition (second column). Rows with fewer than two cells, header
        rows, and rows with an empty acronym or definition are ignored. A
        later row with the same acronym overrides an earlier one.
    """
    # Name the source from the PDF object when it exposes a path, instead of
    # relying on a module-level global that may not be defined.
    source = getattr(pdf, "path", None) or "PDF"
    print(f"--- Extracting Acronyms from {source} ---")

    acronym_data = {}
    if start_page < 1 or end_page < 1:
        return acronym_data

    # Clamp so a too-large end_page does not raise IndexError on pdf.pages.
    last_page = min(end_page, len(pdf.pages))
    pages_to_process = range(start_page - 1, last_page)

    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        for table in pdf.pages[i].extract_tables():
            for row in table:
                if len(row) < 2:
                    continue
                # Cells may be None for empty table cells.
                # Force an uppercase key for better matching.
                acronym = (row[0] or "").strip().upper()
                definition = (row[1] or "").strip()
                # Skip header rows; compare case-insensitively on both columns.
                if acronym in ("ACRONYM", "TERM") or definition.upper() == "DEFINITION":
                    continue
                if acronym and definition:
                    acronym_data[acronym] = definition
    return acronym_data
def merge_data(master, new_data, section_name):
    """Merge ``new_data`` into ``master`` and report what changed.

    Strategy: a key that is not yet in ``master`` is added. For a key that
    already exists, the new definition is appended after a " | " separator
    unless it is already recorded as one of the existing entries.

    Args:
        master: Dict of key -> definition string. It is modified in place.
        new_data: Dict of key -> definition string to merge in.
        section_name: Label used in the printed summary line.

    Returns:
        The same ``master`` dict, after merging.
    """
    separator = " | "
    added_count = 0
    updated_count = 0

    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = master[key]
        # Compare against whole separator-delimited entries. A plain substring
        # test would treat "Air" as already present in "Air Traffic Control"
        # and silently drop it. Padding both sides with the separator keeps
        # re-runs idempotent, including for definitions that themselves
        # contain " | ".
        already_recorded = (
            not new_def
            or f"{separator}{new_def}{separator}"
            in f"{separator}{existing_def}{separator}"
        )
        if not already_recorded:
            # Append the new definition after the existing ones.
            master[key] = f"{existing_def}{separator}{new_def}"
            updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master
def main():
    """Extract acronyms from the configured PDF and merge them into the master DB.

    Steps: open the PDF, load the master DB with ``load_master_db()``, extract
    the acronym pages with ``extract_acronyms()``, merge them with
    ``merge_data()``, and write the result back to ``MASTER_DB_FILE``.

    Returns:
        None. Prints an error and returns early when ``PDF_PATH`` is not set
        or the PDF file cannot be found.
    """
    # Look the path up defensively: an undefined PDF_PATH would otherwise raise
    # NameError, which the FileNotFoundError handler below does not catch.
    pdf_path = globals().get("PDF_PATH")
    if not pdf_path:
        print("Error: PDF_PATH is not set. Define it in the configuration section.")
        return

    # Keep the try body to the one call whose failure this message describes.
    try:
        pdf = pdfplumber.open(pdf_path)
    except FileNotFoundError:
        print(f"Error: Could not find file {pdf_path}")
        return

    # The context manager closes the PDF even if loading or extraction fails.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()
        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms (tolerate a DB that has no "acronyms" section yet)
    master_db["acronyms"] = merge_data(
        master_db.get("acronyms", {}), new_acronyms, "Acronyms"
    )

    # 4. Save Updated Master DB
    # Write to a temporary file and swap it in, so a failure part-way through
    # the dump cannot truncate the accumulated database.
    tmp_path = f"{MASTER_DB_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)
    os.replace(tmp_path, MASTER_DB_FILE)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")


if __name__ == "__main__":
    main()