import pdfplumber
import json
import os
from tqdm import tqdm
# --- CONFIGURATION ---
# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
# PDF_PATH must be defined: extract_acronyms() and main() both read it.
# Override it with the PDF_PATH environment variable, or edit the default
# below, e.g. r"C:\Users\646099\Documents\NMP-MOM_Glossary_and_Acrynoms.pdf".
PDF_PATH = os.environ.get("PDF_PATH") or "NMP-MOM_Glossary_and_Acrynoms.pdf"

# 1-based, inclusive page range that holds the acronym tables.
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10

# Set glossary pages to 0 if you want to skip them for now.
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0

# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
MASTER_DB_FILE = "manual_data.json"
def load_master_db(path=None):
    """Load the cumulative master database from disk, or start a new one.

    Args:
        path: Optional path of the JSON database to read. Defaults to the
            module-level ``MASTER_DB_FILE``.

    Returns:
        A dict that always contains the "acronyms" and "glossary" sections;
        sections missing from the file are added as empty dicts.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON. This
            is deliberately not swallowed, so a damaged database is never
            overwritten with an empty one by a later save.
        ValueError: If the top-level JSON value is not an object.
    """
    if path is None:
        path = MASTER_DB_FILE

    if not os.path.exists(path):
        print("No Master DB found. Creating a new one.")
        return {"acronyms": {}, "glossary": {}}

    print(f"Loading existing Master DB: {path}...")
    with open(path, "r", encoding="utf-8") as f:
        master_db = json.load(f)

    if not isinstance(master_db, dict):
        raise ValueError(
            f"Master DB {path} must contain a JSON object at the top level."
        )

    # Files that lack a section would otherwise cause a KeyError when the
    # caller indexes master_db["acronyms"] / master_db["glossary"].
    for section in ("acronyms", "glossary"):
        master_db.setdefault(section, {})
    return master_db
def extract_acronyms(pdf, start_page, end_page):
    """Collect acronym -> definition pairs from the tables on a page range.

    Args:
        pdf: An opened pdfplumber document; its ``pages`` are indexed and
            ``extract_tables()`` is called on each selected page.
        start_page: First page to scan (1-based, inclusive).
        end_page: Last page to scan (1-based, inclusive). Values past the end
            of the document are clamped to the last page.

    Returns:
        A dict mapping the upper-cased first column of each table row to the
        stripped second column. Empty when either page number is below 1.
        If the same acronym occurs more than once, the last row wins.
    """
    # Report the document actually being read rather than a module-level
    # constant; fall back to a generic label when no path is exposed.
    source = getattr(pdf, "path", None) or "PDF"
    print(f"--- Extracting Acronyms from {source} ---")

    acronym_data = {}
    if start_page < 1 or end_page < 1:
        return acronym_data

    # Clamp so a too-large end_page does not raise IndexError.
    last_page = min(end_page, len(pdf.pages))
    pages_to_process = range(start_page - 1, last_page)
    header_keys = {"ACRONYM", "TERM"}

    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        for table in pdf.pages[i].extract_tables():
            for row in table:
                if len(row) < 2:
                    continue
                # Cells may be None for empty table cells.
                # FORCE UPPERCASE KEY for better matching
                acronym = (row[0] or "").strip().upper()
                definition = (row[1] or "").strip()
                # Skip header rows; compare case-insensitively on both sides.
                if acronym in header_keys or definition.upper() == "DEFINITION":
                    continue
                if acronym and definition:
                    acronym_data[acronym] = definition
    return acronym_data
def merge_data(master, new_data, section_name):
    """Merge ``new_data`` into ``master`` in place and return ``master``.

    Strategy: a key that is not yet present is added. For a key that already
    exists, the new definition is appended as ``"<existing> | <new>"`` unless
    it is already stored, either as the whole value or as one of the
    ``" | "``-separated variants.

    Args:
        master: Existing section of the database (key -> definition string).
        new_data: Freshly extracted entries (key -> definition string).
        section_name: Label used only in the printed summary.

    Returns:
        The same ``master`` dict, after modification.
    """
    separator = " | "
    added_count = 0
    updated_count = 0

    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = str(master[key])
        # Compare against whole stored variants, not substrings: "Control"
        # is a different definition from "Quality Control" and must be kept.
        already_known = (
            new_def == existing_def
            or new_def in existing_def.split(separator)
        )
        if not already_known:
            master[key] = f"{existing_def}{separator}{new_def}"
            updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master
def main():
    """Extract acronyms from the configured PDF and fold them into the master DB.

    Opens ``PDF_PATH``, loads the existing database, extracts the acronym
    tables from the configured page range, merges them into the "acronyms"
    section and rewrites ``MASTER_DB_FILE``. Prints an error and returns
    without touching the database when the PDF cannot be found.
    """
    try:
        pdf = pdfplumber.open(PDF_PATH)
    except FileNotFoundError:
        print(f"Error: Could not find file {PDF_PATH}")
        return

    # The context manager closes the PDF even if loading or extraction fails.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()

        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms (tolerate a DB file that has no "acronyms" section yet)
    master_db["acronyms"] = merge_data(
        master_db.setdefault("acronyms", {}), new_acronyms, "Acronyms"
    )

    # 4. Save Updated Master DB
    with open(MASTER_DB_FILE, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()