Spaces:

NavyDevilDoc
/

NMP-MOM_Vocab

Sleeping

App Files Files Community

NMP-MOM_Vocab / etl_processor.py

NavyDevilDoc

Update etl_processor.py

815f21f verified 4 months ago

raw

history blame contribute delete

3.39 kB

	import pdfplumber
	import json
	import os
	from tqdm import tqdm

	# --- CONFIGURATION ---
	# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
	# PDF_PATH must be defined: extract_acronyms() and main() both read it.
	# Override it per run with the PDF_PATH environment variable, e.g. a local
	# path such as C:/Users/646099/Documents/NMP-MOM_Glossary_and_Acrynoms.pdf;
	# otherwise the PDF is looked up by name in the working directory.
	PDF_PATH = os.environ.get("PDF_PATH", "NMP-MOM_Glossary_and_Acrynoms.pdf")
	# Page numbers are 1-based and the range is inclusive
	# (extract_acronyms() iterates range(start_page - 1, end_page)).
	ACRONYMS_START_PAGE = 5
	ACRONYMS_END_PAGE = 10
	# Set glossary pages to 0 if you want to skip them for now
	# NOTE(review): no code visible in this file reads the glossary pages yet.
	GLOSSARY_START_PAGE = 0
	GLOSSARY_END_PAGE = 0

	# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
	MASTER_DB_FILE = "manual_data.json"

	def load_master_db():
	    """Load the master database from MASTER_DB_FILE, or start a new one.

	    Returns:
	        dict: the parsed JSON content when MASTER_DB_FILE exists, otherwise
	        a fresh ``{"acronyms": {}, "glossary": {}}``. In both cases the
	        "acronyms" and "glossary" keys are present in the result.
	    """
	    if not os.path.exists(MASTER_DB_FILE):
	        print("No Master DB found. Creating a new one.")
	        return {"acronyms": {}, "glossary": {}}

	    print(f"Loading existing Master DB: {MASTER_DB_FILE}...")
	    # Explicit encoding so the result does not depend on the platform's
	    # default text encoding.
	    with open(MASTER_DB_FILE, "r", encoding="utf-8") as f:
	        master_db = json.load(f)

	    # main() indexes master_db["acronyms"] directly, so an older or
	    # hand-edited file that lacks a section must not cause a KeyError.
	    for section in ("acronyms", "glossary"):
	        master_db.setdefault(section, {})
	    return master_db

	def extract_acronyms(pdf, start_page, end_page):
	    """Collect acronym -> definition pairs from the tables on a page range.

	    Args:
	        pdf: an opened PDF object exposing ``pages``; each page must provide
	            ``extract_tables()``.
	        start_page: first page to read, 1-based.
	        end_page: last page to read, 1-based and inclusive. Values past the
	            end of the document are clamped to the last page.

	    Returns:
	        dict mapping the upper-cased first cell of each table row to the
	        second cell. Returns an empty dict when either page number is 0.
	        Header rows and rows with an empty acronym or definition are
	        skipped; a later row with the same acronym overwrites an earlier one.
	    """
	    print(f"--- Extracting Acronyms from {PDF_PATH} ---")
	    acronym_data = {}

	    if start_page == 0 or end_page == 0:
	        return acronym_data

	    # Clamp to the real page count so a too-large END page (or a negative
	    # start) cannot raise IndexError or wrap around to the end of the file.
	    first_index = max(start_page - 1, 0)
	    last_index = min(end_page, len(pdf.pages))
	    pages_to_process = range(first_index, last_index)

	    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
	        for table in pdf.pages[i].extract_tables():
	            for row in table:
	                # Cells can be None (empty cell); normalise to stripped text.
	                clean_row = [cell.strip() if cell else "" for cell in row]
	                if len(clean_row) < 2:
	                    continue

	                # FORCE UPPERCASE KEY for better matching
	                acronym = clean_row[0].upper()
	                definition = clean_row[1]

	                # Skip header rows; compare the definition case-insensitively
	                # so "Definition" is recognised as well as "DEFINITION".
	                if acronym in ("ACRONYM", "TERM") or definition.upper() == "DEFINITION":
	                    continue

	                if acronym and definition:
	                    acronym_data[acronym] = definition

	    return acronym_data

	def merge_data(master, new_data, section_name):
	    """
	    Merges new_data into master (in place) and returns master.

	    Strategy: a key that is not in master is added as-is. If the key exists
	    and the new definition is not already one of the " | "-separated
	    definitions stored for it, the new definition is appended.

	    Args:
	        master: dict of key -> definition string; modified in place.
	        new_data: dict of key -> definition string to merge in.
	        section_name: label used only in the summary line that is printed.

	    Returns:
	        The same ``master`` dict that was passed in.
	    """
	    separator = " | "
	    added_count = 0
	    updated_count = 0

	    for key, new_def in new_data.items():
	        if key not in master:
	            master[key] = new_def
	            added_count += 1
	            continue

	        existing_def = master[key]
	        # Compare whole definitions, not substrings: "Alpha" must still be
	        # appended when the stored definition is "Alpha Bravo".
	        already_known = (
	            new_def == existing_def
	            or new_def in existing_def.split(separator)
	        )
	        if not already_known:
	            # Append the new definition nicely
	            master[key] = f"{existing_def}{separator}{new_def}"
	            updated_count += 1

	    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
	    return master

	def main():
	    """Run one ETL pass: read acronyms from PDF_PATH and merge them into the master DB.

	    Opens the PDF, loads the existing master database, extracts the acronym
	    tables from the configured page range, merges them into the "acronyms"
	    section and writes the result back to MASTER_DB_FILE. Prints an error
	    and returns early when the PDF file does not exist.
	    """
	    try:
	        pdf = pdfplumber.open(PDF_PATH)
	    except FileNotFoundError:
	        print(f"Error: Could not find file {PDF_PATH}")
	        return

	    # The context manager closes the PDF even if loading or extraction fails.
	    with pdf:
	        # 1. Load Master DB
	        master_db = load_master_db()

	        # 2. Extract New Data
	        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

	    # 3. Merge Acronyms (tolerate a DB file that has no "acronyms" section yet)
	    master_db["acronyms"] = merge_data(master_db.get("acronyms", {}), new_acronyms, "Acronyms")

	    # 4. Save Updated Master DB
	    with open(MASTER_DB_FILE, "w", encoding="utf-8") as f:
	        json.dump(master_db, f, indent=4)

	    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
	    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")

	# Run the ETL pass only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()