import pdfplumber
import json
import os
from tqdm import tqdm

# --- CONFIGURATION ---
# CHANGE THESE FOR EACH NEW PDF YOU PROCESS
# PDF_PATH must be defined: extract_acronyms() and main() both read it, and
# leaving it commented out makes the script fail with a NameError.
PDF_PATH = r"C:\Users\646099\Documents\NMP-MOM_Glossary_and_Acrynoms.pdf"
# Page numbers are 1-based and inclusive (page 5 is the fifth page of the PDF).
ACRONYMS_START_PAGE = 5
ACRONYMS_END_PAGE = 10
# Set glossary pages to 0 if you want to skip them for now
GLOSSARY_START_PAGE = 0
GLOSSARY_END_PAGE = 0

# THIS FILE REMAINS CONSTANT - IT GROWS WITH EVERY RUN
MASTER_DB_FILE = "manual_data.json"

def load_master_db():
    """Load the master database from MASTER_DB_FILE, or start a new one.

    Returns:
        dict: The database, guaranteed to contain the "acronyms" and
        "glossary" sections (each a dict). Sections missing from an existing
        file are added empty so callers can index them directly.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        ValueError: If the file's top-level JSON value is not an object.
    """
    if os.path.exists(MASTER_DB_FILE):
        print(f"Loading existing Master DB: {MASTER_DB_FILE}...")
        with open(MASTER_DB_FILE, "r", encoding="utf-8") as f:
            master_db = json.load(f)
        if not isinstance(master_db, dict):
            raise ValueError(
                f"{MASTER_DB_FILE} must contain a JSON object at the top level"
            )
    else:
        print("No Master DB found. Creating a new one.")
        master_db = {}

    # A hand-edited or older file may lack a section; fill it in rather than
    # letting the caller fail later with a KeyError.
    for section in ("acronyms", "glossary"):
        master_db.setdefault(section, {})
    return master_db

def extract_acronyms(pdf, start_page, end_page):
    """Collect acronym/definition pairs from the tables on a range of pages.

    Args:
        pdf: An open pdfplumber PDF; only ``pdf.pages`` and each page's
            ``extract_tables()`` are used.
        start_page: First page to read (1-based). 0 skips extraction.
        end_page: Last page to read (1-based, inclusive). 0 skips extraction.
            Values past the end of the document are clamped to the last page.

    Returns:
        dict: Upper-cased acronym -> definition, taken from the first two
        columns of every table row. If an acronym appears more than once,
        the last occurrence wins.
    """
    print(f"--- Extracting Acronyms from {PDF_PATH} ---")
    acronym_data = {}

    if start_page == 0 or end_page == 0:
        return acronym_data

    # Clamp the range to the document so a too-large end page does not abort
    # the run with an IndexError after part of the work is already done.
    total_pages = len(pdf.pages)
    if end_page > total_pages:
        print(
            f"Warning: end page {end_page} exceeds page count {total_pages}; "
            f"stopping at page {total_pages}."
        )
        end_page = total_pages
    pages_to_process = range(max(start_page - 1, 0), end_page)

    header_acronyms = {"ACRONYM", "TERM"}

    for i in tqdm(pages_to_process, desc="Processing Acronym Pages"):
        page = pdf.pages[i]

        for table in page.extract_tables():
            for row in table:
                # Empty cells come back as None; normalise them to "".
                clean_row = [cell.strip() if cell else "" for cell in row]
                if len(clean_row) < 2:
                    continue

                # FORCE UPPERCASE KEY for better matching
                acronym = clean_row[0].upper()
                definition = clean_row[1]

                # Skip table header rows. The definition is compared
                # case-insensitively so "Definition" is caught as well.
                if acronym in header_acronyms or definition.upper() == "DEFINITION":
                    continue

                if acronym and definition:
                    acronym_data[acronym] = definition

    return acronym_data

def merge_data(master, new_data, section_name):
    """Merge new_data into master in place and return master.

    Strategy: a key that is not in master is added. For a key that already
    exists, the new definition is appended with " | " unless it is already
    one of the stored definitions.

    Args:
        master: dict of key -> definition string; mutated in place.
        new_data: dict of key -> definition string to merge in.
        section_name: Label used in the summary line printed at the end.

    Returns:
        dict: The same ``master`` object, after merging.
    """
    separator = " | "
    added_count = 0
    updated_count = 0

    for key, new_def in new_data.items():
        if key not in master:
            master[key] = new_def
            added_count += 1
            continue

        existing_def = master[key]
        # Compare against whole stored definitions, not substrings: "Air"
        # is a different definition from "Air Traffic Control" and must not
        # be dropped just because it occurs inside it.
        known_defs = existing_def.split(separator)
        if not new_def or new_def == existing_def or new_def in known_defs:
            continue

        # Append the new definition nicely
        master[key] = f"{existing_def}{separator}{new_def}"
        updated_count += 1

    print(f"[{section_name}] Added {added_count} new entries. Updated/Appended {updated_count} existing entries.")
    return master

def main():
    """Extract acronyms from PDF_PATH and merge them into the master DB file.

    Prints an error and returns without touching the database if the PDF
    cannot be found.
    """
    try:
        pdf = pdfplumber.open(PDF_PATH)
    except FileNotFoundError:
        print(f"Error: Could not find file {PDF_PATH}")
        return

    # The context manager closes the PDF's file handle even if loading the
    # database or extracting tables raises.
    with pdf:
        # 1. Load Master DB
        master_db = load_master_db()

        # 2. Extract New Data
        new_acronyms = extract_acronyms(pdf, ACRONYMS_START_PAGE, ACRONYMS_END_PAGE)

    # 3. Merge Acronyms
    master_db["acronyms"] = merge_data(master_db.get("acronyms", {}), new_acronyms, "Acronyms")

    # 4. Save Updated Master DB
    # Write to a temporary file and swap it in, so an interrupted write
    # cannot truncate the database that accumulates across runs.
    tmp_path = f"{MASTER_DB_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(master_db, f, indent=4)
    os.replace(tmp_path, MASTER_DB_FILE)

    print(f"Success! Master DB updated at {MASTER_DB_FILE}")
    print(f"Total Acronyms in Database: {len(master_db['acronyms'])}")

if __name__ == "__main__":
    main()