# mastermap-cleaner / apply_blueprint.py
# Repository page header captured with this file:
#   andrewbejjani's picture / "Initial clean commit" / dc06d4c
import pandas as pd
import openpyxl
import os
import json
import argparse
from src.config import (
DEFAULT_BLUEPRINT_FILE,
DEFAULT_OFFICIAL_REFS_FILE,
DEFAULT_MANUAL_REFS_FILE,
DEFAULT_OUTPUT_SHEET_NAME,
resolve_data_path,
resolve_ref_path,
)
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains
def parse_args():
    """Parse the command-line options and resolve every file option to a path.

    Returns:
        The ``argparse`` namespace with ``input`` and ``blueprint`` passed
        through ``resolve_data_path`` and ``refs`` and ``manual_refs`` passed
        through ``resolve_ref_path`` (both imported from ``src.config``).
        ``sheet`` is returned exactly as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")

    # (flag, add_argument keyword options) -- declared in the order they are registered.
    option_table = (
        ("--input", {"required": True, "help": "Master Excel file name inside data/"}),
        ("--blueprint", {"default": DEFAULT_BLUEPRINT_FILE, "help": "Blueprint Excel file name inside data/"}),
        ("--refs", {"default": DEFAULT_OFFICIAL_REFS_FILE, "help": "Official references JSON file name inside refdata/"}),
        ("--manual_refs", {"default": DEFAULT_MANUAL_REFS_FILE, "help": "Manual overrides JSON file name inside refdata/"}),
        ("--sheet", {"default": DEFAULT_OUTPUT_SHEET_NAME, "help": "Workbook sheet where blueprint corrections should be applied"}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    namespace = cli.parse_args()

    # Replace each bare file name with its resolved path, keeping the original order.
    path_resolvers = (
        ("input", resolve_data_path),
        ("blueprint", resolve_data_path),
        ("refs", resolve_ref_path),
        ("manual_refs", resolve_ref_path),
    )
    for attribute, resolver in path_resolvers:
        setattr(namespace, attribute, resolver(getattr(namespace, attribute)))
    return namespace
def load_json_safe(filepath):
    """Load a JSON object from *filepath*, falling back to an empty dict.

    The file is read as ``utf-8-sig`` so a leading byte-order mark is ignored.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON object, or ``{}`` when the file cannot be
        opened, cannot be decoded/parsed, or does not contain a JSON object.
        Unexpected exception types are no longer swallowed.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except OSError:
        # Missing or unreadable file: treated as "nothing stored yet".
        return {}
    except ValueError as err:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(f"Warning: could not parse JSON in {filepath}: {err}")
        return {}

    if not isinstance(data, dict):
        # Callers use .get() and item assignment on the result, so only a
        # JSON object is usable; anything else is reported and ignored.
        print(f"Warning: expected a JSON object in {filepath}, got {type(data).__name__}.")
        return {}
    return data
def split_approved_parts(value):
    """Break a comma-separated cell value into its trimmed, non-empty pieces.

    Args:
        value: A cell value; anything ``pd.isna`` reports as missing yields
            an empty list. Other values are converted with ``str`` first.

    Returns:
        The pieces in their original order, each stripped of surrounding
        whitespace, with empty pieces dropped.
    """
    pieces = []
    if not pd.isna(value):
        for chunk in str(value).split(","):
            trimmed = chunk.strip()
            if trimmed:
                pieces.append(trimmed)
    return pieces
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-reference container for *column_name*, creating it if absent.

    Args:
        manual_refs: Mapping of column name to manual references; mutated in
            place when the column has no entry yet.
        official_refs: Mapping of column name to official references; only
            consulted to decide the shape of a newly created container.
        column_name: Column whose container is wanted.

    Returns:
        The existing container, or a new one stored under *column_name*:
        a dict when the official entry for that column is a dict, otherwise
        a list.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    # Mirror the official container type so both sides share one shape.
    official_is_mapping = isinstance(official_refs.get(column_name), dict)
    fresh_bucket = {} if official_is_mapping else []
    manual_refs[column_name] = fresh_bucket
    return fresh_bucket
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Record the pieces of an approved value that are not yet known for a column.

    The value is split with ``split_approved_parts``. A piece is skipped when
    ``ref_contains`` (from ``src.utils``) reports it in the official entry for
    the column or in the column's manual container.

    Args:
        manual_refs: Mapping of column name to manual references; mutated in place.
        official_refs: Mapping of column name to official references.
        column_name: Column the approved value belongs to.
        approved_value: Approved cell value, possibly comma-separated.

    Returns:
        The number of pieces newly stored in the manual container.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    stored = 0

    for candidate in split_approved_parts(approved_value):
        # Official references are checked first; the manual container only
        # when the official entry does not already have the piece.
        already_known = (
            ref_contains(official_refs.get(column_name, []), candidate)
            or ref_contains(bucket, candidate)
        )
        if already_known:
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            # Dict containers are keyed by the normalized form of the piece.
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Any other container type is left untouched and not counted.
            continue
        stored += 1

    return stored
def _text_or_empty(value):
    """Return *value* as a stripped string, or "" when pandas treats it as missing."""
    return str(value).strip() if pd.notna(value) else ""


def _build_header_index(sheet):
    """Map each non-empty, stripped header in row 1 of *sheet* to its 1-based column number."""
    header_index = {}
    for col in range(1, sheet.max_column + 1):
        header = sheet.cell(row=1, column=col).value
        if header:
            header_index[str(header).strip()] = col
    return header_index


def main():
    """Apply blueprint overrides to the master workbook and update manual references.

    Steps:
        1. Validate that the blueprint and master workbook exist and that the
           blueprint has every column read below.
        2. For each blueprint row with a human override, write the override
           into the matching cell of the target sheet.
        3. For human overrides and for AI suggestions whose confidence is not
           ``LOW``, remember the approved value in the manual references
           (the ``Degree`` column is never remembered).
        4. Prune the manual references against the official ones, then save
           the workbook in place and rewrite the manual references JSON file.

    Raises:
        SystemExit: With status 1 when an input file, required blueprint
            column, or the target sheet is missing.
    """
    args = parse_args()
    print("Loading Master Data, Blueprint, and Memory Files...")

    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # SystemExit is raised directly: the bare exit() helper comes from the
        # site module and would report success (status 0) on this error path.
        raise SystemExit(1)
    if not os.path.exists(args.input):
        print(f"Error: No master workbook found at {args.input}.")
        raise SystemExit(1)

    bp_df = pd.read_excel(args.blueprint)
    # "Confidence" is read for every row below, so it is validated up front
    # instead of failing with a KeyError in the middle of the run.
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
        "Confidence",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)

    # Load the target Excel workbook
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)
    sheet = wb[args.sheet]
    col_name_to_idx = _build_header_index(sheet)

    # Load the memory dictionaries using the synced CLI path
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0
    print("Applying manual overrides and updating memory...")

    for _, row in bp_df.iterrows():
        human_val = _text_or_empty(row['Human_Override'])
        # A human override wins; otherwise fall back to the AI suggestion.
        approved_val = human_val or _text_or_empty(row['AI_Suggested_Match'])
        confidence = _text_or_empty(row['Confidence']).upper()

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        raw_col = _text_or_empty(row['Column'])
        if not raw_col:
            # A blank Column cell would otherwise become the literal "nan"
            # and could create a bogus bucket in the manual references.
            continue

        if human_val:
            # 1. Update the Excel File
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError, OverflowError):
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue
            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue
            # Row 1 holds the headers used to build col_name_to_idx, so an
            # override must never be written there.
            if excel_row < 2 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside the data rows of {args.sheet}.")
                continue
            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # 2. Update Manual References for human overrides and accepted AI suggestions.
        if raw_col == "Degree":
            continue
        if not human_val and confidence == "LOW":
            continue
        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    # The return value is reported below as the number of pruned entries.
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Save Excel
    wb.save(args.input)

    # Save JSONs
    # Make sure the target directory exists before dumping
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")


if __name__ == "__main__":
    main()