"""Apply reviewed Blueprint overrides to a workbook sheet and update manual reference memory."""

import argparse
import json
import os
import sys

import openpyxl
import pandas as pd

from src.config import (
    DEFAULT_BLUEPRINT_FILE,
    DEFAULT_OFFICIAL_REFS_FILE,
    DEFAULT_MANUAL_REFS_FILE,
    DEFAULT_OUTPUT_SHEET_NAME,
    resolve_data_path,
    resolve_ref_path,
)
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains


def parse_args() -> argparse.Namespace:
    """Parse workbook, Blueprint, and reference paths for the apply step.

    Returns:
        The parsed namespace, with ``input`` and ``blueprint`` passed through
        ``resolve_data_path`` and ``refs`` / ``manual_refs`` passed through
        ``resolve_ref_path``.
    """
    parser = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    parser.add_argument(
        "--input",
        required=True,
        help="Master Excel file name inside data/",
    )
    parser.add_argument(
        "--blueprint",
        default=DEFAULT_BLUEPRINT_FILE,
        help="Blueprint Excel file name inside data/",
    )
    parser.add_argument(
        "--refs",
        default=DEFAULT_OFFICIAL_REFS_FILE,
        help="Official references JSON file name inside refdata/",
    )
    parser.add_argument(
        "--manual_refs",
        default=DEFAULT_MANUAL_REFS_FILE,
        help="Manual overrides JSON file name inside refdata/",
    )
    parser.add_argument(
        "--sheet",
        default=DEFAULT_OUTPUT_SHEET_NAME,
        help="Workbook sheet where blueprint corrections should be applied",
    )
    args = parser.parse_args()
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)
    return args


def load_json_safe(filepath) -> dict:
    """Load a JSON memory file, falling back to an empty dict if absent/corrupt.

    A missing file is an expected state and is silent. A file that exists but
    cannot be read, cannot be parsed, or does not hold a JSON object still
    falls back to ``{}`` but emits a warning on stderr, because the caller
    later rewrites the manual-refs file and the previous content would
    otherwise be lost without any trace.

    Args:
        filepath: Path of the JSON file (read as UTF-8, BOM tolerated).

    Returns:
        The decoded JSON object, or ``{}`` on any failure.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as err:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"Warning: could not load {filepath} ({err}); using empty data.",
            file=sys.stderr,
        )
        return {}
    if not isinstance(data, dict):
        # Callers use .get() and item assignment, so only a JSON object is usable.
        print(
            f"Warning: {filepath} does not contain a JSON object; using empty data.",
            file=sys.stderr,
        )
        return {}
    return data


def split_approved_parts(value) -> list:
    """Split a comma-separated approval into individual reference candidates.

    Args:
        value: A scalar cell value; NaN/None yields an empty list.

    Returns:
        The stripped, non-empty comma-separated parts of ``str(value)``.
    """
    if pd.isna(value):
        return []
    return [part.strip() for part in str(value).split(",") if part.strip()]


def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-ref container for a column, creating it if missing.

    A new container is a dict when the official refs for the column are a
    dict, otherwise a list. An existing container is returned untouched.
    """
    if column_name not in manual_refs:
        official_is_dict = isinstance(official_refs.get(column_name), dict)
        manual_refs[column_name] = {} if official_is_dict else []
    return manual_refs[column_name]


def remember_approved_value(manual_refs, official_refs, column_name, approved_value) -> int:
    """Remember approved values that are not already official or manual refs.

    Args:
        manual_refs: Mutable mapping of column name to manual refs; updated in place.
        official_refs: Mapping of column name to official refs (list or dict).
        column_name: Column whose bucket should learn the value.
        approved_value: Approved text; may hold several comma-separated values.

    Returns:
        The number of parts that were newly added to the manual bucket.
    """
    manual_bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    added_count = 0
    for part in split_approved_parts(approved_value):
        # Membership semantics are delegated to the project helper ref_contains.
        if ref_contains(official_refs.get(column_name, []), part):
            continue
        if ref_contains(manual_bucket, part):
            continue
        if isinstance(manual_bucket, list):
            manual_bucket.append(part)
        elif isinstance(manual_bucket, dict):
            manual_bucket[normalize_ref(part)] = part
        else:
            # Unexpected bucket type (e.g. hand-edited file): nothing is stored.
            continue
        added_count += 1
    return added_count


def _cell_text(row, column_name) -> str:
    """Return the stripped text of a blueprint cell, or "" when missing/NaN."""
    value = row.get(column_name)
    if pd.notna(value):
        return str(value).strip()
    return ""


def main() -> None:
    """Apply human overrides to the workbook sheet and persist learned refs.

    Exits with status 1 when the blueprint file is absent, lacks a required
    column, or the target sheet does not exist in the workbook.
    """
    args = parse_args()

    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        sys.exit(1)

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        sys.exit(1)

    # Human overrides are applied directly to the selected cleaned sheet.
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        sys.exit(1)
    sheet = wb[args.sheet]

    # Map each non-empty header in row 1 to its 1-based column index.
    col_name_to_idx = {}
    for c in range(1, sheet.max_column + 1):
        header = sheet.cell(row=1, column=c).value
        if header:
            col_name_to_idx[str(header).strip()] = c

    # Reference files use the same CLI defaults as the cleaning pipeline.
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0

    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        # Empty Human_Override means the reviewer accepted the AI suggestion.
        human_val = _cell_text(row, 'Human_Override')
        approved_val = human_val or _cell_text(row, 'AI_Suggested_Match')
        # "Confidence" is not a required column, so a blueprint without it
        # is treated as having no confidence information instead of crashing.
        confidence = _cell_text(row, 'Confidence').upper()

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        raw_col = str(row['Column']).strip()

        if human_val:
            # Blueprint row indices already include the skipped MasterMap filter row.
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError, OverflowError):
                # OverflowError: int() of an infinite float.
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue

            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue

            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue

            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # Only approved non-low-confidence values should teach future runs.
        if raw_col == "Degree":
            continue
        if not human_val and confidence == "LOW":
            continue

        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Persist workbook updates before writing the learned memory file.
    wb.save(args.input)

    # Manual refs may be written to an empty deployment volume, so ensure the folder exists.
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)

    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")


if __name__ == "__main__":
    main()