Spaces:

MasterMap
/

mastermap-cleaner

Running

File size: 7,047 Bytes

import pandas as pd
import openpyxl
import os
import json
import argparse
from src.config import (
    DEFAULT_BLUEPRINT_FILE,
    DEFAULT_OFFICIAL_REFS_FILE,
    DEFAULT_MANUAL_REFS_FILE,
    DEFAULT_OUTPUT_SHEET_NAME,
    resolve_data_path,
    resolve_ref_path,
)
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains

def parse_args():
    """Build the CLI for the apply step and return the parsed, path-resolved arguments.

    The returned namespace carries ``input``, ``blueprint``, ``refs``,
    ``manual_refs`` and ``sheet``. The two workbook names are passed through
    ``resolve_data_path`` and the two reference names through
    ``resolve_ref_path`` before the namespace is returned.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")

    # Declared as data so every flag is registered the same way, in this order.
    option_specs = (
        ("--input", {"required": True, "help": "Master Excel file name inside data/"}),
        ("--blueprint", {"default": DEFAULT_BLUEPRINT_FILE, "help": "Blueprint Excel file name inside data/"}),
        ("--refs", {"default": DEFAULT_OFFICIAL_REFS_FILE, "help": "Official references JSON file name inside refdata/"}),
        ("--manual_refs", {"default": DEFAULT_MANUAL_REFS_FILE, "help": "Manual overrides JSON file name inside refdata/"}),
        ("--sheet", {"default": DEFAULT_OUTPUT_SHEET_NAME, "help": "Workbook sheet where blueprint corrections should be applied"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    parsed = cli.parse_args()

    # Workbook names go through the data-path resolver ...
    for attr in ("input", "blueprint"):
        setattr(parsed, attr, resolve_data_path(getattr(parsed, attr)))
    # ... and reference-file names through the ref-path resolver.
    for attr in ("refs", "manual_refs"):
        setattr(parsed, attr, resolve_ref_path(getattr(parsed, attr)))

    return parsed

def load_json_safe(filepath):
    """Load a JSON memory file, falling back to an empty dict when it is unusable.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON object as a dict. An empty dict is returned
        when the file is missing or unreadable, is not valid UTF-8/JSON, or
        when its top-level value is not a JSON object (callers use ``.get`` and
        key assignment on the result, so anything else would crash later).
    """
    try:
        # utf-8-sig accepts files both with and without a leading BOM.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file absent or unreadable.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return {}

    return data if isinstance(data, dict) else {}

def split_approved_parts(value):
    """Break a comma-separated approval into its trimmed, non-empty pieces.

    A missing value (as judged by ``pd.isna``) yields an empty list. Any other
    value is converted with ``str`` before being split on commas.
    """
    if pd.isna(value):
        return []

    pieces = []
    for chunk in str(value).split(","):
        cleaned = chunk.strip()
        # Drop fragments that were only whitespace (e.g. "a,,b" or a trailing comma).
        if cleaned:
            pieces.append(cleaned)
    return pieces

def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-ref container for ``column_name``, creating it if absent.

    An existing container is returned untouched. A new one is a dict when the
    official refs for the same column are a dict, and a list otherwise; it is
    stored in ``manual_refs`` before being returned.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    # Mirror the shape of the official reference data for this column.
    official_is_mapping = isinstance(official_refs.get(column_name), dict)
    fresh_bucket = {} if official_is_mapping else []
    manual_refs[column_name] = fresh_bucket
    return fresh_bucket

def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Store each new part of an approved value in the manual refs for a column.

    The approved value is split into parts; a part is skipped when
    ``ref_contains`` finds it in the official refs for the column or in the
    column's manual bucket. List buckets get the part appended; dict buckets
    get it stored under its ``normalize_ref`` key.

    Returns:
        The number of parts that were added to the manual bucket.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    newly_learned = 0

    for candidate in split_approved_parts(approved_value):
        # Official refs are consulted first; the manual bucket only if that misses.
        already_known = (
            ref_contains(official_refs.get(column_name, []), candidate)
            or ref_contains(bucket, candidate)
        )
        if already_known:
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Buckets of any other type are left alone and nothing is counted.
            continue
        newly_learned += 1

    return newly_learned

if __name__ == "__main__":
    # Apply step: write reviewer overrides from the blueprint into the selected
    # workbook sheet, then record approved values in the manual-refs JSON file.
    args = parse_args()

    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # NOTE(review): exit status stays 0, as it was with the previous
        # exit() call; consider a non-zero status if callers can handle it.
        raise SystemExit

    bp_df = pd.read_excel(args.blueprint)
    # Every column the loop below reads must be listed here so a malformed
    # blueprint is rejected before the workbook is opened or modified.
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
        "Confidence",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit

    # Human overrides are applied directly to the selected cleaned sheet.
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit

    sheet = wb[args.sheet]
    # Map each non-empty header in row 1 to its 1-based column index.
    col_name_to_idx = {
        str(sheet.cell(row=1, column=c).value).strip(): c
        for c in range(1, sheet.max_column + 1)
        if sheet.cell(row=1, column=c).value
    }

    # Reference files use the same CLI defaults as the cleaning pipeline.
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0

    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        # Empty Human_Override means the reviewer accepted the AI suggestion.
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""
        ai_val = str(row['AI_Suggested_Match']).strip() if pd.notna(row['AI_Suggested_Match']) else ""
        approved_val = human_val or ai_val
        confidence = str(row['Confidence']).strip().upper() if pd.notna(row['Confidence']) else ""

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        # A blank Column cell is NaN; str() would turn it into the literal
        # "nan" and create a bogus "nan" bucket in the memory file.
        raw_col = str(row['Column']).strip() if pd.notna(row['Column']) else ""
        if not raw_col:
            print("Skipping blueprint row with empty Column value.")
            continue

        if human_val:
            # Blueprint row indices already include the skipped MasterMap filter row.
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError, OverflowError):
                # ValueError covers NaN and non-numeric text; OverflowError covers inf.
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue

            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue

            if excel_row == 1:
                # Row 1 holds the headers used to build col_name_to_idx;
                # overwriting it would corrupt the sheet's column names.
                print(f"Skipping override: row 1 is the header row of {args.sheet}.")
                continue

            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue

            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # Only approved non-low-confidence values should teach future runs.
        # Values for the "Degree" column are never added to memory.
        if raw_col == "Degree":
            continue

        # An AI suggestion accepted without a human override is only learned
        # when its confidence is not LOW.
        if not human_val and confidence == "LOW":
            continue

        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Persist workbook updates before writing the learned memory file.
    wb.save(args.input)

    # Manual refs may be written to an empty deployment volume, so ensure the folder exists.
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")