File size: 7,047 Bytes
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6a3f44
dc06d4c
 
c6a3f44
dc06d4c
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import pandas as pd
import openpyxl
import os
import json
import argparse
from src.config import (
    DEFAULT_BLUEPRINT_FILE,
    DEFAULT_OFFICIAL_REFS_FILE,
    DEFAULT_MANUAL_REFS_FILE,
    DEFAULT_OUTPUT_SHEET_NAME,
    resolve_data_path,
    resolve_ref_path,
)
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains

def parse_args():
    """Build the command line for the apply step and return the parsed namespace.

    ``--input`` is mandatory; every other flag falls back to a default taken
    from ``src.config``. Before returning, the workbook and Blueprint names are
    passed through ``resolve_data_path`` and the two reference file names
    through ``resolve_ref_path``. ``--sheet`` is returned exactly as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")

    # (flag, default, help) for every optional argument, in display order.
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, default=fallback, help=description)

    namespace = cli.parse_args()

    # Swap the bare file names for the paths produced by the config resolvers.
    for attr in ("input", "blueprint"):
        setattr(namespace, attr, resolve_data_path(getattr(namespace, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(namespace, attr, resolve_ref_path(getattr(namespace, attr)))
    return namespace

def load_json_safe(filepath):
    """Load a JSON memory file, falling back to an empty dict.

    The file is read as UTF-8 with an optional byte-order mark.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON object when the file exists, parses, and its root is
        a JSON object (``dict``). An empty dict when the file is missing,
        unreadable, not valid JSON/UTF-8, or its root is any other JSON type.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: json.JSONDecodeError and UnicodeDecodeError both derive
        # from it, so this covers corrupt content without hiding other bugs.
        return {}

    # Callers use .get() and key assignment on the result, so a root that is
    # a list, string, number, or null is treated like a corrupt file.
    return loaded if isinstance(loaded, dict) else {}

def split_approved_parts(value):
    """Break a comma-separated approval into its individual, trimmed entries.

    A missing value (``None``/NaN as judged by ``pd.isna``) yields an empty
    list. Anything else is converted to text, split on commas, stripped of
    surrounding whitespace, and entries that end up empty are dropped.
    """
    if pd.isna(value):
        return []

    trimmed_pieces = (chunk.strip() for chunk in str(value).split(","))
    return [piece for piece in trimmed_pieces if piece]

def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-ref container for ``column_name``, creating it if needed.

    An existing container is returned untouched. Otherwise a new one is stored
    in ``manual_refs``: a dict when the official entry for the column is a
    dict, and a list in every other case (including no official entry).
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    # Mirror the shape of the official reference data for this column.
    official_entry = official_refs.get(column_name)
    if isinstance(official_entry, dict):
        fresh_bucket = {}
    else:
        fresh_bucket = []

    manual_refs[column_name] = fresh_bucket
    return manual_refs[column_name]

def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Store each approved part that neither reference set already contains.

    ``approved_value`` is split with ``split_approved_parts``. A part is
    skipped when ``ref_contains`` reports it in the official entry for the
    column or in the manual bucket. Remaining parts are appended to a list
    bucket, or stored in a dict bucket under ``normalize_ref(part)``.

    Returns:
        The number of parts written to the manual bucket.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    newly_stored = 0

    for candidate in split_approved_parts(approved_value):
        known_officially = ref_contains(official_refs.get(column_name, []), candidate)
        if known_officially or ref_contains(bucket, candidate):
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Buckets of any other type are left alone and nothing is counted.
            continue
        newly_stored += 1

    return newly_stored

def _abort(message):
    """Print ``message`` and stop the script with a non-zero exit status."""
    print(message)
    # SystemExit is always available, unlike the ``exit`` helper that the
    # ``site`` module installs for interactive use, and status 1 lets a calling
    # pipeline see that the apply step failed.
    raise SystemExit(1)


def _cell_text(value):
    """Return a blueprint cell as stripped text, or an empty string if missing."""
    return str(value).strip() if pd.notna(value) else ""


def main():
    """Apply reviewed Blueprint decisions to the workbook and update manual refs.

    Steps:
      1. Load the Blueprint and check that its required columns are present.
      2. Open the target sheet and map header text in row 1 to column numbers.
      3. For each Blueprint row, write a non-empty ``Human_Override`` into the
         sheet, then remember the approved value as a manual reference unless
         the column is ``Degree`` or the value is an unreviewed LOW-confidence
         AI suggestion.
      4. Prune manual refs against the official refs, save the workbook, and
         write the manual refs JSON.

    Exits with status 1 when the Blueprint, the workbook, the requested sheet,
    or a required Blueprint column is missing.
    """
    args = parse_args()

    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        _abort(f"Error: No blueprint found at {args.blueprint} to apply.")

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        _abort(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")

    # Confidence is optional: without it no suggestion can be classed as LOW.
    has_confidence = "Confidence" in bp_df.columns

    # Human overrides are applied directly to the selected cleaned sheet.
    if not os.path.exists(args.input):
        _abort(f"Error: No workbook found at {args.input}.")
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        _abort(f"Error: No '{args.sheet}' sheet found in {args.input}.")

    sheet = wb[args.sheet]
    header_row = 1
    col_name_to_idx = {}
    for c in range(1, sheet.max_column + 1):
        header = sheet.cell(row=header_row, column=c).value
        if header:
            col_name_to_idx[str(header).strip()] = c

    # Reference files use the same CLI defaults as the cleaning pipeline.
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0

    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        # Empty Human_Override means the reviewer accepted the AI suggestion.
        human_val = _cell_text(row['Human_Override'])
        approved_val = human_val or _cell_text(row['AI_Suggested_Match'])
        confidence = _cell_text(row['Confidence']).upper() if has_confidence else ""

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        raw_col = str(row['Column']).strip()

        if human_val:
            # Blueprint row indices already include the skipped MasterMap filter row.
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError):
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue

            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue

            # Row 1 holds the headers the column map was built from; writing
            # there would rename a column instead of correcting a data cell.
            if excel_row == header_row:
                print(f"Skipping override: row {excel_row} is the header row of {args.sheet}.")
                continue

            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue

            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # Degree values are never added to the manual reference memory.
        if raw_col == "Degree":
            continue

        # Only approved non-low-confidence values should teach future runs.
        if not human_val and confidence == "LOW":
            continue

        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Persist workbook updates before writing the learned memory file.
    wb.save(args.input)

    # Manual refs may be written to an empty deployment volume, so ensure the folder exists.
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")


if __name__ == "__main__":
    main()