Spaces:
Running
Running
File size: 7,047 Bytes
dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c c6a3f44 dc06d4c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | import pandas as pd
import openpyxl
import os
import json
import argparse
from src.config import (
DEFAULT_BLUEPRINT_FILE,
DEFAULT_OFFICIAL_REFS_FILE,
DEFAULT_MANUAL_REFS_FILE,
DEFAULT_OUTPUT_SHEET_NAME,
resolve_data_path,
resolve_ref_path,
)
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains
def parse_args():
    """Build the CLI for the apply step and return the parsed, path-resolved arguments.

    ``--input`` is mandatory; every other flag falls back to the shared
    pipeline defaults.  After parsing, the workbook and blueprint names are
    passed through ``resolve_data_path`` and the two reference file names
    through ``resolve_ref_path``; the sheet name is left as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")

    # (flag, default, help text) for every optional argument, in display order.
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, default=fallback, help=description)

    parsed = cli.parse_args()

    # Replace the bare file names with the paths returned by the resolvers.
    for attr in ("input", "blueprint"):
        setattr(parsed, attr, resolve_data_path(getattr(parsed, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(parsed, attr, resolve_ref_path(getattr(parsed, attr)))
    return parsed
def load_json_safe(filepath):
    """Load a JSON memory file, falling back to an empty dict when unusable.

    The file is read as UTF-8 with an optional BOM (``utf-8-sig``).

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON object when the file exists, parses, and its top
        level is a JSON object; otherwise an empty ``dict``.  Callers index
        the result by column name, so a non-object payload (list, string,
        ``null`` ...) is treated the same as a corrupt file.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            payload = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file.  ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError (corrupt content).
        return {}
    # Guard against syntactically valid JSON that is not a mapping.
    return payload if isinstance(payload, dict) else {}
def split_approved_parts(value):
    """Break a comma-separated approval cell into its trimmed, non-empty items.

    A missing value (as judged by ``pd.isna``) yields an empty list; any other
    value is converted to text before being split on commas.
    """
    if pd.isna(value):
        return []
    trimmed_chunks = (chunk.strip() for chunk in str(value).split(","))
    # Drop the blanks left behind by doubled or trailing commas.
    return [chunk for chunk in trimmed_chunks if chunk]
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-ref container for a column, creating it on first use.

    A new container mirrors the official one: a dict when the official
    references for the column are a dict, a list in every other case
    (including when the column has no official references at all).
    An existing container is returned untouched.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    official_entry = official_refs.get(column_name)
    if isinstance(official_entry, dict):
        manual_refs[column_name] = {}
    else:
        manual_refs[column_name] = []
    return manual_refs[column_name]
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Add the not-yet-known parts of an approved value to the manual references.

    The approved value is split into individual candidates; each candidate
    that is found neither in the official references nor in the manual bucket
    for ``column_name`` is stored in that bucket (appended for list buckets,
    keyed by its normalized form for dict buckets).

    Returns the number of candidates that were newly stored.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    newly_learned = 0

    for candidate in split_approved_parts(approved_value):
        # Official refs are checked first; the manual bucket only if needed.
        already_known = (
            ref_contains(official_refs.get(column_name, []), candidate)
            or ref_contains(bucket, candidate)
        )
        if already_known:
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Unsupported bucket type: nothing is stored, nothing is counted.
            continue
        newly_learned += 1

    return newly_learned
if __name__ == "__main__":
    # Apply step of the blueprint workflow:
    #   1. read the reviewed blueprint and validate its columns,
    #   2. write every non-empty Human_Override into the target sheet,
    #   3. remember approved values in the manual reference file,
    #   4. save the workbook, then the manual reference JSON.
    args = parse_args()
    print("Loading Master Data, Blueprint, and Memory Files...")

    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # SystemExit(1) rather than the site-provided exit(): it needs no
        # site module and reports the failure through the exit status.
        raise SystemExit(1)

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)

    # Confidence is optional: it is not part of required_columns, so a
    # blueprint without it must not crash the loop below.
    has_confidence = "Confidence" in bp_df.columns

    # Human overrides are applied directly to the selected cleaned sheet.
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)
    sheet = wb[args.sheet]

    # Header text (row 1) -> 1-based column index; empty header cells are skipped.
    col_name_to_idx = {
        str(sheet.cell(row=1, column=c).value).strip(): c
        for c in range(1, sheet.max_column + 1)
        if sheet.cell(row=1, column=c).value
    }

    # Reference files use the same CLI defaults as the cleaning pipeline.
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0
    print("Applying manual overrides and updating memory...")

    for _, row in bp_df.iterrows():
        # Empty Human_Override means the reviewer accepted the AI suggestion.
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""
        if human_val:
            approved_val = human_val
        elif pd.notna(row['AI_Suggested_Match']):
            approved_val = str(row['AI_Suggested_Match']).strip()
        else:
            approved_val = ""

        confidence = ""
        if has_confidence and pd.notna(row['Confidence']):
            confidence = str(row['Confidence']).strip().upper()

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        # A blank Column cell would otherwise become the literal text "nan"
        # and could create a bogus "nan" bucket in the manual references.
        if pd.isna(row['Column']):
            print("Skipping blueprint row with an empty Column value.")
            continue
        raw_col = str(row['Column']).strip()

        if human_val:
            # Blueprint row indices already include the skipped MasterMap filter row.
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError, OverflowError):
                # ValueError covers NaN/non-numeric text, OverflowError covers inf.
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue
            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue
            # Row 1 holds the headers used to build col_name_to_idx, so the
            # first writable row is 2; never overwrite a header cell.
            if excel_row < 2 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue
            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # Only approved non-low-confidence values should teach future runs.
        if raw_col == "Degree":
            continue
        if not human_val and confidence == "LOW":
            continue
        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Persist workbook updates before writing the learned memory file.
    wb.save(args.input)

    # Manual refs may be written to an empty deployment volume, so ensure the folder exists.
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")
|