Spaces:
Running
Running
| import pandas as pd | |
| import openpyxl | |
| import os | |
| import json | |
| import argparse | |
| from src.config import ( | |
| DEFAULT_BLUEPRINT_FILE, | |
| DEFAULT_OFFICIAL_REFS_FILE, | |
| DEFAULT_MANUAL_REFS_FILE, | |
| DEFAULT_OUTPUT_SHEET_NAME, | |
| resolve_data_path, | |
| resolve_ref_path, | |
| ) | |
| from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains | |
def parse_args():
    """Parse the command line and resolve every file argument to a path.

    ``--input`` is mandatory; the remaining options fall back to the
    defaults imported from ``src.config``. After parsing, ``input`` and
    ``blueprint`` are passed through ``resolve_data_path`` while ``refs``
    and ``manual_refs`` are passed through ``resolve_ref_path``.

    Returns:
        The ``argparse.Namespace`` with the four path attributes replaced
        by their resolved values and ``sheet`` left as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")

    # (flag, default, help) for every option that has a configured default.
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, fallback, help_text in optional_flags:
        cli.add_argument(flag, default=fallback, help=help_text)

    parsed = cli.parse_args()

    # Swap the bare file names for resolved paths, in place on the namespace.
    for attr in ("input", "blueprint"):
        setattr(parsed, attr, resolve_data_path(getattr(parsed, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(parsed, attr, resolve_ref_path(getattr(parsed, attr)))
    return parsed
def load_json_safe(filepath):
    """Load a JSON object from *filepath*, falling back to an empty dict.

    The file is read as UTF-8 with an optional byte-order mark
    (``utf-8-sig``).

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON object when the file holds a dict. An empty dict
        when the file does not exist, cannot be read, is not valid JSON,
        or decodes to something other than a dict.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing file is the normal "nothing stored yet" case: stay quiet.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Report the problem instead of hiding it, but keep the best-effort
        # contract of returning an empty mapping.
        print(f"Warning: could not load JSON from {filepath}: {exc}")
        return {}

    if not isinstance(data, dict):
        # Callers use .get() and item assignment, so anything other than a
        # dict (e.g. a top-level list) would crash them later.
        print(f"Warning: expected a JSON object in {filepath}, got {type(data).__name__}; ignoring it.")
        return {}
    return data
def split_approved_parts(value):
    """Break a comma-separated cell value into its trimmed, non-empty parts.

    Args:
        value: A cell value; anything ``pd.isna`` treats as missing yields
            an empty list, everything else is converted with ``str``.

    Returns:
        The comma-separated pieces in their original order, each stripped
        of surrounding whitespace, with empty pieces dropped.
    """
    if pd.isna(value):
        return []
    trimmed = (chunk.strip() for chunk in str(value).split(","))
    return [piece for piece in trimmed if piece]
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-reference container for *column_name*, creating it if absent.

    A new container mirrors the official one: a dict when
    ``official_refs[column_name]`` is a dict, otherwise a list. An
    existing container is returned untouched, whatever its type.

    Args:
        manual_refs: Mapping of column name to manual references; mutated
            in place when the column has no entry yet.
        official_refs: Mapping of column name to official references.
        column_name: Column whose container is requested.

    Returns:
        The container stored under ``manual_refs[column_name]``.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]

    if isinstance(official_refs.get(column_name), dict):
        fresh_bucket = {}
    else:
        fresh_bucket = []
    manual_refs[column_name] = fresh_bucket
    return fresh_bucket
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Store the new parts of an approved value in the manual references.

    *approved_value* is split on commas; a part is skipped when
    ``ref_contains`` reports it in the official references for the column
    or in the manual container already. Remaining parts are appended to a
    list container, or stored in a dict container under
    ``normalize_ref(part)``.

    Args:
        manual_refs: Mapping of column name to manual references; mutated
            in place.
        official_refs: Mapping of column name to official references.
        column_name: Column the approved value belongs to.
        approved_value: Raw approved cell value, possibly comma-separated.

    Returns:
        The number of parts that were added to the manual container.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    stored = 0

    for candidate in split_approved_parts(approved_value):
        known_officially = ref_contains(official_refs.get(column_name, []), candidate)
        if known_officially or ref_contains(bucket, candidate):
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Containers of any other type are left alone and nothing is counted.
            continue
        stored += 1

    return stored
def main():
    """Apply reviewed blueprint rows to the workbook and update manual references.

    Steps:
      1. Read the blueprint spreadsheet and check its required columns.
      2. Open the target sheet of the master workbook and index its header
         row (row 1) by column name.
      3. For each blueprint row, write a non-empty ``Human_Override`` into
         the addressed cell, then record the approved value (the override,
         or else the AI suggestion) in the manual references.
      4. Prune the manual references against the official ones, save the
         workbook in place and dump the manual references as JSON.

    Raises:
        SystemExit: With status 1 when the blueprint or master workbook is
            missing, the blueprint lacks a required column, or the target
            sheet does not exist.
    """
    args = parse_args()
    print("Loading Master Data, Blueprint, and Memory Files...")

    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        raise SystemExit(1)

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)

    # Load the target Excel workbook
    if not os.path.exists(args.input):
        print(f"Error: No master workbook found at {args.input}.")
        raise SystemExit(1)
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)
    sheet = wb[args.sheet]

    # Map each non-empty header in row 1 to its 1-based column index.
    col_name_to_idx = {}
    for c in range(1, sheet.max_column + 1):
        header = sheet.cell(row=1, column=c).value
        if header:
            col_name_to_idx[str(header).strip()] = c

    # Load the memory dictionaries using the synced CLI path
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0
    print("Applying manual overrides and updating memory...")

    for _, row in bp_df.iterrows():
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""
        ai_val = str(row['AI_Suggested_Match']).strip() if pd.notna(row['AI_Suggested_Match']) else ""
        # A human override always wins over the AI suggestion.
        approved_val = human_val if human_val else ai_val

        # "Confidence" is not a required column, so read it defensively;
        # a missing column behaves like an empty confidence value.
        confidence_cell = row.get('Confidence')
        confidence = str(confidence_cell).strip().upper() if pd.notna(confidence_cell) else ""

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        # A blank Column cell would otherwise be stringified to "nan" and
        # could create a bogus "nan" entry in the manual references.
        if pd.isna(row['Column']):
            print("Skipping blueprint row with empty Column.")
            continue
        raw_col = str(row['Column']).strip()

        if human_val:
            # 1. Update the Excel File
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError):
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue
            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue
            # NOTE(review): row 1 holds the headers yet is accepted here;
            # confirm whether Row_Index can legitimately be 1.
            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue
            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # 2. Update Manual References for human overrides and accepted AI suggestions.
        if raw_col == "Degree":
            # "Degree" values are never remembered.
            continue
        if not human_val and confidence == "LOW":
            # Unreviewed low-confidence AI suggestions are not remembered.
            continue
        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    # The return value is reported below as the number of cleaned-up entries.
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Save Excel
    wb.save(args.input)

    # Save JSONs
    # Make sure the manual references directory exists before dumping
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")


if __name__ == "__main__":
    main()