andrewbejjani's picture
Added functional doc in README.md and added basic
c6a3f44
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
import sys
import json
import openpyxl
from openpyxl.styles import PatternFill, Font
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.workbook.defined_name import DefinedName
from src.config import parse_cli_args, GROQ_API_KEY, AVAILABLE_MODELS, DEFAULT_SIMILARITY_THRESHOLD
from src.llm_router import GroqRouter
from src.data_pipeline import process_column, cluster_degrees_by_institution
from src.utils import prune_manual_refs_against_official
# Each cleaned column has its own conservative split pattern. Avoid splitting
# on words like "and" because they can be part of official country names.
COLUMNS_CONFIG = {
"Country": r',|;|\n|/',
"Institution": r'[,/;|\n]',
"Continent": r',|;|\n|/',
"City": r',|;|\n|/',
"Level": r'\n|;',
"Language": r',|;|\n|/',
"Tags": r',|;|\n|/',
"Degree": r'\n|;'
}
master_cache = {}
def load_json_safe(filepath):
"""Load reference JSON files, accepting UTF-8 files with or without a BOM."""
with open(filepath, 'r', encoding='utf-8-sig') as f:
return json.load(f)
def validate_official_refs(official_refs):
"""Fail early if required reference buckets are missing or empty."""
missing = []
for column_name in COLUMNS_CONFIG:
if column_name == "Degree":
continue
ref_data = official_refs.get(column_name)
if not isinstance(ref_data, (list, dict)) or len(ref_data) == 0:
missing.append(column_name)
if missing:
raise ValueError(
"Official references are missing or empty for: "
+ ", ".join(missing)
+ ". Refusing to run because this would send too many values to Groq."
)
def inject_searchable_dropdowns(blueprint_path, master_unique_lists):
"""Add hidden reference lists and dropdowns to the generated Blueprint."""
print("Injecting static searchable dropdowns into Blueprint...")
wb = openpyxl.load_workbook(blueprint_path)
main_sheet = wb.active
# Store all dropdown values on a hidden sheet so Excel can reference them.
ref_sheet = wb.create_sheet(title="Reference_Lists")
col_idx = 1
for column_name, unique_items in master_unique_lists.items():
safe_name = column_name.replace(" ", "_")
ref_sheet.cell(row=1, column=col_idx, value=safe_name)
# Clean and alphabetize the list for a better review experience.
valid_items = sorted([item for item in unique_items if item and isinstance(item, str)])
# Write the items
for row_idx, item in enumerate(valid_items, start=2):
ref_sheet.cell(row=row_idx, column=col_idx, value=item)
# Named ranges let data validation reference long lists safely.
if valid_items:
letter = get_column_letter(col_idx)
range_str = f"Reference_Lists!${letter}$2:${letter}${len(valid_items) + 1}"
named_range = DefinedName(name=safe_name, attr_text=range_str)
wb.defined_names.add(named_range)
col_idx += 1
# The override dropdown changes based on the row's target column.
target_col_idx = None
override_col_letter = None
for cell in main_sheet[1]:
if cell.value == "Column":
target_col_idx = get_column_letter(cell.column)
elif cell.value == "Human_Override":
override_col_letter = get_column_letter(cell.column)
if target_col_idx and override_col_letter:
dv = DataValidation(
type="list",
formula1=f'=INDIRECT(SUBSTITUTE(${target_col_idx}2, " ", "_"))',
allowBlank=True,
showErrorMessage=False # CRITICAL: This allows the user to manually type an override!
)
dv.add(f"{override_col_letter}2:{override_col_letter}{main_sheet.max_row}")
main_sheet.add_data_validation(dv)
ref_sheet.sheet_state = 'hidden'
wb.save(blueprint_path)
print("Dropdowns successfully injected!")
if __name__ == "__main__":
# Parse CLI/UI arguments before loading any expensive model assets.
args = parse_cli_args()
source_sheet_name = args.sheet
output_sheet_name = args.output_sheet
available_models = [m.strip() for m in args.models.split(",") if m.strip()] if args.models else AVAILABLE_MODELS
print("Loading AI Model (this may take a few seconds)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
# The router owns Groq fallback order and rate-limit switching.
router = GroqRouter(api_key=GROQ_API_KEY, available_models=available_models)
if not os.path.exists(args.refs):
raise FileNotFoundError(f"Official references file not found: {args.refs}")
if not os.path.exists(args.manual_refs):
os.makedirs(os.path.dirname(args.manual_refs), exist_ok=True)
with open(args.manual_refs, 'w', encoding='utf-8') as f:
json.dump({}, f)
official_refs = load_json_safe(args.refs)
manual_refs = load_json_safe(args.manual_refs)
validate_official_refs(official_refs)
# Manual memory should only contain values not already covered by official refs.
memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)
if memory_pruned:
print(f"[INFO] Removed {memory_pruned} manual reference duplicate(s) already covered by official refs.")
print(f"Loading Excel dataset from {args.input}, sheet '{source_sheet_name}'...")
data = pd.read_excel(args.input, sheet_name=source_sheet_name, skiprows=[1])
# Every uncertain or changed value is logged here for human review.
blueprint_records = []
# Run each configured column through the normalization pipeline. Degree
# values are clustered within each institution instead of matched to refs.
for col, pattern in COLUMNS_CONFIG.items():
if col == "Degree":
inst_col = 'Cleaned_Institution' if 'Cleaned_Institution' in data.columns else 'Institution'
data = cluster_degrees_by_institution(
df=data, degree_col=col, inst_col=inst_col, model=model,
master_cache=master_cache, blueprint_data=blueprint_records,
threshold=DEFAULT_SIMILARITY_THRESHOLD
)
else:
data = process_column(
df=data, column_name=col, model=model, groq_router=router,
official_refs=official_refs, manual_refs=manual_refs, master_cache=master_cache,
split_pattern=pattern, blueprint_data=blueprint_records
)
print("\nSaving all memory files...")
with open(args.manual_refs, 'w', encoding='utf-8') as f: json.dump(manual_refs, f, indent=4, ensure_ascii=False)
# Export the review workbook only when there is something to inspect.
if blueprint_records:
bp_df = pd.DataFrame(blueprint_records)
bp_df.to_excel(args.blueprint, index=False)
# Basic formatting helps reviewers scan confidence levels quickly.
bp_wb = openpyxl.load_workbook(args.blueprint)
bp_sheet = bp_wb.active
header_fill = PatternFill(start_color="1F4E78", end_color="1F4E78", fill_type="solid")
header_font = Font(color="FFFFFF", bold=True)
high_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
med_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")
low_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
conf_col_idx = None
for col_idx in range(1, bp_sheet.max_column + 1):
cell = bp_sheet.cell(row=1, column=col_idx)
cell.fill = header_fill
cell.font = header_font
if cell.value == "Confidence": conf_col_idx = col_idx
bp_sheet.column_dimensions[get_column_letter(col_idx)].width = 30
if conf_col_idx:
for row_idx in range(2, bp_sheet.max_row + 1):
cell = bp_sheet.cell(row=row_idx, column=conf_col_idx)
val = str(cell.value).upper()
if "HIGH" in val: cell.fill = high_fill
elif "MEDIUM" in val: cell.fill = med_fill
elif "LOW" in val: cell.fill = low_fill
bp_wb.save(args.blueprint)
print(f"[!] Saved and formatted {len(bp_df)} rows requiring review to {args.blueprint}")
def extract_uniques(ref_data):
"""Extract display values from list-style or dict-style references."""
if isinstance(ref_data, dict): return list(ref_data.values())
elif isinstance(ref_data, list): return ref_data
return []
master_lists = {}
for category in COLUMNS_CONFIG.keys():
off_items = extract_uniques(official_refs.get(category, []))
man_items = extract_uniques(manual_refs.get(category, []))
# Merge official and manual values for the Blueprint dropdowns.
master_lists[category] = list(set([x for x in (off_items + man_items) if x]))
inject_searchable_dropdowns(args.blueprint, master_lists)
else:
print("[!] No blueprint generated. All matches were HIGH confidence!")
# Copy the source sheet to preserve formatting, then overwrite cleaned columns.
print("\nOpening original Excel file to preserve formatting...")
wb = openpyxl.load_workbook(args.input)
new_sheet_name = output_sheet_name
if source_sheet_name == new_sheet_name:
raise ValueError("Output sheet name cannot match the source sheet name.")
source_sheet = wb[source_sheet_name]
if new_sheet_name in wb.sheetnames: del wb[new_sheet_name]
new_sheet = wb.copy_worksheet(source_sheet)
new_sheet.title = new_sheet_name
col_name_to_idx = {new_sheet.cell(row=1, column=c).value: c for c in range(1, new_sheet.max_column + 1) if new_sheet.cell(row=1, column=c).value}
for row_idx, (_, row_data) in enumerate(data.iterrows()):
excel_row = row_idx + 3
for col_name in COLUMNS_CONFIG.keys():
cleaned_col_name = f"Cleaned_{col_name}"
if cleaned_col_name in data.columns and col_name in col_name_to_idx:
new_value = row_data[cleaned_col_name]
new_sheet.cell(row=excel_row, column=col_name_to_idx[col_name]).value = None if pd.isna(new_value) else new_value
wb.save(args.input)
print(f"\nSuccess! Initial pass saved. Please review {args.blueprint}.")