Spaces:

MasterMap
/

mastermap-cleaner

Running

App Files Files Community

andrewbejjani committed 4 days ago

Commit

dc06d4c

0 Parent(s):

Initial clean commit

Browse files

Files changed (19) hide show

.gitattributes +36 -0
Dockerfile +16 -0
README.md +14 -0
apply_blueprint.py +169 -0
main.py +242 -0
newest_model.py +123 -0
refdata/manual_references.json +71 -0
refdata/official_references.json +650 -0
requirements.txt +11 -0
src/config.py +60 -0
src/data_pipeline.py +431 -0
src/llm_router.py +122 -0
src/process_runner.py +67 -0
src/utils.py +152 -0
src/workbook_io.py +48 -0
ui/static/app.js +463 -0
ui/static/styles.css +225 -0
ui/templates/index.html +136 -0
ui_app.py +342 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+* text=auto eol=lf
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

FROM python:3.12-slim
# Hugging Face Docker Spaces run the container as UID 1000. Create that user so
# the files the app writes at runtime (refdata/, data/, model cache) are
# writable instead of being owned by root.
RUN useradd --create-home --uid 1000 user
ENV PYTHONUNBUFFERED=1
ENV HF_HOME=/app/.cache/huggingface
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY --chown=user:user . .
# WORKDIR created /app as root; hand the directory itself to the runtime user.
RUN chown user:user /app
USER user
# Pre-download the embedding model at build time so the cache is owned by the
# runtime user and no download is needed on first request.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "1", "--timeout", "3600", "ui_app:app"]

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: MasterMap Cleaner
+sdk: docker
+app_port: 7860
+---
+## Hugging Face Secrets
+Set these in the Space settings:
+- `GROQ_API_KEY`: required for Groq model calls.
+- `HF_TOKEN`: optional, required only for the `Save Manual References` button.
+The `Save Manual References` button is only enabled on Hugging Face Spaces, when `SPACE_ID` is present and `HF_TOKEN` is configured. It commits the current `refdata/manual_references.json` back to the Space repository.

apply_blueprint.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import pandas as pd
+import openpyxl
+import os
+import json
+import argparse
+from src.config import (
+    DEFAULT_BLUEPRINT_FILE,
+    DEFAULT_OFFICIAL_REFS_FILE,
+    DEFAULT_MANUAL_REFS_FILE,
+    DEFAULT_OUTPUT_SHEET_NAME,
+    resolve_data_path,
+    resolve_ref_path,
+)
+from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains
def parse_args():
    """Parse the command line and turn file names into resolved paths.

    ``--input`` and ``--blueprint`` are passed through ``resolve_data_path``;
    ``--refs`` and ``--manual_refs`` through ``resolve_ref_path``. ``--sheet``
    is returned exactly as given.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")
    optional_flags = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, default=fallback, help=description)
    namespace = cli.parse_args()
    # Resolve in the same order as the flags are declared.
    for attr in ("input", "blueprint"):
        setattr(namespace, attr, resolve_data_path(getattr(namespace, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(namespace, attr, resolve_ref_path(getattr(namespace, attr)))
    return namespace
def load_json_safe(filepath):
    """Load a JSON file, falling back to ``{}`` when it is unavailable.

    The file is read as UTF-8 and a leading byte-order mark is tolerated.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty dict when the file is missing,
        unreadable, or not valid JSON.
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return json.load(f)
    except FileNotFoundError:
        # A memory file that does not exist yet is an expected starting state.
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. Stay
        # best-effort, but say so: the caller later rewrites the manual
        # references file, so a silent fallback would hide the loss of its
        # previous contents.
        print(f"Warning: could not load JSON from {filepath}: {exc}")
        return {}
def split_approved_parts(value):
    """Break a comma-separated value into trimmed, non-empty pieces.

    A missing value (as judged by ``pd.isna``) yields an empty list; anything
    else is converted to ``str`` before splitting.
    """
    if pd.isna(value):
        return []
    trimmed = map(str.strip, str(value).split(","))
    # filter(None, ...) drops the empty strings left by blank segments.
    return list(filter(None, trimmed))
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-reference container for a column, creating it if absent.

    A new container is a dict when the official references for the same
    column are a dict, and a list otherwise. The container is stored in
    ``manual_refs`` under ``column_name`` and that same object is returned.
    """
    if column_name in manual_refs:
        return manual_refs[column_name]
    official_is_mapping = isinstance(official_refs.get(column_name), dict)
    fresh_bucket = {} if official_is_mapping else []
    manual_refs[column_name] = fresh_bucket
    return fresh_bucket
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Store the parts of an approved value that are not yet known.

    The value is split with ``split_approved_parts``. A part is skipped when
    ``ref_contains`` finds it in the official references for the column or in
    the manual bucket. List buckets get the part appended; dict buckets get it
    stored under ``normalize_ref(part)``.

    Returns:
        The number of parts that were added to the manual bucket.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    newly_stored = 0
    for candidate in split_approved_parts(approved_value):
        already_known = (
            ref_contains(official_refs.get(column_name, []), candidate)
            or ref_contains(bucket, candidate)
        )
        if already_known:
            continue
        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Unsupported bucket type: nothing is stored, nothing is counted.
            continue
        newly_stored += 1
    return newly_stored
if __name__ == "__main__":
    # Entry point: write reviewed blueprint overrides into the workbook sheet
    # and record approved values in the manual references JSON file.
    args = parse_args()
    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # Exit non-zero so a calling process can tell that nothing was applied.
        raise SystemExit(1)
    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)
    # "Confidence" is consulted below but is not a required column; a blueprint
    # without it is treated as having no confidence information.
    has_confidence = "Confidence" in bp_df.columns
    # Load the target Excel workbook
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)
    sheet = wb[args.sheet]
    # Map each non-empty header cell in row 1 to its 1-based column index.
    col_name_to_idx = {
        str(sheet.cell(row=1, column=c).value).strip(): c
        for c in range(1, sheet.max_column + 1)
        if sheet.cell(row=1, column=c).value
    }
    # Load the memory dictionaries using the synced CLI path
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)
    changes_made = 0
    memory_additions = 0
    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""
        # A human override wins; otherwise fall back to the AI suggestion.
        if human_val:
            approved_val = human_val
        elif pd.notna(row['AI_Suggested_Match']):
            approved_val = str(row['AI_Suggested_Match']).strip()
        else:
            approved_val = ""
        confidence = ""
        if has_confidence and pd.notna(row['Confidence']):
            confidence = str(row['Confidence']).strip().upper()
        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue
        raw_col = str(row['Column']).strip()
        if human_val:
            # 1. Update the Excel File
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError):
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue
            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue
            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue
            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1
        # 2. Update Manual References for human overrides and accepted AI suggestions.
        if raw_col == "Degree":
            continue
        # Unreviewed LOW-confidence suggestions are not remembered.
        if not human_val and confidence == "LOW":
            continue
        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)
    # Save Excel
    wb.save(args.input)
    # Save JSONs
    # Make sure the target directory exists before dumping
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)
    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")

main.py ADDED Viewed

	@@ -0,0 +1,242 @@

+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import os
+import sys
+import json
+import openpyxl
+from openpyxl.styles import PatternFill, Font
+from openpyxl.utils import get_column_letter
+from openpyxl.worksheet.datavalidation import DataValidation
+from openpyxl.workbook.defined_name import DefinedName
+# Import our new modular architecture
+from src.config import parse_cli_args, GROQ_API_KEY, AVAILABLE_MODELS, DEFAULT_SIMILARITY_THRESHOLD
+from src.llm_router import GroqRouter
+from src.data_pipeline import process_column, cluster_degrees_by_institution
+from src.utils import prune_manual_refs_against_official
+# --- 1. CONFIGURATION ---
# Regex used to split multi-valued cells on comma, semicolon, newline or slash.
_LIST_DELIMITERS = r',|;|\n|/'
# Regex used for columns that are split on newline or semicolon only.
_LINE_DELIMITERS = r'\n|;'
# Column name -> split regex. Insertion order is the processing order, and
# "Degree" is last so it runs after "Institution".
COLUMNS_CONFIG = {
    "Country": _LIST_DELIMITERS,
    "Institution": r'[,/;|\n]',
    "Continent": _LIST_DELIMITERS,
    "City": _LIST_DELIMITERS,
    "Level": _LINE_DELIMITERS,
    "Language": _LIST_DELIMITERS,
    "Tags": _LIST_DELIMITERS,
    "Degree": _LINE_DELIMITERS,
}
# Shared cache object handed to the pipeline functions.
master_cache = {}
def load_json_safe(filepath):
    """Read a UTF-8 JSON file (a leading BOM is tolerated) and return its content.

    Errors from opening or decoding the file are not caught here.
    """
    with open(filepath, encoding='utf-8-sig') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def validate_official_refs(official_refs):
    """Ensure every configured column except "Degree" has official references.

    Raises:
        ValueError: when a column's entry is not a non-empty list or dict.
    """
    def _is_usable(entry):
        return isinstance(entry, (list, dict)) and len(entry) > 0

    missing = [
        name
        for name in COLUMNS_CONFIG
        if name != "Degree" and not _is_usable(official_refs.get(name))
    ]
    if not missing:
        return
    raise ValueError(
        "Official references are missing or empty for: "
        + ", ".join(missing)
        + ". Refusing to run because this would send too many values to Groq."
    )
def inject_searchable_dropdowns(blueprint_path, master_unique_lists):
    """Inject static dropdown lists into the Blueprint workbook.

    A hidden "Reference_Lists" sheet receives one column per category, each
    non-empty column is exposed as a workbook named range, and the
    "Human_Override" column of the active sheet gets a list validation that
    looks up the range named after the row's "Column" value.

    Args:
        blueprint_path: Path of the workbook to modify in place.
        master_unique_lists: Mapping of category name to an iterable of
            allowed values; non-string and empty items are ignored.
    """
    print("Injecting static searchable dropdowns into Blueprint...")
    wb = openpyxl.load_workbook(blueprint_path)
    main_sheet = wb.active
    # 1. Create the Reference Sheet
    # Drop a sheet left over from a previous run first: otherwise openpyxl
    # would create "Reference_Lists1" while the named ranges below keep
    # pointing at the stale "Reference_Lists".
    if "Reference_Lists" in wb.sheetnames and main_sheet.title != "Reference_Lists":
        del wb["Reference_Lists"]
    ref_sheet = wb.create_sheet(title="Reference_Lists")
    for col_idx, (column_name, unique_items) in enumerate(master_unique_lists.items(), start=1):
        # Named ranges cannot contain spaces, so use underscores instead.
        safe_name = column_name.replace(" ", "_")
        # Write the header
        ref_sheet.cell(row=1, column=col_idx, value=safe_name)
        # Clean and alphabetize the list for a better user experience
        valid_items = sorted(item for item in unique_items if item and isinstance(item, str))
        # Write the items
        for row_idx, item in enumerate(valid_items, start=2):
            ref_sheet.cell(row=row_idx, column=col_idx, value=item)
        # 2. Create the Excel "Named Range"
        if valid_items:
            letter = get_column_letter(col_idx)
            range_str = f"Reference_Lists!${letter}$2:${letter}${len(valid_items) + 1}"
            named_range = DefinedName(name=safe_name, attr_text=range_str)
            wb.defined_names.add(named_range)
    # 3. Locate Target & Override Columns
    target_col_letter = None
    override_col_letter = None
    for cell in main_sheet[1]:
        if cell.value == "Column":
            target_col_letter = get_column_letter(cell.column)
        elif cell.value == "Human_Override":
            override_col_letter = get_column_letter(cell.column)
    # 4. Apply Data Validation
    if target_col_letter and override_col_letter:
        dv = DataValidation(
            type="list",
            formula1=f'=INDIRECT(SUBSTITUTE(${target_col_letter}2, " ", "_"))',
            allowBlank=True,
            showErrorMessage=False # CRITICAL: This allows the user to manually type an override!
        )
        # Never build an inverted range such as "E2:E1" when the sheet holds
        # only the header row.
        last_row = max(main_sheet.max_row, 2)
        dv.add(f"{override_col_letter}2:{override_col_letter}{last_row}")
        main_sheet.add_data_validation(dv)
    ref_sheet.sheet_state = 'hidden'
    wb.save(blueprint_path)
    print("Dropdowns successfully injected!")
if __name__ == "__main__":
    # Entry point: clean the configured columns of the source sheet, write a
    # review blueprint, and copy the cleaned values into an output sheet.
    # --- 2. INITIALIZATION ---
    args = parse_cli_args()
    source_sheet_name = args.sheet
    output_sheet_name = args.output_sheet
    # Fail fast on cheap configuration errors, before loading the embedding
    # model or issuing any LLM calls.
    if source_sheet_name == output_sheet_name:
        raise ValueError("Output sheet name cannot match the source sheet name.")
    if not os.path.exists(args.refs):
        raise FileNotFoundError(f"Official references file not found: {args.refs}")
    available_models = [m.strip() for m in args.models.split(",") if m.strip()] if args.models else AVAILABLE_MODELS
    print("Loading AI Model (this may take a few seconds)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Initialize our LLM Router
    router = GroqRouter(api_key=GROQ_API_KEY, available_models=available_models)
    if not os.path.exists(args.manual_refs):
        # Start with an empty memory file. Only create the directory when the
        # path has one: os.makedirs("") raises FileNotFoundError.
        manual_refs_dir = os.path.dirname(args.manual_refs)
        if manual_refs_dir:
            os.makedirs(manual_refs_dir, exist_ok=True)
        with open(args.manual_refs, 'w', encoding='utf-8') as f:
            json.dump({}, f)
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)
    validate_official_refs(official_refs)
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)
    if memory_pruned:
        print(f"[INFO] Removed {memory_pruned} manual reference duplicate(s) already covered by official refs.")
    print(f"Loading Excel dataset from {args.input}, sheet '{source_sheet_name}'...")
    # skiprows=[1] drops the second spreadsheet row; the first row is the header.
    data = pd.read_excel(args.input, sheet_name=source_sheet_name, skiprows=[1])
    # Initialize the global Blueprint Logger
    blueprint_records = []
    # --- 3. EXECUTE BATCH PIPELINE ---
    for col, pattern in COLUMNS_CONFIG.items():
        if col == "Degree":
            # Prefer the cleaned institution column when an earlier pass produced it.
            inst_col = 'Cleaned_Institution' if 'Cleaned_Institution' in data.columns else 'Institution'
            data = cluster_degrees_by_institution(
                df=data, degree_col=col, inst_col=inst_col, model=model,
                master_cache=master_cache, blueprint_data=blueprint_records,
                threshold=DEFAULT_SIMILARITY_THRESHOLD
            )
        else:
            data = process_column(
                df=data, column_name=col, model=model, groq_router=router,
                official_refs=official_refs, manual_refs=manual_refs, master_cache=master_cache,
                split_pattern=pattern, blueprint_data=blueprint_records
            )
    # --- 4. EXPORT RESULTS ---
    print("\nSaving all memory files...")
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)
    # 4a. Export the Blueprint for Human Review
    if blueprint_records:
        bp_df = pd.DataFrame(blueprint_records)
        bp_df.to_excel(args.blueprint, index=False)
        # --- Format the Blueprint Visually ---
        bp_wb = openpyxl.load_workbook(args.blueprint)
        bp_sheet = bp_wb.active
        header_fill = PatternFill(start_color="1F4E78", end_color="1F4E78", fill_type="solid")
        header_font = Font(color="FFFFFF", bold=True)
        high_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        med_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")
        low_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")
        conf_col_idx = None
        for col_idx in range(1, bp_sheet.max_column + 1):
            cell = bp_sheet.cell(row=1, column=col_idx)
            cell.fill = header_fill
            cell.font = header_font
            if cell.value == "Confidence":
                conf_col_idx = col_idx
            bp_sheet.column_dimensions[get_column_letter(col_idx)].width = 30
        if conf_col_idx:
            # Colour each confidence cell by the level named in its text.
            for row_idx in range(2, bp_sheet.max_row + 1):
                cell = bp_sheet.cell(row=row_idx, column=conf_col_idx)
                val = str(cell.value).upper()
                if "HIGH" in val:
                    cell.fill = high_fill
                elif "MEDIUM" in val:
                    cell.fill = med_fill
                elif "LOW" in val:
                    cell.fill = low_fill
        bp_wb.save(args.blueprint)
        print(f"[!] Saved and formatted {len(bp_df)} rows requiring review to {args.blueprint}")
        # --- NEW: Build master lists and inject dropdowns ---
        def extract_uniques(ref_data):
            """Helper to extract names whether the memory file is a list or a dict"""
            if isinstance(ref_data, dict):
                return list(ref_data.values())
            if isinstance(ref_data, list):
                return ref_data
            return []
        master_lists = {}
        for category in COLUMNS_CONFIG:
            off_items = extract_uniques(official_refs.get(category, []))
            man_items = extract_uniques(manual_refs.get(category, []))
            # Merge, deduplicate, and remove blanks
            master_lists[category] = list({x for x in (off_items + man_items) if x})
        inject_searchable_dropdowns(args.blueprint, master_lists)
    else:
        print("[!] No blueprint generated. All matches were HIGH confidence!")
    # 4b. Inject Cleaned Data to Mastermap
    print("\nOpening original Excel file to preserve formatting...")
    wb = openpyxl.load_workbook(args.input)
    source_sheet = wb[source_sheet_name]
    # Replace any output sheet left over from a previous run.
    if output_sheet_name in wb.sheetnames:
        del wb[output_sheet_name]
    new_sheet = wb.copy_worksheet(source_sheet)
    new_sheet.title = output_sheet_name
    col_name_to_idx = {
        new_sheet.cell(row=1, column=c).value: c
        for c in range(1, new_sheet.max_column + 1)
        if new_sheet.cell(row=1, column=c).value
    }
    for row_idx, (_, row_data) in enumerate(data.iterrows()):
        # Row 1 is the header and row 2 was skipped on load, so the first
        # DataFrame row corresponds to spreadsheet row 3.
        excel_row = row_idx + 3
        for col_name in COLUMNS_CONFIG:
            cleaned_col_name = f"Cleaned_{col_name}"
            if cleaned_col_name in data.columns and col_name in col_name_to_idx:
                new_value = row_data[cleaned_col_name]
                new_sheet.cell(row=excel_row, column=col_name_to_idx[col_name]).value = None if pd.isna(new_value) else new_value
    wb.save(args.input)
    print(f"\nSuccess! Initial pass saved. Please review {args.blueprint}.")

newest_model.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import argparse
+import os
+from typing import Any
+import requests
+from dotenv import load_dotenv
+load_dotenv()
GROQ_MODELS_URL = "https://api.groq.com/openai/v1/models"
# Models selected from Groq's rate-limit table for this workload: relevant
# chat/reasoning models with at least 1K requests per day.
# List order is the fallback order used by the "stable" strategy.
# The Llama Prompt Guard 2 models are deliberately absent: they are
# prompt-attack classifiers, not chat models, so they cannot serve as a
# fallback for text generation.
PREFERRED_PRODUCTION_CHAT_MODELS = [
    "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b",
    "qwen/qwen3-32b",
    "llama-3.1-8b-instant",
    "openai/gpt-oss-20b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
]
# Substrings that disqualify a model id (speech, embedding, safety and
# classifier models).
EXCLUDED_MODEL_TERMS = [
    "whisper",
    "tts",
    "audio",
    "moderation",
    "embed",
    "compound",
    "safeguard",
    "prompt-guard",
]
# Lower-cased ids for case-insensitive membership checks.
PREFERRED_MODEL_IDS = {model_id.lower() for model_id in PREFERRED_PRODUCTION_CHAT_MODELS}
def fetch_groq_models(api_key: str) -> list[dict[str, Any]]:
    """GET the Groq models endpoint and return the ``data`` list of the reply.

    The request uses bearer authentication and a 30-second timeout. HTTP error
    statuses raise via ``raise_for_status``; a reply without a ``data`` key
    yields an empty list.
    """
    reply = requests.get(
        GROQ_MODELS_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        timeout=30,
    )
    reply.raise_for_status()
    payload = reply.json()
    return payload.get("data", [])
def is_active_chat_model(model: dict[str, Any]) -> bool:
    """Tell whether a model record should be considered for selection.

    A record qualifies when it has an id, is not explicitly marked
    ``active: False``, its lower-cased id is in ``PREFERRED_MODEL_IDS``, and
    the id contains none of ``EXCLUDED_MODEL_TERMS``.
    """
    identifier = str(model.get("id", "")).lower()
    is_candidate = (
        bool(identifier)
        and model.get("active") is not False
        and identifier in PREFERRED_MODEL_IDS
    )
    if not is_candidate:
        return False
    for term in EXCLUDED_MODEL_TERMS:
        if term in identifier:
            return False
    return True
def rank_model(model: dict[str, Any]) -> tuple[int, int, int, str]:
    """Build the sort key used by the "stable" strategy (lower sorts first).

    The key is: position in ``PREFERRED_PRODUCTION_CHAT_MODELS`` (the list
    length when the id is not listed), negated ``created``, negated
    ``context_window``, then the model id as a final tie-breaker.
    """
    identifier = str(model.get("id", ""))
    needle = identifier.lower()
    position = next(
        (
            idx
            for idx, known in enumerate(PREFERRED_PRODUCTION_CHAT_MODELS)
            if known.lower() == needle
        ),
        len(PREFERRED_PRODUCTION_CHAT_MODELS),
    )
    # Negate so that newer models and larger context windows sort earlier.
    newest_first = -int(model.get("created") or 0)
    widest_first = -int(model.get("context_window") or 0)
    return (position, newest_first, widest_first, identifier)
def select_groq_chat_models(limit: int = len(PREFERRED_PRODUCTION_CHAT_MODELS), strategy: str = "stable") -> list[str]:
    """Return up to ``limit`` model ids from the Groq Models API, best first.

    With ``strategy="latest"`` models are ordered by newest ``created``, then
    largest ``context_window``, then id. Any other strategy orders them with
    ``rank_model``.

    Raises:
        RuntimeError: when the ``GROQ_API_KEY`` environment variable is unset
            or empty.
    """
    token = os.getenv("GROQ_API_KEY")
    if not token:
        raise RuntimeError("GROQ_API_KEY is missing. Add it to .env first.")
    candidates = list(filter(is_active_chat_model, fetch_groq_models(token)))

    def _latest_first(entry):
        return (
            -int(entry.get("created") or 0),
            -int(entry.get("context_window") or 0),
            str(entry.get("id", "")),
        )

    sort_key = _latest_first if strategy == "latest" else rank_model
    candidates.sort(key=sort_key)
    return [entry["id"] for entry in candidates[:limit]]
def main() -> None:
    """CLI entry point: print the selected Groq models as a GROQ_MODEL value.

    Raises:
        RuntimeError: when the selection comes back empty.
    """
    cli = argparse.ArgumentParser(description="Select currently available Groq chat models.")
    cli.add_argument(
        "--limit",
        type=int,
        default=len(PREFERRED_PRODUCTION_CHAT_MODELS),
        help="Number of fallback models to print.",
    )
    cli.add_argument(
        "--strategy",
        choices=["stable", "latest"],
        default="stable",
        help="stable prefers known production chat models; latest picks newest active chat models.",
    )
    options = cli.parse_args()
    selected = select_groq_chat_models(limit=options.limit, strategy=options.strategy)
    if not selected:
        raise RuntimeError("No active Groq chat models were returned by the Models API.")
    print("Recommended GROQ_MODEL value:")
    print(f"GROQ_MODEL={','.join(selected)}")
    print("\nSelected fallback order:")
    position = 1
    for model_id in selected:
        print(f"{position}. {model_id}")
        position += 1


if __name__ == "__main__":
    main()

refdata/manual_references.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+    "Country": [
+        "Hong Kong"
+    ],
+    "Institution": [
+        "Centre Interdisciplinaire de Formation à la Fonction Personnel",
+        "Faculté de médecine Paris-Descartes",
+        "Politecnico di Milano",
+        "INSTN",
+        "Université Paris-Est Marne-la-Vallée",
+        "CEMS",
+        "Conservatoire National Supérieur d'Art Dramatique (CNSAD)",
+        "EDC Paris Business School",
+        "EFREI",
+        "Catolica Lisbon School of Business & Economics",
+        "Ecole des Mines Paris - Université PSL",
+        "École d'urbanisme de Paris",
+        "ESMOD Paris",
+        "HEAD Genève (Haute école d'art et de design)",
+        "MODART International",
+        "University of Mumbai",
+        "Oxford Brookes University",
+        "Shanghai Advanced Institute of Finance",
+        "The University of Hong Kong (HKU)",
+        "Université de Montpellier Paul Valéry",
+        "IAE Paris",
+        "The University of Sydney Business School",
+        "Korea University Business School",
+        "EHL Business School",
+        "Technische Universität Dortmund"
+    ],
+    "Continent": [],
+    "City": [
+        "Champs-sur-Marne",
+        "Croix",
+        "Hanoi",
+        "Pantin",
+        "Versailles",
+        "Mumbai",
+        "Oxford",
+        "Suresnes",
+        "Aubervilliers",
+        "Dortmund",
+        "Mannheim"
+    ],
+    "Level": {},
+    "Language": {},
+    "Tags": [
+        "Sociology",
+        "Fashion Management",
+        "Geoeconomy",
+        "Humanitarian",
+        "Sustainable Finance",
+        "Fashion",
+        "Financial Risks",
+        "Digital Creativity",
+        "International Marketing",
+        "Sales Management",
+        "Data Management",
+        "Journalism",
+        "Arts & Culture",
+        "Environmental Governance",
+        "Neurology",
+        "Dentist",
+        "Financial Control",
+        "Biodiversity",
+        "Languages",
+        "Financial Engineering",
+        "Biology"
+    ]
+}

refdata/official_references.json ADDED Viewed

	@@ -0,0 +1,650 @@

+{
+    "Country": [
+        "Afghanistan",
+        "Albania",
+        "Algeria",
+        "Andorra",
+        "Angola",
+        "Antigua and Barbuda",
+        "Argentina",
+        "Armenia",
+        "Australia",
+        "Austria",
+        "Azerbaijan",
+        "Bahamas",
+        "Bahrain",
+        "Bangladesh",
+        "Barbados",
+        "Belarus",
+        "Belgium",
+        "Belize",
+        "Benin",
+        "Bhutan",
+        "Bolivia",
+        "Bosnia and Herzegovina",
+        "Botswana",
+        "Brazil",
+        "Brunei",
+        "Bulgaria",
+        "Burkina Faso",
+        "Burundi",
+        "Cabo Verde",
+        "Cambodia",
+        "Cameroon",
+        "Canada",
+        "Central African Republic",
+        "Chad",
+        "Chile",
+        "China",
+        "Colombia",
+        "Comoros",
+        "Congo",
+        "Costa Rica",
+        "Croatia",
+        "Cuba",
+        "Cyprus",
+        "Czech Republic",
+        "Democratic Republic of the Congo",
+        "Denmark",
+        "Djibouti",
+        "Dominica",
+        "Dominican Republic",
+        "Ecuador",
+        "Egypt",
+        "El Salvador",
+        "Equatorial Guinea",
+        "Eritrea",
+        "Estonia",
+        "Eswatini",
+        "Ethiopia",
+        "Fiji",
+        "Finland",
+        "France",
+        "Gabon",
+        "Gambia",
+        "Georgia",
+        "Germany",
+        "Ghana",
+        "Greece",
+        "Grenada",
+        "Guatemala",
+        "Guinea",
+        "Guinea-Bissau",
+        "Guyana",
+        "Haiti",
+        "Honduras",
+        "Hungary",
+        "Iceland",
+        "India",
+        "Indonesia",
+        "Iran",
+        "Iraq",
+        "Ireland",
+        "Israel",
+        "Italy",
+        "Ivory Coast",
+        "Jamaica",
+        "Japan",
+        "Jordan",
+        "Kazakhstan",
+        "Kenya",
+        "Kiribati",
+        "Kosovo",
+        "Kuwait",
+        "Kyrgyzstan",
+        "Laos",
+        "Latvia",
+        "Lebanon",
+        "Lesotho",
+        "Liberia",
+        "Libya",
+        "Liechtenstein",
+        "Lithuania",
+        "Luxembourg",
+        "Madagascar",
+        "Malawi",
+        "Malaysia",
+        "Maldives",
+        "Mali",
+        "Malta",
+        "Marshall Islands",
+        "Mauritania",
+        "Mauritius",
+        "Mexico",
+        "Micronesia",
+        "Moldova",
+        "Monaco",
+        "Mongolia",
+        "Montenegro",
+        "Morocco",
+        "Mozambique",
+        "Myanmar",
+        "Namibia",
+        "Nauru",
+        "Nepal",
+        "Netherlands",
+        "New Zealand",
+        "Nicaragua",
+        "Niger",
+        "Nigeria",
+        "North Korea",
+        "North Macedonia",
+        "Norway",
+        "Oman",
+        "Pakistan",
+        "Palau",
+        "Palestine",
+        "Panama",
+        "Papua New Guinea",
+        "Paraguay",
+        "Peru",
+        "Philippines",
+        "Poland",
+        "Portugal",
+        "Qatar",
+        "Romania",
+        "Russia",
+        "Rwanda",
+        "Saint Kitts and Nevis",
+        "Saint Lucia",
+        "Saint Vincent and the Grenadines",
+        "Samoa",
+        "San Marino",
+        "Sao Tome and Principe",
+        "Saudi Arabia",
+        "Senegal",
+        "Serbia",
+        "Seychelles",
+        "Sierra Leone",
+        "Singapore",
+        "Slovakia",
+        "Slovenia",
+        "Solomon Islands",
+        "Somalia",
+        "South Africa",
+        "South Korea",
+        "South Sudan",
+        "Spain",
+        "Sri Lanka",
+        "Sudan",
+        "Suriname",
+        "Sweden",
+        "Switzerland",
+        "Syria",
+        "Taiwan",
+        "Tajikistan",
+        "Tanzania",
+        "Thailand",
+        "Timor-Leste",
+        "Togo",
+        "Tonga",
+        "Trinidad and Tobago",
+        "Tunisia",
+        "Turkey",
+        "Turkmenistan",
+        "Tuvalu",
+        "Uganda",
+        "Ukraine",
+        "United Arab Emirates",
+        "United Kingdom",
+        "United States",
+        "Uruguay",
+        "Uzbekistan",
+        "Vanuatu",
+        "Vatican City",
+        "Venezuela",
+        "Vietnam",
+        "Yemen",
+        "Zambia",
+        "Zimbabwe"
+    ],
+    "Institution": [
+        "AgroParisTech (Institut des sciences et industries du vivant et de l'environnement)",
+        "Aix Marseille Université (AMU)",
+        "Alliance Manchester Business School (The University of Manchester)",
+        "American University",
+        "Audencia Business School",
+        "Babson College",
+        "Bayes Business School (formerly Cass Business School)",
+        "Berklee College of Music",
+        "Bocconi University (Università Commerciale Luigi Bocconi)",
+        "Brandeis University",
+        "Bucharest University of Economic Studies (ASE)",
+        "Burgundy School of Business (BSB)",
+        "Cambridge Judge Business School",
+        "CELSA Sorbonne Université",
+        "CEMS (The Global Alliance in Management Education)",
+        "CentraleSupélec",
+        "CFA Dordogne Périgueux",
+        "Charité Universitätsmedizin Berlin",
+        "Charles University of Prague",
+        "CIFFOP (Université Paris 1 Panthéon Sorbonne)",
+        "Columbia University",
+        "Conservatoire National des Arts et Métiers (Le Cnam)",
+        "Conservatoire National Supérieur d'Art Dramatique (CNSAD PSL)",
+        "Copenhagen Business School (CBS)",
+        "Cornell University",
+        "Cours Florent",
+        "Coventry University",
+        "CY Cergy Paris Université",
+        "Dublin City University (DCU)",
+        "EAC (École d'Art et de Culture)",
+        "EAE Business School",
+        "École 42",
+        "École des Hautes Études en Sciences Sociales (EHESS)",
+        "École Hôtelière de Paris CFA Médéric",
+        "École Polytechnique",
+        "EDHEC Business School (École des Hautes Études Commerciales du Nord)",
+        "EFAP (École des Nouveaux Métiers de la Communication)",
+        "EM Strasbourg Business School",
+        "EMIC (École de Management des Industries Créatives)",
+        "Emlyon Business School (EM Lyon)",
+        "Epitech Digital (École pour l'informatique et les nouvelles technologies)",
+        "Erasmus University Rotterdam",
+        "ESADE Business School (Ramon Llull University)",
+        "ESCP Business School (École Supérieure de Commerce de Paris)",
+        "ESG Group",
+        "ESI Business School",
+        "ESIC Business & Marketing School",
+        "ESPI (École Supérieure des Professions Immobilières)",
+        "ESSCA School of Management (École Supérieure des Sciences Commerciales d'Angers)",
+        "ESSEC Business School",
+        "ESTICE International Business School",
+        "FERRANDI Paris",
+        "Fordham University",
+        "Frankfurt School of Finance & Management",
+        "Freie Universität Berlin",
+        "Fu Jen Catholic University",
+        "Fudan University",
+        "George Brown College",
+        "Glion Institute of Higher Education",
+        "GOBELINS Paris (l'école de l'image)",
+        "Graduate School of Management (GSOM) St. Petersburg University",
+        "Grenoble École de Management (GEM)",
+        "Hanyang University",
+        "Haute École de Joaillerie (HEJ)",
+        "HEC Lausanne (Université de Lausanne)",
+        "HEC Montréal (École des Hautes Études Commerciales de Montréal)",
+        "HEC Paris (École des Hautes Études Commerciales de Paris)",
+        "HEIP (Hautes Études Internationales et Politiques)",
+        "Hertie School",
+        "Hong Kong University of Science and Technology (HKUST)",
+        "Humboldt Universität zu Berlin",
+        "IAE Aix Marseille Graduate School of Management",
+        "IAE Lille University School of Management",
+        "IAE Paris Sorbonne Business School",
+        "IAE Paris Est (formerly IAE Gustave Eiffel)",
+        "IAE Versailles (Université Paris Saclay)",
+        "IAE Lyon School of Management",
+        "IAE Creteil",
+        "ICART (L'école du management de la culture et du marché de l'art)",
+        "IE Business School (IE University)",
+        "Institut d'études politiques (IEP)",
+        "IESEG School of Management (Institut d'Économie Scientifique et de Gestion)",
+        "IFP School (Institut Français du Pétrole)",
+        "IGS RH (Institut de Gestion Sociale Ressources Humaines)",
+        "IIM Digital School (Institut de l'Internet et du Multimédia)",
+        "Imperial College Business School",
+        "Imperial College London",
+        "IMPGT Aix Marseille Université (Institut de Management Public et Gouvernance Territoriale)",
+        "IMT Atlantique (École Nationale Supérieure des Mines de Nantes et Télécom Bretagne)",
+        "INSEAD (Institut Européen d'Administration des Affaires)",
+        "INSEEC (Institut des Hautes Études Économiques et Commerciales)",
+        "Institut Catholique de Paris (ICP)",
+        "Institut Français de la Mode (IFM)",
+        "Institut Léonard de Vinci (ILV)",
+        "Institut Lyfe (formerly Institut Paul Bocuse)",
+        "Institut Magellan",
+        "Institut Polytechnique de Paris (IP Paris)",
+        "INSTN (Institut National des Sciences et Techniques Nucléaires)",
+        "International University of Monaco (IUM)",
+        "IQS School of Management (Universitat Ramon Llull)",
+        "IRIS Sup' (Institut de Relations Internationales et Stratégiques)",
+        "ISAE SUPAERO (Institut Supérieur de l'Aéronautique et de l'Espace)",
+        "ISC Paris Business School (Institut Supérieur du Commerce de Paris)",
+        "ISCOM (Institut Supérieur de Communication et Publicité)",
+        "ISIPCA (Institut Supérieur International du Parfum de la Cosmétique et de l'Aromatique alimentaire)",
+        "IULM University (Libera Università di Lingue e Comunicazione)",
+        "Ivey Business School (Western University)",
+        "Kedge Business School",
+        "King's College London",
+        "KU Leuven",
+        "Les Roches Global Hospitality Education",
+        "London Business School (LBS)",
+        "London College of Fashion (University of the Arts London)",
+        "London Film School",
+        "London School of Economics and Political Science (LSE)",
+        "Loughborough University London",
+        "Loyola Marymount University",
+        "LUISS Guido Carli (Libera Università Internazionale degli Studi Sociali)",
+        "Lund University",
+        "MBA ESG",
+        "MGIMO University (Moscow State Institute of International Relations)",
+        "Monash University",
+        "National University of Singapore (NUS)",
+        "NEOMA Business School",
+        "New York University (NYU)",
+        "Newcastle University",
+        "Northumbria University",
+        "Nova School of Business and Economics",
+        "Paris Executive Business School (PEBS)",
+        "Paris School of Business (PSB)",
+        "Penninghen (École de direction artistique et d'architecture intérieure)",
+        "Plekhanov Russian University of Economics",
+        "POLIMI Graduate School of Management",
+        "Polytech Nice Sophia (Université Côte d'Azur)",
+        "Prague University of Economics and Business (VSE)",
+        "Queen's University",
+        "Renmin University of China",
+        "Rotterdam School of Management Erasmus University (RSM)",
+        "Royal College of Art (RCA)",
+        "Ruhr University Bochum",
+        "School of International and Public Affairs (SIPA) at Columbia University",
+        "Sciences Po (Institut d'études politiques de Paris)",
+        "Sciences Po Grenoble UGA",
+        "SDA Bocconi School of Management",
+        "Seoul National University",
+        "Shanghai Jiao Tong University",
+        "Singapore Management University (SMU)",
+        "SKEMA Business School (School of Knowledge Economy and Management)",
+        "Smith School of Business at Queen's University",
+        "Sophia University",
+        "Sorbonne Paris Nord University",
+        "Sorbonne Université",
+        "Sotheby's Institute of Art",
+        "Stockholm School of Economics (SSE)",
+        "Sup de Luxe (Institut Supérieur de Marketing du Luxe)",
+        "TBS Education (Toulouse Business School)",
+        "Technische Universität Berlin (TU Berlin)",
+        "Télécom Paris (École Nationale Supérieure des Télécommunications)",
+        "The University of Edinburgh",
+        "The University of Melbourne",
+        "The University of Sydney (USYD)",
+        "Trinity College Dublin",
+        "Tsinghua University",
+        "UCD Michael Smurfit Graduate Business School",
+        "Universidad Carlos III de Madrid (UC3M)",
+        "Universidad Europea de Madrid (UEM)",
+        "Universidade Católica Portuguesa",
+        "Université Côte d'Azur",
+        "Université de Strasbourg",
+        "Université Gustave Eiffel",
+        "Université Jean Moulin Lyon 3",
+        "Université Laval",
+        "Université Libre de Bruxelles (ULB)",
+        "Université Paris 1 Panthéon Sorbonne",
+        "Université Paris 8 Vincennes Saint Denis",
+        "Université Paris Cité",
+        "Université Paris Dauphine PSL",
+        "Université Paris Nanterre",
+        "Université Paris Panthéon Assas",
+        "Université Paris Saclay",
+        "Université Sorbonne Nouvelle (Paris 3)",
+        "Université Toulouse Capitole",
+        "University College Dublin (UCD)",
+        "University College London (UCL)",
+        "Université de Nice",
+        "University of Amsterdam (UvA)",
+        "University of California Los Angeles (UCLA)",
+        "University of Delaware",
+        "University of Freiburg (Albert Ludwigs Universität Freiburg)",
+        "University of Fribourg",
+        "University of Glasgow",
+        "University of Groningen",
+        "University of London",
+        "University of Mannheim",
+        "University of Pennsylvania (UPenn)",
+        "University of Rochester",
+        "University of Seoul",
+        "University of South Carolina",
+        "University of Southern California (USC)",
+        "University of St. Gallen (HSG)",
+        "University of the Arts London (UAL)",
+        "University of Trento",
+        "University of Warwick",
+        "Warwick Business School (WBS)",
+        "Washington University in St. Louis",
+        "WHU Otto Beisheim School of Management",
+        "WU (Vienna University of Economics and Business)",
+        "Xi'an Jiaotong Liverpool University (XJTLU)",
+        "Zhejiang A&F University (ZAFU)"
+    ],
+    "Continent": [
+        "Africa",
+        "America",
+        "Asia",
+        "Australia",
+        "Europe",
+        "Middle East",
+        "North America",
+        "Oceania",
+        "South America"
+    ],
+    "City": [
+        "Abu Dhabi",
+        "Aix-en-Provence",
+        "Amsterdam",
+        "Barcelona",
+        "Beijing",
+        "Belo Horizonte",
+        "Berlin",
+        "Bochum",
+        "Bordeaux",
+        "Boulogne-Billancourt",
+        "Brussels",
+        "Bucharest",
+        "Bulle",
+        "Cambridge",
+        "Carcavelos",
+        "Casablanca",
+        "Cergy",
+        "Cergy-Pontoise",
+        "Clayton",
+        "Copenhagen",
+        "Courbevoie",
+        "Coventry",
+        "Creteil",
+        "Dublin",
+        "Ecully",
+        "Edinburgh",
+        "Fontainebleau",
+        "Frankfurt",
+        "Fribourg",
+        "Gif-sur-Yvette",
+        "Glasgow",
+        "Grenoble",
+        "Groningen",
+        "Ho Chi Minh",
+        "Hong Kong",
+        "Jouy-en-Josas",
+        "Kingston",
+        "Lancaster",
+        "Las Vegas",
+        "Lausanne",
+        "Leuven",
+        "Lille",
+        "Lisbon",
+        "London",
+        "Los Angeles",
+        "Lund",
+        "Lyon",
+        "Madrid",
+        "Malaga",
+        "Manchester",
+        "Marne-la-Vallée",
+        "Marseille",
+        "Melbourne",
+        "Milan",
+        "Monaco",
+        "Montpellier",
+        "Montreal",
+        "Moscow",
+        "Munich",
+        "Nantes",
+        "New York",
+        "Newark",
+        "Newcastle",
+        "Newcastle upon Tyne",
+        "Nice",
+        "North Carolina",
+        "Oulu",
+        "Palaiseau",
+        "Paris",
+        "Périgueux",
+        "Philadelphia",
+        "Prague",
+        "Puteaux",
+        "Quebec",
+        "Rabat",
+        "Reims",
+        "Rochester",
+        "Rome",
+        "Rotterdam",
+        "Roubaix",
+        "Rouen",
+        "Saint Aubin",
+        "Saint Denis",
+        "Saint-Petersbourg",
+        "Seoul",
+        "Serris",
+        "Shanghai",
+        "Shenzhen",
+        "Singapore",
+        "Sophia Antipolis",
+        "St Gallen",
+        "Stockholm",
+        "Strasbourg",
+        "Suzhou",
+        "Sydney",
+        "Taipei",
+        "Tokyo",
+        "Toronto",
+        "Toulouse",
+        "Tunis",
+        "Turin",
+        "Valencia",
+        "Vallendar",
+        "Velizy",
+        "Vienne",
+        "Villetaneuse",
+        "Waltham",
+        "Warsaw",
+        "Washington",
+        "Wellesley",
+        "Zhejiang"
+    ],
+    "Level": {
+        "Master": "Master 2 (BAC+5) / Master Degree",
+        "Specialised Master": "Mastère Spécialisé/Advanced Master",
+        "Mastere": "Mastère Spécialisé/Advanced Master",
+        "MBA": "MBA",
+        "Doctorat": "Doctorat / PhD",
+        "Preparationdunconcours": "CAP",
+        "Maitrise": "Master 2 (BAC+5) / Master Degree",
+        "Licence": "Licence (BAC+3) / pengraduate",
+        "Chineselanguage": "Autre",
+        "Other kind of studies": "Autre",
+        "Maîtrise": "Master 1 (BAC+4) / Graduate",
+        "Preparation for a competitive examination": "Preparation for a competitive exam",
+        "L3": "Licence (BAC+3) / pengraduate",
+        "Bachelor": "Licence (BAC+3) / pengraduate"
+    },
+    "Language": {
+        "French": "French",
+        "Français": "French",
+        "Francais": "French",
+        "FR": "French",
+        "English": "English",
+        "Anglais": "English",
+        "EN": "English",
+        "German": "German",
+        "Deutsch": "German",
+        "Allemand": "German",
+        "DE": "German",
+        "Spanish": "Spanish",
+        "Español": "Spanish",
+        "Espagnol": "Spanish",
+        "ES": "Spanish",
+        "Chinese": "Chinese",
+        "Mandarin": "Chinese",
+        "Cantonese": "Chinese",
+        "Chinois": "Chinese",
+        "CN": "Chinese",
+        "Italian": "Italian",
+        "Italiano": "Italian",
+        "Italien": "Italian",
+        "IT": "Italian",
+        "Portuguese": "Portuguese",
+        "Português": "Portuguese",
+        "Portugues": "Portuguese",
+        "Portugais": "Portuguese",
+        "PT": "Portuguese",
+        "Japanese": "Japanese",
+        "Nihongo": "Japanese",
+        "Japonais": "Japanese",
+        "JP": "Japanese",
+        "Korean": "Korean",
+        "Hangul": "Korean",
+        "Coréen": "Korean",
+        "Coreen": "Korean",
+        "KR": "Korean",
+        "Arabic": "Arabic",
+        "Arabe": "Arabic",
+        "AR": "Arabic",
+        "Russian": "Russian",
+        "Russe": "Russian",
+        "RU": "Russian",
+        "Dutch": "Dutch",
+        "Nederlands": "Dutch",
+        "Néerlandais": "Dutch",
+        "Neerlandais": "Dutch",
+        "NL": "Dutch",
+        "Hindi": "Hindi",
+        "Indien": "Hindi"
+    },
+    "Tags":  [
+        "Accounting",
+        "Audit & Control",
+        "Architecture & Urban Planning",
+        "Arts & Cultural Management",
+        "Banking & Insurance",
+        "Business Administration & Management",
+        "Communication & Media",
+        "Consulting & Professional Services",
+        "Cybersecurity",
+        "Data Science & Analytics",
+        "Design Management",
+        "Digital Marketing",
+        "Digital Transformation & Tech Management",
+        "Economics",
+        "Economics & Social Sciences",
+        "Energy Management",
+        "Engineering Management",
+        "Entrepreneurship & Innovation",
+        "Finance & Corporate Finance",
+        "Healthcare Management",
+        "Hospitality & Culinary Management",
+        "Human Resources (HR)",
+        "Humanities & Social Sciences",
+        "Information Systems & IT Management",
+        "International Business",
+        "International Security & Defense",
+        "Law",
+        "Luxury Management",
+        "Marketing",
+        "Marketing Research & Insights",
+        "NGO & Social Impact",
+        "Organizational Psychology",
+        "Political Science & International Relations",
+        "Public & Organizational Management",
+        "Real Estate & Asset Management",
+        "Real Estate & Urban Planning",
+        "Sales & Business Development",
+        "Sports Management",
+        "Strategy & Strategic Management",
+        "Supply Chain & Purchasing",
+        "Sustainability & Environmental Management",
+        "Tourism & Leisure",
+        "Video Games & Creative Tech"
+    ]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+pandas
+openpyxl
+sentence_transformers
+torch
+tqdm
+requests
+python-dotenv
+tenacity
+flask
+gunicorn
+huggingface_hub

src/config.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import os
+import argparse
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
# --- Settings read from the environment (define them in .env) ---
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
RAW_MODELS = os.getenv("GROQ_MODEL", "")
# Split the comma-separated model string, trimming padding and dropping blanks.
AVAILABLE_MODELS = [name for name in map(str.strip, RAW_MODELS.split(",")) if name]

# --- Matching thresholds ---
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5

# --- Sheet names, directories and default file names ---
SHEET_NAME = "Data"
DATA_DIR = "data"
REFDATA_DIR = "refdata"
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"
def resolve_data_path(file_arg):
    """Map a bare file name onto the project data directory.

    Empty or ``None`` arguments, absolute paths, and paths that already
    contain a directory component are returned unchanged; only a plain file
    name is joined onto ``DATA_DIR``.
    """
    is_bare_name = (
        bool(file_arg)
        and not os.path.isabs(file_arg)
        and not os.path.dirname(file_arg)
    )
    return os.path.join(DATA_DIR, file_arg) if is_bare_name else file_arg
def resolve_ref_path(file_arg):
    """Map a bare reference file name onto the project refdata directory.

    Empty or ``None`` arguments, absolute paths, and paths that already
    contain a directory component are returned unchanged; only a plain file
    name is joined onto ``REFDATA_DIR``.
    """
    is_bare_name = (
        bool(file_arg)
        and not os.path.isabs(file_arg)
        and not os.path.dirname(file_arg)
    )
    return os.path.join(REFDATA_DIR, file_arg) if is_bare_name else file_arg
def parse_cli_args():
    """Parse the pipeline's command-line options.

    After parsing, the input and blueprint arguments are passed through
    ``resolve_data_path`` and the two reference-file arguments through
    ``resolve_ref_path``. Returns the resulting ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")

    # (flag, keyword options) in the order they appear in --help.
    option_specs = (
        ("--input", {"required": True, "help": "Raw input Excel file name inside data/"}),
        ("--blueprint", {"default": DEFAULT_BLUEPRINT_FILE, "help": "Blueprint Excel file name inside data/"}),
        ("--refs", {"default": DEFAULT_OFFICIAL_REFS_FILE, "help": "Official references JSON file name inside refdata/"}),
        ("--manual_refs", {"default": DEFAULT_MANUAL_REFS_FILE, "help": "Manual overrides JSON file name inside refdata/"}),
        ("--sheet", {"default": SHEET_NAME, "help": "Source sheet name to clean"}),
        ("--output_sheet", {"default": DEFAULT_OUTPUT_SHEET_NAME, "help": "Output sheet name to create"}),
        ("--models", {"default": "", "help": "Comma-separated Groq models to use in fallback order"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # Turn bare file names into paths under the matching project directory.
    path_resolvers = (
        ("input", resolve_data_path),
        ("blueprint", resolve_data_path),
        ("refs", resolve_ref_path),
        ("manual_refs", resolve_ref_path),
    )
    for attr_name, resolver in path_resolvers:
        setattr(args, attr_name, resolver(getattr(args, attr_name)))
    return args

src/data_pipeline.py ADDED Viewed

	@@ -0,0 +1,431 @@

+import pandas as pd
+import torch
+import re
+from collections import Counter
+from sentence_transformers import util
+from tqdm import tqdm
+# Import our pure text manipulation functions
+from src.utils import (
+    clean_degree_text,
+    normalize_text,
+    strip_degrees_for_search,
+    smart_format
+)
+from src.config import TOP_K_CANDIDATES, DEFAULT_SIMILARITY_THRESHOLD
+# ---------------------------------------------------------------------------
+# ML & CLUSTERING ENGINE
+# ---------------------------------------------------------------------------
def self_cluster_degrees(raw_degrees_list, model, school_cache, threshold=0.93):
    """Merge near-duplicate degree names within one list of raw values.

    Each string is cleaned with ``clean_degree_text``; the distinct cleaned
    names are embedded with ``model`` and any later name whose cosine
    similarity to an earlier (more frequent) name reaches ``threshold`` is
    merged into it, unless ``school_cache`` already records a decision for
    that pair.

    Args:
        raw_degrees_list: Raw degree values; non-string entries are ignored.
        model: Encoder exposing ``encode(list, convert_to_tensor=True)``.
        school_cache: Dict keyed by ``"<a>|||<b>"`` (the two names in sorted
            order) holding the name that won an earlier merge. It is read for
            previous decisions and updated in place with new auto-merges.
        threshold: Minimum cosine similarity for two names to be merged.

    Returns:
        Dict mapping each raw string to ``(final_value, match_source,
        confidence)``. Unmerged values get ``"Degree_Formatter"`` / ``"HIGH"``;
        merged values get ``"Auto-Merge"`` (with the similarity or cache note
        in parentheses when known) / ``"MEDIUM"``.
    """
    # Clean every distinct raw string once, but count every occurrence so the
    # frequency ranking below still reflects the full input.
    raw_to_clean = {}
    clean_counts = Counter()
    for raw in raw_degrees_list:
        if not isinstance(raw, str):
            continue
        if raw not in raw_to_clean:
            raw_to_clean[raw] = clean_degree_text(raw)
        clean_counts[raw_to_clean[raw]] += 1

    # Most frequent first: earlier names act as the anchor later ones merge into.
    unique_cleans = [deg for deg, _ in clean_counts.most_common() if deg]

    raw_to_meta = {}
    if len(unique_cleans) <= 1:
        # Nothing to compare against; report the cleaned text as-is.
        for raw, clean in raw_to_clean.items():
            raw_to_meta[raw] = (clean, "Degree_Formatter", "HIGH")
        return raw_to_meta

    embeddings = model.encode(unique_cleans, convert_to_tensor=True)
    clean_to_clustered = {}
    merge_info = {}  # Merged-away name -> similarity/cache note for the Blueprint

    for i, current_deg in enumerate(unique_cleans):
        if current_deg in clean_to_clustered:
            continue
        clean_to_clustered[current_deg] = current_deg
        if i + 1 >= len(unique_cleans):
            continue

        cos_scores = util.cos_sim(embeddings[i], embeddings[i + 1:])[0]
        for j, score in enumerate(cos_scores):
            target_deg = unique_cleans[i + 1 + j]
            similarity = score.item()
            if similarity >= threshold and target_deg not in clean_to_clustered:
                pair_key = f"{min(current_deg, target_deg)}|||{max(current_deg, target_deg)}"
                # We still use school_cache as a temporary runtime speedup,
                # but it is NOT saved to the json memory.
                cached_action = school_cache.get(pair_key)
                if cached_action:
                    # Record the note under the name that is actually merged
                    # away, so the final lookup below can find it.
                    if cached_action == current_deg:
                        clean_to_clustered[target_deg] = current_deg
                        merge_info[target_deg] = "Cached (Runtime)"
                    elif cached_action == target_deg:
                        clean_to_clustered[current_deg] = target_deg
                        merge_info[current_deg] = "Cached (Runtime)"
                else:
                    clean_to_clustered[target_deg] = current_deg
                    school_cache[pair_key] = current_deg
                    merge_info[target_deg] = f"{similarity*100:.1f}%"

    for raw, clean in raw_to_clean.items():
        final_val = clean
        was_merged = False
        sim_str = ""
        # Follow merge chains to their end; keep the note from the first hop.
        while final_val in clean_to_clustered and clean_to_clustered[final_val] != final_val:
            if not sim_str:
                sim_str = merge_info.get(final_val, "")
            final_val = clean_to_clustered[final_val]
            was_merged = True

        conf = "MEDIUM" if was_merged else "HIGH"
        if was_merged:
            src = f"Auto-Merge ({sim_str})" if sim_str else "Auto-Merge"
        else:
            src = "Degree_Formatter"
        raw_to_meta[raw] = (final_val, src, conf)
    return raw_to_meta
def cluster_degrees_by_institution(df, degree_col, inst_col, model, master_cache, blueprint_data, threshold=0.93):
    """Cluster degree names separately for each institution and write them back.

    Adds a ``Cleaned_<degree_col>`` column to ``df`` (initially a copy of
    ``degree_col``), runs ``self_cluster_degrees`` on the non-null degrees of
    every institution, stores the clustered value per row, and appends a
    Blueprint entry to ``blueprint_data`` for every row whose value changed or
    whose confidence is not "HIGH".

    Args:
        df: DataFrame to update in place.
        degree_col: Name of the column holding raw degree text.
        inst_col: Name of the column holding the institution.
        model: Embedding model forwarded to ``self_cluster_degrees``.
        master_cache: Dict; per-school decisions live under its
            "Degree_Decisions" key, which is created when missing.
        blueprint_data: List that receives the Blueprint row dicts.
        threshold: Similarity threshold forwarded to ``self_cluster_degrees``.

    Returns:
        The same ``df`` object.
    """
    print(f"\n[INFO] Auto-Clustering '{degree_col}'. (Merges will be logged to Blueprint...)")

    cleaned_col_name = f'Cleaned_{degree_col}'
    df[cleaned_col_name] = df[degree_col].copy()

    unique_schools = df[inst_col].dropna().unique()
    decisions_by_school = master_cache.setdefault("Degree_Decisions", {})
    has_degree = df[degree_col].notna()

    # 1. Build one raw -> (value, source, confidence) mapping per school.
    school_mappings = {}
    for school in tqdm(unique_schools, desc=f"Mapping {degree_col}s by Institution"):
        rows_for_school = (df[inst_col] == school) & has_degree
        raw_degs = df.loc[rows_for_school, degree_col].astype(str).tolist()
        if raw_degs:
            school_cache = decisions_by_school.setdefault(school, {})
            school_mappings[school] = self_cluster_degrees(raw_degs, model, school_cache, threshold)

    # 2. Apply the mappings row by row and log notable changes.
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Applying & Logging {degree_col}s"):
        school = row[inst_col]
        raw_deg = str(row[degree_col])
        if pd.isna(row[degree_col]) or school not in school_mappings:
            continue

        mapping_data = school_mappings[school].get(raw_deg)
        if not mapping_data:
            continue

        final_val, src, conf = mapping_data
        df.at[idx, cleaned_col_name] = final_val

        text_changed = raw_deg.strip() != final_val.strip()
        if text_changed or conf != "HIGH":
            blueprint_data.append({
                # NOTE(review): the +3 offset presumably maps the index to a
                # spreadsheet row number; confirm against the workbook layout.
                "Row_Index": idx + 3,
                "Column": degree_col,
                "Original_Raw_Text": raw_deg,
                "AI_Suggested_Match": final_val,
                "Human_Override": "",
                "Confidence": conf,
                "Match_Source": src
            })
    return df
def get_deterministic_match(value, combined_valid_targets):
    """Return the first target that ``value`` matches by a fixed rule, or None.

    Two rules are tried in order over the normalized text:
      1. the target starts with the value, followed by a word boundary or one
         of whitespace, "(", "/", "\\", "-";
      2. the target contains the value, upper-cased, wrapped in parentheses.

    A value that normalizes to an empty string never matches: without this
    guard the prefix pattern degenerates to ``^(\\b|...)`` and would accept
    the first target that begins with a word character.

    Args:
        value: Text to look up.
        combined_valid_targets: Iterable of candidate target strings.

    Returns:
        The matching target exactly as given, or None.
    """
    val_clean = normalize_text(value)
    if not val_clean:
        return None

    prefix_pattern = re.compile(rf"^{re.escape(val_clean)}(\b|[\s\(\/\\\-])")

    # Rule 1: prefix match. Normalized targets are kept for rule 2 so each
    # target is normalized only once.
    normalized_targets = []
    for target in combined_valid_targets:
        target_clean = normalize_text(target)
        normalized_targets.append((target, target_clean))
        if prefix_pattern.search(target_clean):
            return target

    # Rule 2: value appears as a parenthesized token inside the target.
    bracketed = f"({val_clean.upper()})"
    for target, target_clean in normalized_targets:
        if bracketed in target_clean.upper():
            return target
    return None
def get_top_candidates(model, value, combined_valid_targets, reference_embeddings, k=5):
    """Return up to ``k`` targets ranked by cosine similarity to ``value``.

    ``value`` is encoded with ``model`` and compared against
    ``reference_embeddings``; the entries of ``combined_valid_targets`` at the
    best-scoring positions are returned, best first. An empty target list
    yields an empty list without touching the model.
    """
    if not combined_valid_targets:
        return []

    query_embedding = model.encode(value, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, reference_embeddings)[0]

    # Never request more results than there are targets.
    limit = min(k, len(combined_valid_targets))
    _, best_positions = torch.topk(scores, limit)
    return [combined_valid_targets[position] for position in best_positions.tolist()]
def get_dict_exact_match(value, combined_dict):
    """Look up ``value`` in an alias -> canonical dict by normalized equality.

    Aliases (keys) are checked first, in dict order; if none matches, the
    canonical values are checked. Returns the canonical value of the first
    hit, or None when nothing matches.
    """
    wanted = normalize_text(value)

    # A sentinel keeps a matching alias with a falsy canonical value distinct
    # from "no alias matched".
    no_hit = object()
    via_alias = next(
        (canonical for alias, canonical in combined_dict.items() if normalize_text(alias) == wanted),
        no_hit,
    )
    if via_alias is not no_hit:
        return via_alias

    return next(
        (canonical for canonical in combined_dict.values() if normalize_text(canonical) == wanted),
        None,
    )
def get_dict_rule_match(value, combined_dict):
    """Rule-match ``value`` against an alias -> canonical dict.

    The aliases (keys) are tried first with ``get_deterministic_match``; a hit
    returns that alias's canonical value. Otherwise the distinct canonical
    values are tried and a hit is returned directly. Returns None when
    neither attempt matches.
    """
    matched_alias = get_deterministic_match(value, list(combined_dict))
    if matched_alias:
        return combined_dict[matched_alias]

    # De-duplicate canonical values while keeping their first-seen order.
    distinct_canonicals = list(dict.fromkeys(combined_dict.values()))
    return get_deterministic_match(value, distinct_canonicals) or None
def as_reference_list(ref_data):
    """Coerce reference data into a list of targets.

    A list is returned as-is (same object); a dict yields its distinct values
    in first-seen order; anything else yields an empty list.
    """
    if isinstance(ref_data, list):
        return ref_data
    if not isinstance(ref_data, dict):
        return []
    # Dict keys give order-preserving de-duplication of the canonical values.
    return list({canonical: None for canonical in ref_data.values()})
def as_reference_dict(ref_data):
    """Coerce reference data into an alias -> canonical dict.

    A dict is returned as-is (same object); a list becomes an identity
    mapping over its string items (non-strings are dropped); anything else
    yields an empty dict.
    """
    if isinstance(ref_data, dict):
        return ref_data
    if not isinstance(ref_data, list):
        return {}
    names = [entry for entry in ref_data if isinstance(entry, str)]
    return dict(zip(names, names))
def update_match_postfix(progress, source_counts):
    """Show the Exact/Rule/LLM match counters as the progress bar's postfix.

    Reads the three counters from ``source_counts`` and passes them to
    ``progress.set_postfix`` with ``refresh=False``.
    """
    displayed_sources = ("Exact_Match", "Rule_Match", "LLM_Judged")
    postfix = {label: source_counts[label] for label in displayed_sources}
    progress.set_postfix(postfix, refresh=False)
def match_cache_key(column_name, value):
    """Build the one cache-key format shared by matching and reconstruction.

    For the "Institution" and "Degree" columns the value first goes through
    ``strip_degrees_for_search``. Trailing periods are then removed and the
    result is passed through ``normalize_text``.
    """
    needs_degree_strip = column_name in ("Institution", "Degree")
    searchable = strip_degrees_for_search(value) if needs_degree_strip else value
    without_trailing_dots = str(searchable).rstrip(".")
    return normalize_text(without_trailing_dots)
def append_unique_cleaned_part(cleaned_parts, value):
    """Append the comma-separated pieces of ``value`` not already present.

    Presence is decided on the ``normalize_text`` form of each piece, compared
    against every comma-separated piece already in ``cleaned_parts``. New
    pieces are appended (stripped) to ``cleaned_parts`` in place.

    Returns:
        True if at least one piece was appended, otherwise False.
    """
    # Normalized keys of everything already collected (empty keys ignored).
    known_keys = set()
    for existing_value in cleaned_parts:
        pieces = str(existing_value).split(",")
        known_keys.update(filter(None, (normalize_text(piece.strip()) for piece in pieces)))

    size_before = len(cleaned_parts)
    for piece in str(value).split(","):
        candidate = piece.strip()
        if not candidate:
            continue
        candidate_key = normalize_text(candidate)
        if candidate_key not in known_keys:
            known_keys.add(candidate_key)
            cleaned_parts.append(candidate)
    return len(cleaned_parts) > size_before
+# ---------------------------------------------------------------------------
+# CORE DATA PIPELINE
+# ---------------------------------------------------------------------------
+def process_column(df, column_name, model, groq_router, official_refs, manual_refs, master_cache, split_pattern, blueprint_data):
+    if column_name not in df.columns: return df
+    core_data = official_refs.get(column_name, [])
+    added_data = manual_refs.get(column_name, [])
+    if column_name not in master_cache: master_cache[column_name] = {}
+    detailed_cache = {}
+    is_dict_mode = isinstance(core_data, dict)
+    def get_updated_embeddings():
+        if is_dict_mode:
+            c_dict = {**as_reference_dict(core_data), **as_reference_dict(added_data)}
+            c_keys = list(c_dict.keys())
+            u_vals = list(set(c_dict.values()))
+            k_emb = model.encode(c_keys, convert_to_tensor=True) if c_keys else None
+            v_emb = model.encode(u_vals, convert_to_tensor=True) if u_vals else None
+            return c_dict, c_keys, k_emb, u_vals, v_emb
+        else:
+            comb = as_reference_list(core_data) + as_reference_list(added_data)
+            comb = list(dict.fromkeys(item for item in comb if isinstance(item, str) and item.strip()))
+            emb = model.encode(comb, convert_to_tensor=True) if comb else None
+            return None, comb, emb, None, None
+    combined_dict, combined_valid_targets, reference_embeddings, unique_values, value_embeddings = get_updated_embeddings()
+    if is_dict_mode and not combined_dict:
+        raise ValueError(f"No dictionary references loaded for '{column_name}'. Refusing to call Groq for every value.")
+    if not is_dict_mode and not combined_valid_targets:
+        raise ValueError(f"No list references loaded for '{column_name}'. Refusing to call Groq for every value.")
+    uniques = set()
+    for cell in df[column_name].dropna():
+        for p in re.split(split_pattern, str(cell), flags=re.IGNORECASE):
+            if p.strip(): uniques.add(p.strip())
+    print(f"\n[INFO] Analyzing {len(uniques)} unique entities in '{column_name}'...")
+    source_counts = Counter()
+    progress = tqdm(sorted(uniques, key=normalize_text), desc=f"Cleaning {column_name}")
+    for word in progress:
+        word_clean = match_cache_key(column_name, word)
+        # 1. Check Memory Cache
+        if word_clean in master_cache[column_name]:
+            detailed_cache[word_clean] = {"val": master_cache[column_name][word_clean], "src": "Memory_Cache", "conf": "HIGH"}
+            source_counts["Memory_Cache"] += 1
+            update_match_postfix(progress, source_counts)
+            continue
+        # 2. Check Exact Targets
+        if is_dict_mode:
+            exact = get_dict_exact_match(word, combined_dict)
+        else:
+            exact = next((k for k in combined_valid_targets if normalize_text(k) == normalize_text(word_clean)), None) if combined_valid_targets else None
+        if exact:
+            val = exact
+            detailed_cache[word_clean] = {"val": val, "src": "Exact_Match", "conf": "HIGH"}
+            source_counts["Exact_Match"] += 1
+            update_match_postfix(progress, source_counts)
+            continue
+        # 3. Deterministic / Rule Match
+        if is_dict_mode:
+            suggested_match = get_dict_rule_match(word, combined_dict)
+        else:
+            suggested_match = get_deterministic_match(word, combined_valid_targets) if combined_valid_targets else None
+        if suggested_match:
+            detailed_cache[word_clean] = {"val": suggested_match, "src": "Rule_Match", "conf": "HIGH"}
+            source_counts["Rule_Match"] += 1
+            update_match_postfix(progress, source_counts)
+            continue
+        # 4. LLM API Match
+        candidates = []
+        if is_dict_mode:
+            cand_keys = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)
+            cand_vals = get_top_candidates(model, word, unique_values, value_embeddings)
+            candidates = list(dict.fromkeys(cand_keys + cand_vals))[:TOP_K_CANDIDATES]
+        else:
+            candidates = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)
+        # Call the router instance
+        ans_val, src, conf = groq_router.ask_judge(word, candidates, column_name)
+        source_counts[src] += 1
+        update_match_postfix(progress, source_counts)
+        # Process every valid string, regardless of confidence (skip if API crashed)
+        if "API_Error" not in conf and ans_val != "UNKNOWN" and ans_val != "LLM_Failed":
+            llm_parts = [p.strip() for p in ans_val.split(",")]
+            corrected_parts = []
+            all_matched = True  # Flag to track if every piece exists in our data
+            for part in llm_parts:
+                if is_dict_mode:
+                    exact_match = get_dict_exact_match(part, combined_dict)
+                    if exact_match:
+                        corrected_parts.append(exact_match)
+                    else:
+                        rule_match = get_dict_rule_match(part, combined_dict)
+                        if rule_match:
+                            corrected_parts.append(rule_match)
+                        else:
+                            corrected_parts.append(part)
+                            all_matched = False
+                else:
+                    # 1. Exact Match Check (Case-insensitive)
+                    exact_match = next((c for c in candidates if c.lower() == part.lower()), None)
+                    if exact_match:
+                        corrected_parts.append(exact_match)
+                    else:
+                        # 2. Rule-Based Match Check
+                        rule_match = get_deterministic_match(part, candidates)
+                        if rule_match:
+                            corrected_parts.append(rule_match)
+                        else:
+                            # 3. No match in dictionary. Keep LLM's version, but flag that we couldn't verify it.
+                            corrected_parts.append(part)
+                            all_matched = False
+            # Remove duplicates while preserving the exact order
+            unique_parts = list(dict.fromkeys(corrected_parts))
+            # Glue it back together
+            ans_val = ", ".join(unique_parts)
+            raw_parts_for_check = [
+                p.strip()
+                for p in re.split(split_pattern, str(word))
+                if p.strip()
+            ] or [word]
+            raw_lookup_keys = {normalize_text(part) for part in raw_parts_for_check}
+            verified_lookup_keys = {normalize_text(part) for part in unique_parts}
+            # Only upgrade when the LLM answer was verified against the refs and
+            # also matches the original text directly. Otherwise it is still an
+            # LLM judgment, even if the chosen answer exists in the references.
+            if all_matched and verified_lookup_keys.issubset(raw_lookup_keys):
+                conf = "HIGH"
+                src = "LLM_Rule_Verified"
+        # -----------------------------------------------------
+        detailed_cache[word_clean] = {"val": ans_val, "src": src, "conf": conf}
+    # Reconstruct cells and capture low/medium confidence matches for the Blueprint
+    for idx, row in df.iterrows():
+        cell_val = row[column_name]
+        if pd.isna(cell_val): continue
+        raw_parts = [p.strip() for p in re.split(split_pattern, str(cell_val)) if p.strip()]
+        cleaned_parts = []
+        lowest_conf = "HIGH"
+        cell_sources = []
+        i = 0
+        while i < len(raw_parts):
+            curr = raw_parts[i]
+            # Check for combined pairs (e.g., "University of, Manchester" split by mistake)
+            if i + 1 < len(raw_parts):
+                combo_clean = match_cache_key(column_name, f"{curr}, {raw_parts[i+1]}")
+                if combo_clean in detailed_cache:
+                    append_unique_cleaned_part(cleaned_parts, detailed_cache[combo_clean]["val"])
+                    if detailed_cache[combo_clean]["conf"] != "HIGH": lowest_conf = detailed_cache[combo_clean]["conf"]
+                    cell_sources.append(detailed_cache[combo_clean]["src"])
+                    i += 2; continue
+            p_clean = match_cache_key(column_name, curr)
+            if len(p_clean) <= 1: i += 1; continue
+            cache_hit = detailed_cache.get(p_clean)
+            if cache_hit:
+                append_unique_cleaned_part(cleaned_parts, cache_hit["val"])
+                if cache_hit["conf"] != "HIGH": lowest_conf = cache_hit["conf"]
+                cell_sources.append(cache_hit["src"])
+            else:
+                append_unique_cleaned_part(cleaned_parts, smart_format(curr))
+                lowest_conf = "LOW"
+                cell_sources.append("Fallback_Format")
+            i += 1
+        final_stitched_val = ", ".join(cleaned_parts)
+        df.at[idx, f'Cleaned_{column_name}'] = final_stitched_val
+        # Log EVERY change made to the Excel file, plus any low/medium confidence guesses
+        if str(cell_val).strip() != final_stitched_val.strip() or lowest_conf != "HIGH":
+            blueprint_data.append({
+                "Row_Index": idx + 3,
+                "Column": column_name,
+                "Original_Raw_Text": cell_val,
+                "AI_Suggested_Match": final_stitched_val,
+                "Human_Override": "",
+                "Match_Source": " | ".join(set(cell_sources)),
+                "Confidence": lowest_conf
+            })
+    return df

src/llm_router.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import requests
+import time
+from tqdm import tqdm
+from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
class RateLimitException(Exception):
    """Raised when the Groq endpoint answers a request with HTTP 429."""
class GroqRouter:
    """Send normalization prompts to the Groq chat API with model fallback.

    Models in ``available_models`` are tried in order. Once a model keeps
    answering HTTP 429 after the retry budget is spent, the router advances to
    the next model and never goes back to an exhausted one.

    Every call to :meth:`ask_judge` returns a ``(value, source, confidence)``
    tuple; on any failure ``value`` is the untouched input word.
    """

    API_URL = "https://api.groq.com/openai/v1/chat/completions"

    def __init__(self, api_key, available_models):
        self.api_key = api_key
        self.available_models = available_models
        # Index of the model currently in use; only ever moves forward.
        self.current_model_index = 0
        # Remembered so the "routing to model" notice is printed once per model.
        self.last_printed_model = None

    @staticmethod
    def _specific_rules(column_name):
        """Return the column-specific rule lines inserted into the prompt."""
        if column_name in ["Institution", "Degree"]:
            return (
                "- Split distinct separate schools or global alliances with a comma (e.g., 'Harvard & MIT' -> 'Harvard University, MIT').\n"
                "- Treat alliances like CEMS as distinct institutions! NEVER delete them.\n"
                "- Do NOT split a Business School and its Parent University. They are ONE entity.\n"
                "- Do NOT split a school and its acronym/alias. Keep them as one entity.\n"
                "- Watch out for joint programs! If a hyphen separates TWO truly distinct institutions, split them.\n"
                "- Do NOT split single names with hyphens (e.g., 'Paris-Descartes').\n"
                "- Strip out ONLY standard degree titles (e.g., 'MSc', 'MBA', 'BBA', 'Ph.D.')."
            )
        if column_name in ["Country", "Continent"]:
            return (
                f"- MUST translate the {column_name} name to its standard ENGLISH equivalent (e.g., 'Espagne' -> 'Spain', 'Milano' -> 'Milan').\n"
                f"- If the input contains multiple distinct {column_name}s, MUST split them into a comma-separated list.\n"
                f"- If it is a single {column_name}, return exactly ONE official English name."
            )
        return (
            "- If the input contains multiple distinct entities, MUST split them into a comma-separated list.\n"
            "- If it is a single entity, return exactly ONE official name. No aliases."
        )

    @staticmethod
    def _build_payload(word, candidates, column_name):
        """Build the chat-completion request body (without the ``model`` key)."""
        system_msg = (
            "You are a data normalization expert. Your task is to map raw input to the MOST ACCURATE "
            "official name from a provided list of candidates or your own world knowledge. "
            "Respond ONLY with the final string. No conversational text, no quotes."
        )
        user_msg = (
            f"Column: {column_name}\n"
            f"Raw Input: '{word}'\n"
            f"Database Candidates: {candidates}\n\n"
            f"Rules:\n"
            f"1. Choose the best match from the Database Candidates if possible.\n"
            f"2. If an entity from the Raw Input does NOT exist in the Candidates list, DO NOT map it to a different candidate just because they share a city or country. Instead, return its cleaned original name exactly as written.\n"
            f"{GroqRouter._specific_rules(column_name)}\n"
            f"3. If gibberish, return 'UNKNOWN'."
        )
        return {
            "messages": [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
            "temperature": 0.0,
            "max_tokens": 50
        }

    @staticmethod
    def _interpret_answer(word, raw_answer):
        """Turn the model's raw completion text into a result tuple.

        Surrounding whitespace and quotes are removed. An empty completion, or
        any casing of ``UNKNOWN`` (optionally followed by periods), is reported
        as ``LLM_Failed`` with the original word instead of being accepted as a
        value.
        """
        answer = (raw_answer or "").strip().strip('"\'').strip()
        if not answer or answer.rstrip(".").upper() == "UNKNOWN":
            return (word, "LLM_Failed", "LOW")
        return (answer, "LLM_Judged", "MEDIUM")

    def ask_judge(self, word, candidates, column_name):
        """Ask the active model to normalize ``word`` for ``column_name``.

        Args:
            word: Raw text to normalize.
            candidates: Reference values offered to the model in the prompt.
            column_name: Column the value belongs to; selects the rule set.

        Returns:
            ``(value, source, confidence)``. ``source`` is ``LLM_Judged`` or
            ``LLM_Failed`` for a completed call, ``No_API_Key`` when no key is
            configured, or an ``API_Error_*`` label when the request failed.
        """
        if self.current_model_index >= len(self.available_models):
            return (word, "API_Error_All_Models_Dead", "LOW")
        if not self.api_key:
            return (word, "No_API_Key", "LOW")

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        payload = self._build_payload(word, candidates, column_name)

        # Only HTTP 429 is retried; other statuses are returned to the caller.
        @retry(
            retry=retry_if_exception_type(RateLimitException),
            wait=wait_exponential(multiplier=2, min=2, max=30),
            stop=stop_after_attempt(5),
            reraise=True
        )
        def fire_request():
            res = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)
            if res.status_code == 429:
                raise RateLimitException("Model Limits Exhausted")
            return res

        while self.current_model_index < len(self.available_models):
            active_model = self.available_models[self.current_model_index]
            payload["model"] = active_model
            if active_model != self.last_printed_model:
                tqdm.write(f"\n[INFO] Routing requests to model: {active_model}")
                self.last_printed_model = active_model
            try:
                time.sleep(0.3)
                response = fire_request()
                if response.status_code != 200:
                    tqdm.write(f"  [!] Groq API Error: {response.status_code} for '{word}'")
                    return (word, f"API_Error_{response.status_code}", "LOW")
                raw_answer = response.json()["choices"][0]["message"]["content"]
                return self._interpret_answer(word, raw_answer)
            except RateLimitException:
                # Retry budget spent on this model: advance and loop again.
                tqdm.write(f"  [!] Limits exhausted for {active_model}!")
                self.current_model_index += 1
                if self.current_model_index < len(self.available_models):
                    next_model = self.available_models[self.current_model_index]
                    tqdm.write(f"  [!] Switching to fallback model: {next_model}...")
                else:
                    tqdm.write(f"  [!] FATAL: ALL {len(self.available_models)} MODELS DEAD. Skipping remainder of dataset.")
                    return (word, "API_Error_All_Models_Dead", "LOW")
            except Exception as e:
                # The label says "Timeout" but this also covers connection
                # errors and malformed response bodies; kept for compatibility.
                tqdm.write(f"  [!] Request failed for '{word}' on {active_model}: {e}")
                return (word, "API_Error_Timeout", "LOW")
        return (word, "API_Error_Unhandled", "LOW")

src/process_runner.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import codecs
+import json
+import os
+import signal
+import subprocess
+from pathlib import Path
# Registry of running child processes, keyed by job id.
ACTIVE_PROCESSES = {}


def stop_process(job_id: str) -> bool:
    """Ask the process registered under ``job_id`` to stop.

    Returns:
        ``False`` when no process is registered for the id or it has already
        exited; ``True`` once a stop request has been sent.
    """
    proc = ACTIVE_PROCESSES.get(job_id)
    if not proc:
        return False
    if proc.poll() is not None:
        return False

    on_windows = os.name == "nt"
    try:
        if on_windows:
            # CTRL_BREAK_EVENT exists only on Windows, so it is referenced
            # exclusively on this branch.
            proc.send_signal(signal.CTRL_BREAK_EVENT)
        else:
            proc.terminate()
    except Exception:
        # Fall back to a plain terminate if the preferred signal failed.
        proc.terminate()
    return True
def stream_process(command, cwd: Path, job_id=None):
    """Run ``command`` and yield its combined output as server-sent events.

    stdout and stderr are merged and decoded incrementally as UTF-8 (invalid
    bytes are replaced). Each decoded chunk is yielded as a ``data:`` event
    whose payload is a JSON string. After the process exits, a final line with
    the exit code is emitted, followed by a ``done`` (exit code 0) or
    ``failed`` event.

    Args:
        command: Argument list (or string) passed to ``subprocess.Popen``.
        cwd: Working directory for the child process.
        job_id: Optional key under which the process is registered in
            ``ACTIVE_PROCESSES`` while it is being streamed.

    Yields:
        SSE-formatted strings, each terminated by a blank line.
    """
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"
    popen_kwargs = {
        "cwd": cwd,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
        "bufsize": 0,
        "env": env,
    }
    if os.name == "nt":
        popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP
    process = subprocess.Popen(command, **popen_kwargs)
    if job_id:
        ACTIVE_PROCESSES[job_id] = process
    try:
        assert process.stdout is not None
        decoder = codecs.getincrementaldecoder("utf-8")("replace")
        while True:
            # With bufsize=0 the pipe is unbuffered, so this returns whatever
            # is currently available (up to 4096 bytes) instead of waiting for
            # a full buffer. Output stays live without one event per byte.
            raw_chunk = process.stdout.read(4096)
            if not raw_chunk:
                # EOF: every writer closed the pipe, so no more output can
                # arrive. Leave the loop and wait for the exit code instead of
                # spinning on empty reads while the child is still alive.
                break
            chunk = decoder.decode(raw_chunk)
            if chunk:
                yield f"data: {json.dumps(chunk)}\n\n"
        exit_code = process.wait()
        trailing_chunk = decoder.decode(b"", final=True)
        if trailing_chunk:
            yield f"data: {json.dumps(trailing_chunk)}\n\n"
        yield f"data: {json.dumps(chr(10) + f'Process exited with code {exit_code}' + chr(10))}\n\n"
        event_name = "done" if exit_code == 0 else "failed"
        yield f"event: {event_name}\ndata: {{}}\n\n"
    finally:
        # NOTE(review): if the consumer stops iterating early the child is
        # unregistered here but not terminated; confirm that is intended.
        if job_id:
            ACTIVE_PROCESSES.pop(job_id, None)

src/utils.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import os
+from pathlib import Path
+import re
+import unicodedata
def strip_degrees_for_search(text):
    """Remove degree titles from ``text`` to leave a cleaner search string.

    Non-string input is returned unchanged. If removing the degree words
    leaves nothing, the original text (stripped) is returned instead.
    """
    if not isinstance(text, str):
        return text
    degree_pattern = r'\b(MSc|MBA|BBA|BSc|Ph\.?D\.?|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters|Bachelors|Licence)\b'
    without_degrees = re.sub(degree_pattern, '', text, flags=re.IGNORECASE)
    single_spaced = re.sub(r'\s+', ' ', without_degrees)
    # Drop separators left dangling at either end after the removal.
    trimmed = single_spaced.strip(' -.,&/|')
    return trimmed or text.strip()
def smart_format(text):
    """Title-case ``text`` while keeping known acronyms in their usual form.

    Non-string input is returned unchanged. After title-casing, whole-word
    acronyms are upper-cased, ``PHD``/``BSC``/``MSC`` are rewritten to
    ``PhD``/``BSc``/``MSc``, and the letter following ``L'``, ``D'`` or
    ``Qu'`` is lower-cased.
    """
    if not isinstance(text, str):
        return text

    def shout(match):
        return match.group(0).upper()

    formatted = text.title()
    for acronym in ('Ma', 'Ba', 'Mba', 'Bba', 'Hr', 'It', 'Bs', 'Ms', 'Phd', 'Bsc', 'Msc', 'Llm', 'Pge', 'Cems'):
        formatted = re.sub(rf'\b{acronym}\b', shout, formatted)
    for shouted, preferred in (("PHD", "PhD"), ("BSC", "BSc"), ("MSC", "MSc")):
        formatted = formatted.replace(shouted, preferred)
    formatted = re.sub(
        r"\b(L|D|Qu)'([A-Z])",
        lambda m: f"{m.group(1)}'{m.group(2).lower()}",
        formatted,
    )
    return formatted.strip()
def clean_degree_text(text):
    """Normalize a degree label and format it with ``smart_format``.

    The words "and" / "et" become ``&``, characters other than word
    characters, whitespace, ``-``, ``&``, ``+`` and ``'`` are replaced by
    spaces, and whitespace runs are collapsed. Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text
    # Ordered substitutions; each entry is (pattern, replacement, flags).
    steps = (
        (r'\band\b', '&', re.IGNORECASE),
        (r'\bet\b', '&', re.IGNORECASE),
        (r'[^\w\s\-&\+\']', ' ', 0),
    )
    for pattern, replacement, flags in steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return smart_format(cleaned)
def normalize_text(text):
    """Return ``text`` without combining marks, stripped and lower-cased.

    Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    # Category "Mn" covers the combining marks split off by NFD.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip().lower()
def normalize_ref(value):
    """Return the ``normalize_text`` form of ``str(value)``."""
    as_text = str(value)
    return normalize_text(as_text)
def iter_ref_values(ref_data):
    """Yield every string held in a reference bucket.

    For a dict, string keys are yielded first, then string values. For a
    list, its string items are yielded. Any other input yields nothing.
    """
    if isinstance(ref_data, dict):
        for pool in (ref_data.keys(), ref_data.values()):
            for entry in pool:
                if isinstance(entry, str):
                    yield entry
    elif isinstance(ref_data, list):
        for entry in ref_data:
            if isinstance(entry, str):
                yield entry
def ref_contains(ref_data, value):
    """Return True if any string in ``ref_data`` normalizes to the same key as ``value``."""
    target = normalize_ref(value)
    for entry in iter_ref_values(ref_data):
        if normalize_ref(entry) == target:
            return True
    return False
def prune_manual_refs_against_official(manual_refs, official_refs):
    """Drop manual reference entries that are redundant, in place.

    For every column in ``manual_refs`` an entry is removed when its
    normalized form is empty, repeats an earlier entry of the same column, or
    already appears among the official references for that column. List
    buckets also drop non-string items. Dict buckets are compared on their
    value (or on the alias when the value is not a string), and surviving
    aliases are stored under their normalized form. Buckets that are neither
    list nor dict are left untouched.

    Args:
        manual_refs: Mapping of column name to list or dict bucket; mutated.
        official_refs: Mapping of column name to the official bucket.

    Returns:
        The number of entries removed.
    """
    removed_count = 0
    for column_name, manual_bucket in list(manual_refs.items()):
        official_bucket = official_refs.get(column_name, [])
        if not isinstance(manual_bucket, (list, dict)):
            continue

        # Normalize the official bucket once per column rather than once per
        # manual entry; membership is then a constant-time set lookup.
        official_keys = {normalize_ref(item) for item in iter_ref_values(official_bucket)}
        seen = set()

        if isinstance(manual_bucket, list):
            kept = []
            for value in manual_bucket:
                key = normalize_ref(value) if isinstance(value, str) else ""
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue
                seen.add(key)
                kept.append(value)
        else:
            kept = {}
            for alias, value in manual_bucket.items():
                candidate = value if isinstance(value, str) else alias
                key = normalize_ref(candidate)
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue
                seen.add(key)
                kept[normalize_ref(alias)] = value
        manual_refs[column_name] = kept
    return removed_count
MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"


def reference_sync_status():
    """Report whether manual references can be synced to the hosting Space.

    Reads the ``SPACE_ID`` and ``HF_TOKEN`` environment variables.

    Returns:
        A dict with ``enabled`` (bool), ``space_id`` (str) and ``reason``
        (empty when enabled, otherwise why sync is unavailable).
    """
    space_id = os.getenv("SPACE_ID", "")
    if not space_id:
        reason = "Reference sync is only available on Hugging Face Spaces."
    elif not os.getenv("HF_TOKEN"):
        reason = "HF_TOKEN secret is missing from this Space."
    else:
        reason = ""
    return {
        "enabled": not reason,
        "space_id": space_id,
        "reason": reason,
    }
def save_manual_references_to_hub(app_root: Path):
    """Upload the local manual references file to the Space repository.

    Args:
        app_root: Directory that contains ``MANUAL_REFERENCES_REPO_PATH``.

    Returns:
        A dict with the target ``space_id``, the repository ``path`` and the
        stringified upload result under ``commit_url``.

    Raises:
        RuntimeError: Sync is unavailable (per ``reference_sync_status``) or
            ``huggingface_hub`` cannot be imported.
        FileNotFoundError: The manual references file does not exist.
    """
    sync = reference_sync_status()
    if not sync["enabled"]:
        raise RuntimeError(sync["reason"])

    local_file = app_root / MANUAL_REFERENCES_REPO_PATH
    if not local_file.is_file():
        raise FileNotFoundError(f"Manual references file not found: {local_file}")

    # Imported lazily so the module loads without the optional dependency.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    target_space = sync["space_id"]
    hub = HfApi(token=os.environ["HF_TOKEN"])
    upload_result = hub.upload_file(
        path_or_fileobj=str(local_file),
        path_in_repo=MANUAL_REFERENCES_REPO_PATH,
        repo_id=target_space,
        repo_type="space",
        commit_message="Update manual references",
    )
    return {
        "space_id": target_space,
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": str(upload_result),
    }

src/workbook_io.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import uuid
+from pathlib import Path
+import openpyxl
+from werkzeug.utils import secure_filename
ALLOWED_EXCEL_EXTENSIONS = (".xlsx", ".xlsm")


def save_uploaded_excel(uploaded, upload_dir: Path):
    """Store an uploaded Excel workbook under a unique, sanitized name.

    Args:
        uploaded: Upload object exposing ``filename`` and ``save(path)``.
        upload_dir: Directory the file is written to.

    Returns:
        ``(saved_filename, destination)``: the generated file name and its
        full path.

    Raises:
        ValueError: Nothing was uploaded, or the file is not ``.xlsx``/``.xlsm``.
    """
    if not uploaded or not uploaded.filename:
        raise ValueError("No file uploaded.")
    filename = secure_filename(uploaded.filename)
    if not filename.lower().endswith(ALLOWED_EXCEL_EXTENSIONS):
        # secure_filename drops non-ASCII characters, so a name such as
        # "数据.xlsx" collapses to "xlsx" and loses its extension. Recover the
        # extension from the raw name and fall back to a generic stem rather
        # than rejecting a valid workbook.
        raw_suffix = Path(uploaded.filename.strip()).suffix.lower()
        if raw_suffix not in ALLOWED_EXCEL_EXTENSIONS:
            raise ValueError("Upload an .xlsx or .xlsm file.")
        filename = f"upload{raw_suffix}"
    stem = Path(filename).stem
    suffix = Path(filename).suffix
    # A short random tag keeps repeated uploads of the same name apart.
    saved_filename = f"{stem}_{uuid.uuid4().hex[:8]}{suffix}"
    destination = upload_dir / saved_filename
    uploaded.save(destination)
    return saved_filename, destination
def read_workbook_sheets(path: Path) -> list[str]:
    """Return the sheet names of the workbook at ``path``.

    The workbook is opened read-only and is always closed again, even if
    reading the sheet names fails.
    """
    book = openpyxl.load_workbook(path, read_only=True, data_only=False)
    try:
        sheet_names = book.sheetnames
    finally:
        book.close()
    return sheet_names
def resolve_allowed_path(raw_path: str, app_root: Path, allowed_roots: list[Path]) -> Path:
    """Resolve ``raw_path`` and ensure it lies inside an allowed root.

    Relative paths are interpreted relative to ``app_root``.

    Returns:
        The fully resolved path.

    Raises:
        ValueError: ``raw_path`` is empty, or the resolved path is neither one
            of ``allowed_roots`` nor located beneath one of them.
    """
    if not raw_path:
        raise ValueError("Path is required.")

    requested = Path(raw_path)
    anchored = requested if requested.is_absolute() else app_root / requested
    resolved = anchored.resolve()

    permitted_bases = [base.resolve() for base in allowed_roots]
    for base in permitted_bases:
        if resolved == base or resolved.is_relative_to(base):
            return resolved
    raise ValueError("Path is outside the application data directory.")

ui/static/app.js ADDED Viewed

	@@ -0,0 +1,463 @@

    // Server-provided page configuration (falls back to empty defaults).
    const config = window.MASTERMAP_CONFIG || {};
    const cleanPath = config.cleanPath || "";
    // Paths of the files currently loaded for the "apply blueprint" step;
    // reassigned after each successful upload.
    let applyWorkbookPath = config.applyWorkbookPath || "";
    let applyBlueprintPath = config.applyBlueprintPath || "";
    const defaultOutputSheet = config.defaultOutputSheet || "Cleaned_Data";
    // Form controls.
    const sheetSelect = document.getElementById("sheetSelect");
    const applySheetSelect = document.getElementById("applySheetSelect");
    const outputSheet = document.getElementById("outputSheet");
    const models = document.getElementById("models");
    // Buttons.
    const fetchModels = document.getElementById("fetchModels");
    const runButton = document.getElementById("runButton");
    const applyButton = document.getElementById("applyButton");
    const saveReferencesButton = document.getElementById("saveReferencesButton");
    // Upload forms and their file inputs.
    const applyWorkbookForm = document.getElementById("applyWorkbookForm");
    const applyBlueprintForm = document.getElementById("applyBlueprintForm");
    const applyWorkbookInput = document.getElementById("applyWorkbookInput");
    const applyBlueprintInput = document.getElementById("applyBlueprintInput");
    // Status lines and uploaded-file indicators.
    const runStatus = document.getElementById("runStatus");
    const applyStatus = document.getElementById("applyStatus");
    const referencesStatus = document.getElementById("referencesStatus");
    const applyWorkbookFile = document.getElementById("applyWorkbookFile");
    const applyBlueprintFile = document.getElementById("applyBlueprintFile");
    // Log panes, progress panel and result panels.
    const cleanLogs = document.getElementById("cleanLogs");
    const applyLogs = document.getElementById("applyLogs");
    const cleanProgressPanel = document.getElementById("cleanProgressPanel");
    const cleanProgressSummary = document.getElementById("cleanProgressSummary");
    const cleanProgressList = document.getElementById("cleanProgressList");
    const cleanResult = document.getElementById("cleanResult");
    const applyResult = document.getElementById("applyResult");
    // Streaming state for the cleaning run: full log text, the line currently
    // being received, and the latest parsed progress per column.
    let cleanRawLogText = "";
    let cleanLiveLine = "";
    let cleanProgressOrder = [];
    let cleanProgressByColumn = {};
    // Streaming state for the apply run.
    let applyRawLogText = "";
    let applyLiveLine = "";
    // State of the run in flight, if any.
    let activeRunStream = null;
    let activeRunJobId = "";
    let stopRequested = false;
+    function submitUploadForm(formId, statusId, event) {
+      if (event) {
+        event.preventDefault();
+      }
+      const form = document.getElementById(formId);
+      const status = document.getElementById(statusId);
+      [
+        ["clean_selected_sheet", sheetSelect ? sheetSelect.value : ""],
+        ["output_sheet", outputSheet ? outputSheet.value : ""],
+        ["models", models ? models.value : ""],
+        ["apply_selected_sheet", applySheetSelect ? applySheetSelect.value : ""]
+      ].forEach(([name, value]) => {
+        let input = form.querySelector(`input[name="${name}"]`);
+        if (!input) {
+          input = document.createElement("input");
+          input.type = "hidden";
+          input.name = name;
+          form.appendChild(input);
+        }
+        input.value = value;
+      });
+      status.textContent = "Loading...";
+      if (formId === "applyWorkbookForm" || formId === "applyBlueprintForm") {
+        uploadApplyFile(form, status, formId);
+        return;
+      }
+      form.submit();
+    }
+    async function uploadApplyFile(form, status, formId) {
+      try {
+        const res = await fetch(form.action, {
+          method: "POST",
+          body: new FormData(form),
+          headers: { "Accept": "application/json" }
+        });
+        const data = await res.json();
+        if (!res.ok) {
+          status.textContent = data.error || "Upload failed.";
+          status.classList.add("error");
+          return;
+        }
+        status.classList.remove("error");
+        status.textContent = data.message || "Loaded.";
+        if (formId === "applyWorkbookForm") {
+          applyWorkbookPath = data.apply_workbook_path || "";
+          applyWorkbookFile.innerHTML = data.apply_workbook_filename
+            ? `<div class="file-pill">${escapeHtml(data.apply_workbook_filename)}</div>`
+            : "";
+        }
+        if (formId === "applyBlueprintForm") {
+          applyBlueprintPath = data.apply_blueprint_path || "";
+          applyBlueprintFile.innerHTML = data.apply_blueprint_filename
+            ? `<div class="file-pill">${escapeHtml(data.apply_blueprint_filename)}</div>`
+            : "";
+        }
+        if (data.apply_sheets) {
+          setApplySheets(data.apply_sheets, data.apply_selected_sheet);
+        }
+        applyButton.disabled = !(applyWorkbookPath && applyBlueprintPath && applySheetSelect.value);
+      } catch (error) {
+        status.textContent = "Upload failed.";
+        status.classList.add("error");
+      }
+    }
+    function clearCleanOutput() {
+      cleanRawLogText = "";
+      cleanLiveLine = "";
+      cleanProgressOrder = [];
+      cleanProgressByColumn = {};
+      cleanLogs.textContent = "";
+      cleanProgressPanel.classList.remove("active");
+      cleanProgressSummary.textContent = "Waiting...";
+      cleanProgressList.innerHTML = "";
+      cleanResult.classList.remove("active");
+      cleanResult.innerHTML = "";
+    }
+    function clearApplyOutput() {
+      applyRawLogText = "";
+      applyLiveLine = "";
+      applyLogs.textContent = "";
+      applyResult.classList.remove("active");
+      applyResult.innerHTML = "";
+    }
+    function parseProgressLine(line) {
+      const cleanLine = line.replace(/\x1b\[[0-9;]*m/g, "").trim();
+      const match = cleanLine.match(/^Cleaning\s+(.+?):\s+(\d+)%\|.*?\|\s+(\d+)\/(\d+)\s+\[([^\]]*)\]/);
+      if (!match) return null;
+      const columnName = match[1].trim();
+      const percent = Number(match[2]);
+      const current = match[3];
+      const total = match[4];
+      const bracketParts = match[5].split(",").map(part => part.trim()).filter(Boolean);
+      const timingParts = bracketParts.slice(0, 2);
+      const metricParts = bracketParts.slice(2);
+      const timing = timingParts.join(", ").includes("?") ? "estimating..." : timingParts.join(", ");
+      const metrics = metricParts.join(", ");
+      return { columnName, percent, current, total, timing, metrics };
+    }
+    function escapeHtml(value) {
+      return String(value).replace(/[&<>"']/g, char => ({
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&#39;"
+      }[char]));
+    }
+    function renderCleanProgressRows() {
+      cleanProgressPanel.classList.add("active");
+      cleanProgressSummary.textContent = `${cleanProgressOrder.length} column${cleanProgressOrder.length === 1 ? "" : "s"}`;
+      cleanProgressList.innerHTML = cleanProgressOrder.map(columnName => {
+        const item = cleanProgressByColumn[columnName];
+        const percent = Math.max(0, Math.min(100, item.percent));
+        const meta = `${item.current}/${item.total}${item.timing ? " | " + item.timing : ""}${item.metrics ? " | " + item.metrics : ""}`;
+        return `
+          <div class="progress-row">
+            <div class="progress-top">
+              <strong>${escapeHtml(columnName)}</strong>
+              <span>${percent}%</span>
+            </div>
+            <div class="progress-track">
+              <div class="progress-fill" style="width:${percent}%"></div>
+            </div>
+            <div class="progress-meta">${escapeHtml(meta)}</div>
+          </div>
+        `;
+      }).join("");
+    }
+    function renderCleanProgressLine(line) {
+      const parsed = parseProgressLine(line);
+      if (!parsed) return false;
+      if (!cleanProgressByColumn[parsed.columnName]) {
+        cleanProgressOrder.push(parsed.columnName);
+      }
+      cleanProgressByColumn[parsed.columnName] = parsed;
+      renderCleanProgressRows();
+      return true;
+    }
+    function appendCleanLogChunk(chunk) {
+      for (const char of chunk) {
+        cleanRawLogText += char === "\r" ? "\n" : char;
+        if (char === "\r") {
+          renderCleanProgressLine(cleanLiveLine);
+          cleanLiveLine = "";
+        } else if (char === "\n") {
+          renderCleanProgressLine(cleanLiveLine);
+          cleanLiveLine = "";
+        } else {
+          cleanLiveLine += char;
+        }
+      }
+      const isPartialProgress = cleanLiveLine.startsWith("Cleaning ") && cleanLiveLine.includes("|");
+      if (!isPartialProgress) {
+        renderCleanProgressLine(cleanLiveLine);
+      }
+      cleanLogs.textContent = cleanRawLogText;
+      cleanLogs.scrollTop = cleanLogs.scrollHeight;
+    }
+    function renderApplySummary() {
+      const changed = applyRawLogText.match(/Success!\s+(\d+)\s+corrections injected/i);
+      const added = applyRawLogText.match(/Memory updated:\s+(\d+)\s+new approved values added/i);
+      if (!changed && !added) return;
+      applyResult.classList.add("active");
+      applyResult.innerHTML = `
+        <strong>Blueprint applied</strong>
+        <div class="status">${changed ? changed[1] : "0"} workbook row value${changed && changed[1] === "1" ? "" : "s"} updated from human overrides.</div>
+        <div class="status">${added ? added[1] : "0"} new unique reference value${added && added[1] === "1" ? "" : "s"} added to manual references.</div>
+        <a class="download-link" href="/download-applied-workbook">Download Cleaned Workbook</a>
+      `;
+    }
+    function appendApplyLogChunk(chunk) {
+      for (const char of chunk) {
+        applyRawLogText += char === "\r" ? "\n" : char;
+        if (char === "\r" || char === "\n") {
+          applyLiveLine = "";
+        } else {
+          applyLiveLine += char;
+        }
+      }
+      applyLogs.textContent = applyRawLogText;
+      applyLogs.scrollTop = applyLogs.scrollHeight;
+      renderApplySummary();
+    }
+    function clearPersistedOutputs() {
+      [
+        "mastermap.cleanRawLogText",
+        "mastermap.applyRawLogText",
+        "mastermap.cleanResultHtml",
+        "mastermap.applyResultHtml",
+        "mastermap.cleanResultActive",
+        "mastermap.applyResultActive"
+      ].forEach(key => localStorage.removeItem(key));
+    }
+    function setRunButtonIdle() {
+      activeRunStream = null;
+      activeRunJobId = "";
+      stopRequested = false;
+      runButton.disabled = false;
+      runButton.textContent = "Run Cleaning";
+      runButton.classList.remove("danger");
+    }
+    async function stopActiveRun() {
+      if (!activeRunJobId) return;
+      stopRequested = true;
+      runButton.disabled = true;
+      runButton.textContent = "Stopping...";
+      runStatus.textContent = "Stopping run...";
+      await fetch(`/stop?job_id=${encodeURIComponent(activeRunJobId)}`, { method: "POST" });
+    }
+    function selectApplySheet(sheetName) {
+      if (!sheetName) return;
+      const existing = Array.from(applySheetSelect.options).some(option => option.value === sheetName);
+      if (!existing) {
+        const option = document.createElement("option");
+        option.value = sheetName;
+        option.textContent = sheetName;
+        applySheetSelect.appendChild(option);
+      }
+      applySheetSelect.value = sheetName;
+      applySheetSelect.disabled = false;
+    }
+    function setApplySheets(sheets, preferredSheet) {
+      const selected = preferredSheet || applySheetSelect.value;
+      applySheetSelect.innerHTML = "";
+      sheets.forEach(sheetName => {
+        const option = document.createElement("option");
+        option.value = sheetName;
+        option.textContent = sheetName;
+        applySheetSelect.appendChild(option);
+      });
+      if (selected && sheets.includes(selected)) {
+        applySheetSelect.value = selected;
+      } else if (sheets.length) {
+        applySheetSelect.value = sheets[0];
+      }
+      applySheetSelect.disabled = sheets.length === 0;
+    }
+    async function refreshApplySheets(preferredSheet) {
+      if (!applyWorkbookPath) return selectApplySheet(preferredSheet);
+      const params = new URLSearchParams({ path: applyWorkbookPath });
+      const res = await fetch(`/sheets?${params.toString()}`);
+      if (!res.ok) {
+        selectApplySheet(preferredSheet);
+        return;
+      }
+      const data = await res.json();
+      setApplySheets(data.sheets || [], preferredSheet);
+    }
+    async function refreshReferenceSyncStatus(updateText = true) {
+      try {
+        const res = await fetch("/references/status");
+        const data = await res.json();
+        saveReferencesButton.disabled = !data.enabled;
+        if (updateText) {
+          referencesStatus.textContent = data.enabled
+            ? `Ready to save manual references to ${data.space_id}.`
+            : data.reason || "Reference sync is unavailable.";
+        }
+      } catch (error) {
+        saveReferencesButton.disabled = true;
+        if (updateText) {
+          referencesStatus.textContent = "Reference sync status unavailable.";
+        }
+      }
+    }
+    clearPersistedOutputs();
+    refreshReferenceSyncStatus();
+    applyWorkbookForm.addEventListener("submit", event => {
+      submitUploadForm("applyWorkbookForm", "applyWorkbookUploadStatus", event);
+    });
+    applyBlueprintForm.addEventListener("submit", event => {
+      submitUploadForm("applyBlueprintForm", "applyBlueprintUploadStatus", event);
+    });
+    applyWorkbookInput.addEventListener("change", event => {
+      submitUploadForm("applyWorkbookForm", "applyWorkbookUploadStatus", event);
+    });
+    applyBlueprintInput.addEventListener("change", event => {
+      submitUploadForm("applyBlueprintForm", "applyBlueprintUploadStatus", event);
+    });
+    fetchModels.addEventListener("click", async () => {
+      fetchModels.disabled = true;
+      runStatus.textContent = "Fetching Groq models...";
+      const res = await fetch("/models");
+      const data = await res.json();
+      fetchModels.disabled = false;
+      if (!res.ok) {
+        runStatus.textContent = data.error || "Could not fetch models";
+        return;
+      }
+      models.value = data.models.join(",");
+      runStatus.textContent = "Model list updated.";
+    });
+    saveReferencesButton.addEventListener("click", async () => {
+      saveReferencesButton.disabled = true;
+      referencesStatus.textContent = "Saving manual references...";
+      try {
+        const res = await fetch("/references/save", { method: "POST" });
+        const data = await res.json();
+        if (!res.ok) {
+          referencesStatus.textContent = data.error || "Could not save manual references.";
+          await refreshReferenceSyncStatus();
+          return;
+        }
+        referencesStatus.textContent = data.message || "Manual references saved.";
+      } catch (error) {
+        referencesStatus.textContent = "Could not save manual references.";
+      } finally {
+        await refreshReferenceSyncStatus(false);
+      }
+    });
+    runButton.addEventListener("click", () => {
+      if (activeRunStream) {
+        stopActiveRun();
+        return;
+      }
+      if (!cleanPath || !sheetSelect.value) return;
+      clearCleanOutput();
+      activeRunJobId = window.crypto && window.crypto.randomUUID ? window.crypto.randomUUID() : String(Date.now());
+      stopRequested = false;
+      runButton.disabled = false;
+      runButton.textContent = "Stop Cleaning";
+      runButton.classList.add("danger");
+      runStatus.textContent = "Running...";
+      const params = new URLSearchParams({
+        job_id: activeRunJobId,
+        input: cleanPath,
+        sheet: sheetSelect.value,
+        output_sheet: outputSheet.value || defaultOutputSheet,
+        models: models.value.trim()
+      });
+      const stream = new EventSource(`/run?${params.toString()}`);
+      activeRunStream = stream;
+      stream.onmessage = event => appendCleanLogChunk(JSON.parse(event.data));
+      stream.addEventListener("done", async () => {
+        stream.close();
+        if (!stopRequested) {
+          applyBlueprintPath = "data/Blueprint.xlsx";
+          const targetSheet = outputSheet.value || defaultOutputSheet;
+          await refreshApplySheets(targetSheet);
+          applyButton.disabled = !(applyWorkbookPath && applyBlueprintPath && applySheetSelect.value);
+          applyStatus.textContent = "Generated blueprint is ready for Apply Blueprint.";
+          cleanResult.classList.add("active");
+          cleanResult.innerHTML = `
+            <strong>Blueprint generated</strong>
+            <div class="status">Blueprint saved at data/Blueprint.xlsx</div>
+            <a class="download-link" href="/download-blueprint">Download Blueprint</a>
+            <a class="download-link" href="/download-cleaned-workbook">Download Cleaned Workbook</a>
+          `;
+          runStatus.textContent = "Finished.";
+        } else {
+          runStatus.textContent = "Stopped.";
+        }
+        setRunButtonIdle();
+      });
+      stream.addEventListener("failed", () => {
+        stream.close();
+        setRunButtonIdle();
+        runStatus.textContent = "Run failed. Check logs.";
+      });
+      stream.addEventListener("error", () => {
+        stream.close();
+        setRunButtonIdle();
+        runStatus.textContent = "Run stopped. Check logs.";
+      });
+    });
+    applyButton.addEventListener("click", () => {
+      if (!applyWorkbookPath || !applyBlueprintPath || !applySheetSelect.value) return;
+      clearApplyOutput();
+      applyButton.disabled = true;
+      applyStatus.textContent = "Applying blueprint...";
+      const params = new URLSearchParams({
+        input: applyWorkbookPath,
+        blueprint: applyBlueprintPath,
+        sheet: applySheetSelect.value
+      });
+      const stream = new EventSource(`/apply?${params.toString()}`);
+      stream.onmessage = event => appendApplyLogChunk(JSON.parse(event.data));
+      stream.addEventListener("done", () => {
+        stream.close();
+        applyButton.disabled = false;
+        applyStatus.textContent = "Finished.";
+        renderApplySummary();
+      });
+      stream.addEventListener("failed", () => {
+        stream.close();
+        applyButton.disabled = false;
+        applyStatus.textContent = "Apply failed. Check logs.";
+      });
+      stream.addEventListener("error", () => {
+        stream.close();
+        applyButton.disabled = false;
+        applyStatus.textContent = "Apply stopped. Check logs.";
+      });
+    });

ui/static/styles.css ADDED Viewed

	@@ -0,0 +1,225 @@

+:root {
+      --bg: #f6f7f9;
+      --panel: #ffffff;
+      --text: #17202a;
+      --muted: #627386;
+      --line: #d9e0e7;
+      --accent: #2457c5;
+      --accent-dark: #1d459c;
+      --danger: #b42318;
+    }
+    * { box-sizing: border-box; }
+    body {
+      margin: 0;
+      font-family: Arial, sans-serif;
+      color: var(--text);
+      background: var(--bg);
+    }
+    header {
+      padding: 20px 28px;
+      background: var(--panel);
+      border-bottom: 1px solid var(--line);
+    }
+    h1 { margin: 0; font-size: 22px; }
+    h2 { font-size: 16px; margin: 0 0 12px; }
+    main {
+      display: grid;
+      grid-template-columns: 380px 1fr;
+      gap: 18px;
+      padding: 18px 28px 28px;
+    }
+    section {
+      background: var(--panel);
+      border: 1px solid var(--line);
+      border-radius: 8px;
+      padding: 16px;
+    }
+    label {
+      display: block;
+      margin: 14px 0 6px;
+      font-weight: 700;
+      font-size: 13px;
+    }
+    input, select, textarea, button {
+      width: 100%;
+      font: inherit;
+      border: 1px solid var(--line);
+      border-radius: 6px;
+      padding: 10px 11px;
+      background: #fff;
+    }
+    textarea {
+      min-height: 86px;
+      resize: vertical;
+      font-family: Consolas, monospace;
+      font-size: 13px;
+    }
+    button {
+      cursor: pointer;
+      background: var(--accent);
+      color: white;
+      border-color: var(--accent);
+      font-weight: 700;
+      margin-top: 10px;
+    }
+    button:hover { background: var(--accent-dark); }
+    button.secondary {
+      background: white;
+      color: var(--accent);
+    }
+    button.danger {
+      background: white;
+      color: var(--danger);
+      border-color: var(--danger);
+    }
+    button.danger:hover { background: #fff1f0; }
+    button:disabled {
+      opacity: 0.55;
+      cursor: not-allowed;
+    }
+    .dropbox {
+      border: 2px dashed #aeb9c7;
+      border-radius: 8px;
+      background: #fbfcfe;
+      padding: 14px;
+      color: var(--muted);
+    }
+    .dropbox strong {
+      display: block;
+      color: var(--text);
+      margin-bottom: 8px;
+    }
+    .dropbox input {
+      background: white;
+      cursor: pointer;
+    }
+    .file-pill {
+      border: 1px solid var(--line);
+      border-radius: 6px;
+      padding: 10px 11px;
+      background: #f8fafc;
+      color: var(--text);
+      word-break: break-word;
+      margin-bottom: 8px;
+    }
+    .status {
+      margin-top: 10px;
+      color: var(--muted);
+      font-size: 13px;
+      min-height: 18px;
+    }
+    .status.error { color: var(--danger); }
+    .progress-panel {
+      display: none;
+      margin-bottom: 12px;
+      background: #ffffff;
+      border: 1px solid var(--line);
+      border-radius: 8px;
+      padding: 12px;
+    }
+    .progress-panel.active {
+      display: block;
+    }
+    .progress-heading {
+      display: flex;
+      justify-content: space-between;
+      gap: 12px;
+      align-items: center;
+      font-size: 13px;
+      font-weight: 700;
+      margin-bottom: 10px;
+    }
+    .progress-list {
+      display: grid;
+      gap: 10px;
+    }
+    .progress-row {
+      border: 1px solid var(--line);
+      border-radius: 6px;
+      padding: 10px;
+      background: #f8fafc;
+    }
+    .progress-top {
+      display: flex;
+      justify-content: space-between;
+      gap: 12px;
+      font-size: 13px;
+      margin-bottom: 8px;
+    }
+    .progress-track {
+      height: 10px;
+      background: #e8edf3;
+      border-radius: 999px;
+      overflow: hidden;
+    }
+    .progress-fill {
+      height: 100%;
+      width: 0%;
+      background: var(--accent);
+    }
+    .progress-meta {
+      margin-top: 8px;
+      color: var(--muted);
+      font-size: 12px;
+      line-height: 1.4;
+    }
+    .blueprint-result {
+      display: none;
+      margin-bottom: 12px;
+      border: 1px solid var(--line);
+      border-radius: 8px;
+      background: #f8fafc;
+      padding: 12px;
+      font-size: 13px;
+    }
+    .blueprint-result.active {
+      display: block;
+    }
+    .download-link {
+      display: inline-block;
+      color: white;
+      background: var(--accent);
+      border-radius: 6px;
+      padding: 9px 11px;
+      text-decoration: none;
+      font-weight: 700;
+      margin-top: 8px;
+    }
+    .download-link:hover {
+      background: var(--accent-dark);
+    }
+    .raw-logs {
+      border: 1px solid var(--line);
+      border-radius: 8px;
+      background: #ffffff;
+      padding: 10px;
+    }
+    .raw-logs summary {
+      cursor: pointer;
+      font-size: 13px;
+      font-weight: 700;
+    }
+    .output-block {
+      margin-bottom: 14px;
+    }
+    .output-block h2 {
+      margin-bottom: 10px;
+    }
+    .log-screen {
+      height: 420px;
+      margin-top: 10px;
+      overflow: auto;
+      background: #0f172a;
+      color: #dbeafe;
+      border-radius: 8px;
+      padding: 14px;
+      white-space: pre-wrap;
+      font-family: Consolas, monospace;
+      font-size: 13px;
+      line-height: 1.45;
+    }
+    hr {
+      border: 0;
+      border-top: 1px solid var(--line);
+      margin: 18px 0;
+    }

ui/templates/index.html ADDED Viewed

	@@ -0,0 +1,136 @@

+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>MasterMap Cleaner</title>
+  <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+</head>
+<body>
+  <header>
+    <h1>MasterMap Cleaner</h1>
+  </header>
+  <main>
+    <section>
+      <h2>Dataset to Clean</h2>
+      {% if clean_filename %}
+        <div class="file-pill">{{ clean_filename }}</div>
+        <form action="/remove-clean" method="post">
+          <button class="danger" type="submit">Remove File</button>
+        </form>
+      {% else %}
+        <form id="cleanUploadForm" action="/prepare-clean" method="post" enctype="multipart/form-data">
+          <div class="dropbox">
+            <strong>Drop or select an Excel file</strong>
+            <input name="file" type="file" accept=".xlsx,.xlsm" required onchange="submitUploadForm('cleanUploadForm', 'cleanUploadStatus')">
+          </div>
+          <div id="cleanUploadStatus" class="status"></div>
+        </form>
+      {% endif %}
+      <label for="sheetSelect">Source Sheet</label>
+      <select id="sheetSelect" {% if not clean_sheets %}disabled{% endif %}>
+        {% for sheet in clean_sheets %}
+          <option value="{{ sheet }}" {% if sheet == clean_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
+        {% endfor %}
+      </select>
+      <label for="outputSheet">Output Sheet</label>
+      <input id="outputSheet" value="{{ output_sheet }}">
+      <label for="models">Groq Models</label>
+      <textarea id="models">{{ models or default_models }}</textarea>
+      <button id="fetchModels" class="secondary" type="button">Fetch Available Models</button>
+      <button id="runButton" type="button" {% if not clean_path %}disabled{% endif %}>Run Cleaning</button>
+      <div id="runStatus" class="status">{{ message }}</div>
+      <hr>
+      <h2>Apply Blueprint</h2>
+      <label>Workbook</label>
+      <div id="applyWorkbookFile">
+      {% if apply_workbook_filename %}
+        <div class="file-pill">{{ apply_workbook_filename }}</div>
+      {% endif %}
+      </div>
+      <form id="applyWorkbookForm" action="/prepare-apply-workbook" method="post" enctype="multipart/form-data" onsubmit="return false;">
+        <div class="dropbox">
+          <strong>Drop or select workbook</strong>
+          <input id="applyWorkbookInput" name="file" type="file" accept=".xlsx,.xlsm" required>
+        </div>
+        <div id="applyWorkbookUploadStatus" class="status"></div>
+      </form>
+      <label for="applySheetSelect">Sheet To Update</label>
+      <select id="applySheetSelect" {% if not apply_sheets %}disabled{% endif %}>
+        {% for sheet in apply_sheets %}
+          <option value="{{ sheet }}" {% if sheet == apply_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
+        {% endfor %}
+      </select>
+      <label>Blueprint</label>
+      <div id="applyBlueprintFile">
+      {% if apply_blueprint_filename %}
+        <div class="file-pill">{{ apply_blueprint_filename }}</div>
+      {% endif %}
+      </div>
+      <form id="applyBlueprintForm" action="/prepare-apply-blueprint" method="post" enctype="multipart/form-data" onsubmit="return false;">
+        <div class="dropbox">
+          <strong>Drop or select blueprint</strong>
+          <input id="applyBlueprintInput" name="file" type="file" accept=".xlsx,.xlsm" required>
+        </div>
+        <div id="applyBlueprintUploadStatus" class="status"></div>
+      </form>
+      <button id="applyButton" type="button" {% if not can_apply %}disabled{% endif %}>Apply Blueprint</button>
+      <div id="applyStatus" class="status error">{{ error }}</div>
+      <hr>
+      <h2>References</h2>
+      <button id="saveReferencesButton" class="secondary" type="button" disabled>Save Manual References</button>
+      <div id="referencesStatus" class="status"></div>
+    </section>
+    <section>
+      <div class="output-block">
+        <h2>Cleaning Output</h2>
+        <div id="cleanProgressPanel" class="progress-panel">
+          <div class="progress-heading">
+            <span>Cleaning Progress</span>
+            <span id="cleanProgressSummary">Waiting...</span>
+          </div>
+          <div id="cleanProgressList" class="progress-list"></div>
+        </div>
+        <div id="cleanResult" class="blueprint-result"></div>
+        <details id="cleanRawLogDetails" class="raw-logs">
+          <summary>Raw cleaning logs</summary>
+          <div id="cleanLogs" class="log-screen"></div>
+        </details>
+      </div>
+      <div class="output-block">
+        <h2>Apply Output</h2>
+        <div id="applyResult" class="blueprint-result"></div>
+        <details id="applyRawLogDetails" class="raw-logs">
+          <summary>Raw apply logs</summary>
+          <div id="applyLogs" class="log-screen"></div>
+        </details>
+      </div>
+    </section>
+  </main>
+  <script>
+    window.MASTERMAP_CONFIG = {
+      cleanPath: {{ clean_path|tojson }},
+      applyWorkbookPath: {{ apply_workbook_path|tojson }},
+      applyBlueprintPath: {{ apply_blueprint_path|tojson }},
+      defaultOutputSheet: {{ default_output_sheet|tojson }}
+    };
+  </script>
+  <script src="{{ url_for('static', filename='app.js') }}"></script>
+</body>
+</html>

ui_app.py ADDED Viewed

	@@ -0,0 +1,342 @@

+import sys
+import uuid
+from pathlib import Path
+from flask import Flask, Response, jsonify, render_template, request, send_file
+from newest_model import PREFERRED_PRODUCTION_CHAT_MODELS, select_groq_chat_models
+from src.config import AVAILABLE_MODELS, DATA_DIR, DEFAULT_OUTPUT_SHEET_NAME
+from src.process_runner import stop_process, stream_process
+from src.utils import reference_sync_status, save_manual_references_to_hub
+from src.workbook_io import read_workbook_sheets, resolve_allowed_path, save_uploaded_excel
+# Directory containing this file; base for the template, static, script and data paths below.
+APP_ROOT = Path(__file__).resolve().parent
+# Upload directory handed to save_uploaded_excel(); created at import time if missing.
+UPLOAD_DIR = APP_ROOT / DATA_DIR / "uploads"
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+# Roots handed to resolve_allowed_path() when validating file paths.
+ALLOWED_FILE_ROOTS = [APP_ROOT / DATA_DIR]
+app = Flask(
+    __name__,
+    template_folder=str(APP_ROOT / "ui" / "templates"),
+    static_folder=str(APP_ROOT / "ui" / "static"),
+)
+# Flask request-body size limit: 100 MiB.
+app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024
+# UI state rendered into index.html and mutated by the route handlers below.
+# NOTE(review): this is one module-level dict, so it appears to be shared by every
+# client of the process — confirm that single-user use is intended.
+STATE = {
+    "clean_path": "",
+    "clean_filename": "",
+    "clean_sheets": [],
+    "clean_selected_sheet": "",
+    "output_sheet": DEFAULT_OUTPUT_SHEET_NAME,
+    "models": "",
+    "apply_workbook_path": "",
+    "apply_workbook_filename": "",
+    "apply_sheets": [],
+    "apply_selected_sheet": "",
+    "apply_blueprint_path": "",
+    "apply_blueprint_filename": "",
+}
+@app.after_request
+def prevent_browser_cache(response):
+    response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
+    response.headers["Pragma"] = "no-cache"
+    response.headers["Expires"] = "0"
+    return response
+def default_models() -> str:
+    preferred_model_ids = {model.lower() for model in PREFERRED_PRODUCTION_CHAT_MODELS}
+    env_preferred_models = [
+        model
+        for model in AVAILABLE_MODELS
+        if model.lower() in preferred_model_ids
+    ]
+    return ",".join(env_preferred_models or PREFERRED_PRODUCTION_CHAT_MODELS)
+def render_page(message: str = "", error: str = ""):
+    """Render index.html from STATE plus the given status message and error text.
+
+    Before rendering, the selected clean/apply sheets are re-validated against
+    their sheet lists via pick_sheet() whenever the corresponding list is non-empty.
+    """
+    sheet_key_pairs = (
+        ("clean_sheets", "clean_selected_sheet"),
+        ("apply_sheets", "apply_selected_sheet"),
+    )
+    for sheets_key, selected_key in sheet_key_pairs:
+        known_sheets = STATE[sheets_key]
+        if known_sheets:
+            STATE[selected_key] = pick_sheet(known_sheets, STATE[selected_key])
+    extra_context = {
+        "default_output_sheet": DEFAULT_OUTPUT_SHEET_NAME,
+        "default_models": default_models(),
+        "can_apply": can_apply_blueprint(),
+        "message": message,
+        "error": error,
+    }
+    return render_template("index.html", **STATE, **extra_context)
+def can_apply_blueprint() -> bool:
+    return bool(
+        STATE["apply_workbook_path"]
+        and STATE["apply_blueprint_path"]
+        and STATE["apply_sheets"]
+        and STATE["apply_selected_sheet"]
+    )
+def wants_json_response() -> bool:
+    return "application/json" in request.headers.get("Accept", "")
+def ui_state_payload(message: str = "", error: str = ""):
+    """Build the JSON payload describing the apply-side UI state.
+
+    Contains the given message and error, the apply-related STATE entries,
+    and the current can_apply flag.
+    """
+    payload = {"message": message, "error": error}
+    apply_keys = (
+        "apply_workbook_path",
+        "apply_workbook_filename",
+        "apply_sheets",
+        "apply_selected_sheet",
+        "apply_blueprint_path",
+        "apply_blueprint_filename",
+    )
+    for key in apply_keys:
+        payload[key] = STATE[key]
+    payload["can_apply"] = can_apply_blueprint()
+    return payload
+def pick_sheet(sheets, preferred_sheet=None):
+    if preferred_sheet and preferred_sheet in sheets:
+        return preferred_sheet
+    if STATE["output_sheet"] in sheets:
+        return STATE["output_sheet"]
+    return sheets[0] if sheets else ""
+def update_ui_state_from_form(form):
+    STATE["clean_selected_sheet"] = form.get("clean_selected_sheet") or STATE["clean_selected_sheet"]
+    STATE["output_sheet"] = form.get("output_sheet") or STATE["output_sheet"] or DEFAULT_OUTPUT_SHEET_NAME
+    STATE["models"] = form.get("models") or STATE["models"]
+    STATE["apply_selected_sheet"] = form.get("apply_selected_sheet") or STATE["apply_selected_sheet"]
+@app.route("/")
+def index():
+    return render_page()
+@app.route("/prepare-clean", methods=["POST"])
+def prepare_clean():
+    try:
+        update_ui_state_from_form(request.form)
+        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
+        sheets = read_workbook_sheets(path)
+    except Exception as exc:
+        return render_page(error=str(exc))
+    STATE["clean_path"] = str(path)
+    STATE["clean_filename"] = filename
+    STATE["clean_sheets"] = sheets
+    STATE["clean_selected_sheet"] = pick_sheet(sheets, request.form.get("clean_selected_sheet"))
+    STATE["apply_workbook_path"] = str(path)
+    STATE["apply_workbook_filename"] = filename
+    STATE["apply_sheets"] = sheets
+    STATE["apply_selected_sheet"] = pick_sheet(sheets, request.form.get("apply_selected_sheet") or STATE["output_sheet"])
+    return render_page(message=f"Loaded {filename}.")
+@app.route("/remove-clean", methods=["POST"])
+def remove_clean():
+    update_ui_state_from_form(request.form)
+    old_path = STATE["clean_path"]
+    STATE["clean_path"] = ""
+    STATE["clean_filename"] = ""
+    STATE["clean_sheets"] = []
+    STATE["clean_selected_sheet"] = ""
+    if STATE["apply_workbook_path"] == old_path:
+        STATE["apply_workbook_path"] = ""
+        STATE["apply_workbook_filename"] = ""
+        STATE["apply_sheets"] = []
+        STATE["apply_selected_sheet"] = ""
+    return render_page(message="File removed.")
+@app.route("/prepare-apply-workbook", methods=["POST"])
+def prepare_apply_workbook():
+    try:
+        update_ui_state_from_form(request.form)
+        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
+        sheets = read_workbook_sheets(path)
+    except Exception as exc:
+        if wants_json_response():
+            return jsonify(ui_state_payload(error=str(exc))), 400
+        return render_page(error=str(exc))
+    STATE["apply_workbook_path"] = str(path)
+    STATE["apply_workbook_filename"] = filename
+    STATE["apply_sheets"] = sheets
+    STATE["apply_selected_sheet"] = pick_sheet(sheets, request.form.get("apply_selected_sheet"))
+    if wants_json_response():
+        return jsonify(ui_state_payload(message=f"Loaded apply workbook {filename}."))
+    return render_page(message=f"Loaded apply workbook {filename}.")
+@app.route("/prepare-apply-blueprint", methods=["POST"])
+def prepare_apply_blueprint():
+    try:
+        update_ui_state_from_form(request.form)
+        if STATE["apply_workbook_path"] and Path(STATE["apply_workbook_path"]).is_file():
+            STATE["apply_sheets"] = read_workbook_sheets(Path(STATE["apply_workbook_path"]))
+        STATE["apply_selected_sheet"] = pick_sheet(STATE["apply_sheets"], request.form.get("apply_selected_sheet"))
+        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
+    except Exception as exc:
+        if wants_json_response():
+            return jsonify(ui_state_payload(error=str(exc))), 400
+        return render_page(error=str(exc))
+    STATE["apply_blueprint_path"] = str(path)
+    STATE["apply_blueprint_filename"] = filename
+    if wants_json_response():
+        return jsonify(ui_state_payload(message=f"Loaded blueprint {filename}."))
+    return render_page(message=f"Loaded blueprint {filename}.")
+@app.route("/models")
+def models_endpoint():
+    try:
+        models = select_groq_chat_models(limit=len(PREFERRED_PRODUCTION_CHAT_MODELS))
+    except Exception as exc:
+        return jsonify({"error": str(exc)}), 500
+    return jsonify({"models": models})
+@app.route("/references/status")
+def references_status():
+    return jsonify(reference_sync_status())
+@app.route("/references/save", methods=["POST"])
+def save_references():
+    try:
+        result = save_manual_references_to_hub(APP_ROOT)
+    except Exception as exc:
+        return jsonify({"error": str(exc)}), 400
+    return jsonify({"message": "Manual references saved to Hugging Face.", **result})
+@app.route("/sheets")
+def sheets_endpoint():
+    try:
+        workbook_path = resolve_allowed_path(request.args.get("path", ""), APP_ROOT, ALLOWED_FILE_ROOTS)
+        if not workbook_path.is_file():
+            return jsonify({"error": "Workbook is not available."}), 404
+        return jsonify({"sheets": read_workbook_sheets(workbook_path)})
+    except Exception as exc:
+        return jsonify({"error": str(exc)}), 500
+@app.route("/download-blueprint")
+def download_blueprint():
+    blueprint_path = APP_ROOT / DATA_DIR / "Blueprint.xlsx"
+    if not blueprint_path.exists():
+        return jsonify({"error": "Blueprint has not been generated yet."}), 404
+    return send_file(blueprint_path, as_attachment=True, download_name="Blueprint.xlsx")
+@app.route("/download-cleaned-workbook")
+def download_cleaned_workbook():
+    if not STATE["clean_path"]:
+        return jsonify({"error": "Cleaned workbook is not available."}), 404
+    workbook_path = resolve_allowed_path(STATE["clean_path"], APP_ROOT, ALLOWED_FILE_ROOTS)
+    if not workbook_path.is_file():
+        return jsonify({"error": "Cleaned workbook is not available."}), 404
+    return send_file(
+        workbook_path,
+        as_attachment=True,
+        download_name=f"cleaned_{STATE['clean_filename'] or workbook_path.name}",
+    )
+@app.route("/download-applied-workbook")
+def download_applied_workbook():
+    if not STATE["apply_workbook_path"]:
+        return jsonify({"error": "Applied workbook is not available."}), 404
+    workbook_path = resolve_allowed_path(STATE["apply_workbook_path"], APP_ROOT, ALLOWED_FILE_ROOTS)
+    if not workbook_path.is_file():
+        return jsonify({"error": "Applied workbook is not available."}), 404
+    return send_file(
+        workbook_path,
+        as_attachment=True,
+        download_name=f"cleaned_{STATE['apply_workbook_filename'] or workbook_path.name}",
+    )
+@app.route("/run")
+def run():
+    job_id = request.args.get("job_id", uuid.uuid4().hex)
+    input_path = request.args.get("input", "")
+    sheet = request.args.get("sheet", "")
+    output_sheet = request.args.get("output_sheet", DEFAULT_OUTPUT_SHEET_NAME)
+    model_list = request.args.get("models", "")
+    if not input_path or not sheet:
+        return jsonify({"error": "Input file and source sheet are required."}), 400
+    try:
+        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
+    except ValueError as exc:
+        return jsonify({"error": str(exc)}), 400
+    command = [
+        sys.executable,
+        "-u",
+        str(APP_ROOT / "main.py"),
+        "--input",
+        str(workbook_path),
+        "--sheet",
+        sheet,
+        "--output_sheet",
+        output_sheet,
+    ]
+    if model_list:
+        command.extend(["--models", model_list])
+    return Response(stream_process(command, cwd=APP_ROOT, job_id=job_id), mimetype="text/event-stream")
+@app.route("/stop", methods=["POST"])
+def stop():
+    job_id = request.args.get("job_id", "")
+    if not stop_process(job_id):
+        return jsonify({"stopped": False, "message": "No active run found."}), 404
+    return jsonify({"stopped": True})
+@app.route("/apply")
+def apply_blueprint():
+    input_path = request.args.get("input", "")
+    blueprint_path = request.args.get("blueprint", "")
+    sheet = request.args.get("sheet", DEFAULT_OUTPUT_SHEET_NAME)
+    if not input_path or not blueprint_path or not sheet:
+        return jsonify({"error": "Workbook, blueprint, and target sheet are required."}), 400
+    try:
+        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
+        resolved_blueprint_path = resolve_allowed_path(blueprint_path, APP_ROOT, ALLOWED_FILE_ROOTS)
+    except ValueError as exc:
+        return jsonify({"error": str(exc)}), 400
+    command = [
+        sys.executable,
+        "-u",
+        str(APP_ROOT / "apply_blueprint.py"),
+        "--input",
+        str(workbook_path),
+        "--blueprint",
+        str(resolved_blueprint_path),
+        "--sheet",
+        sheet,
+    ]
+    return Response(stream_process(command, cwd=APP_ROOT), mimetype="text/event-stream")
+# Start Flask's built-in server (debug off, threaded) only when this file is run directly.
+if __name__ == "__main__":
+    app.run(debug=False, threaded=True)