Spaces:
Running
Running
Commit ·
dc06d4c
0
Parent(s):
Initial clean commit
Browse files- .gitattributes +36 -0
- Dockerfile +16 -0
- README.md +14 -0
- apply_blueprint.py +169 -0
- main.py +242 -0
- newest_model.py +123 -0
- refdata/manual_references.json +71 -0
- refdata/official_references.json +650 -0
- requirements.txt +11 -0
- src/config.py +60 -0
- src/data_pipeline.py +431 -0
- src/llm_router.py +122 -0
- src/process_runner.py +67 -0
- src/utils.py +152 -0
- src/workbook_io.py +48 -0
- ui/static/app.js +463 -0
- ui/static/styles.css +225 -0
- ui/templates/index.html +136 -0
- ui_app.py +342 -0
.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
* text=auto eol=lf
|
| 2 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Base image: slim variant of the official Python 3.12 image.
FROM python:3.12-slim

# Unbuffered stdout/stderr so log output is emitted immediately.
ENV PYTHONUNBUFFERED=1
# Keep the Hugging Face and sentence-transformers caches under /app.
ENV HF_HOME=/app/.cache/huggingface
ENV SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers

WORKDIR /app

# Dependencies are installed before the sources are copied, so this layer can
# be reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Instantiate the embedding model during the build; presumably this downloads
# it into the cache directories configured above so it is baked into the image.
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"

# Serve ui_app:app with gunicorn on port 7860 using one worker and one thread,
# with a 3600-second worker timeout.
CMD ["gunicorn", "--bind", "0.0.0.0:7860", "--workers", "1", "--threads", "1", "--timeout", "3600", "ui_app:app"]
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MasterMap Cleaner
|
| 3 |
+
sdk: docker
|
| 4 |
+
app_port: 7860
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Hugging Face Secrets
|
| 8 |
+
|
| 9 |
+
Set these in the Space settings:
|
| 10 |
+
|
| 11 |
+
- `GROQ_API_KEY`: required for Groq model calls.
|
| 12 |
+
- `HF_TOKEN`: optional, required only for the `Save Manual References` button.
|
| 13 |
+
|
| 14 |
+
The `Save Manual References` button is enabled only on Hugging Face Spaces, and only when `SPACE_ID` is present and `HF_TOKEN` is configured. It commits the current `refdata/manual_references.json` back to the Space repository.
|
apply_blueprint.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import openpyxl
|
| 3 |
+
import os
|
| 4 |
+
import json
|
| 5 |
+
import argparse
|
| 6 |
+
from src.config import (
|
| 7 |
+
DEFAULT_BLUEPRINT_FILE,
|
| 8 |
+
DEFAULT_OFFICIAL_REFS_FILE,
|
| 9 |
+
DEFAULT_MANUAL_REFS_FILE,
|
| 10 |
+
DEFAULT_OUTPUT_SHEET_NAME,
|
| 11 |
+
resolve_data_path,
|
| 12 |
+
resolve_ref_path,
|
| 13 |
+
)
|
| 14 |
+
from src.utils import normalize_ref, prune_manual_refs_against_official, ref_contains
|
| 15 |
+
|
| 16 |
+
def parse_args():
    """Parse the command line and resolve every file argument to a path.

    ``--input`` is mandatory; ``--blueprint``, ``--refs``, ``--manual_refs``
    and ``--sheet`` fall back to the defaults imported from ``src.config``.
    The input and blueprint names go through ``resolve_data_path``; the two
    reference files go through ``resolve_ref_path``.

    Returns:
        The populated ``argparse.Namespace`` with the path attributes
        replaced by their resolved values.
    """
    cli = argparse.ArgumentParser(description="Apply Blueprint Human Overrides")
    cli.add_argument("--input", required=True, help="Master Excel file name inside data/")

    # (flag, default, help) for every optional argument, in declaration order.
    optional_specs = (
        ("--blueprint", DEFAULT_BLUEPRINT_FILE, "Blueprint Excel file name inside data/"),
        ("--refs", DEFAULT_OFFICIAL_REFS_FILE, "Official references JSON file name inside refdata/"),
        ("--manual_refs", DEFAULT_MANUAL_REFS_FILE, "Manual overrides JSON file name inside refdata/"),
        ("--sheet", DEFAULT_OUTPUT_SHEET_NAME, "Workbook sheet where blueprint corrections should be applied"),
    )
    for flag, fallback, description in optional_specs:
        cli.add_argument(flag, default=fallback, help=description)

    parsed = cli.parse_args()

    for attr in ("input", "blueprint"):
        setattr(parsed, attr, resolve_data_path(getattr(parsed, attr)))
    for attr in ("refs", "manual_refs"):
        setattr(parsed, attr, resolve_ref_path(getattr(parsed, attr)))

    return parsed
|
| 30 |
+
|
| 31 |
+
def load_json_safe(filepath):
    """Load a JSON object from *filepath*, falling back to an empty dict.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded dict. An empty dict is returned when the file cannot be
        opened or read, when its content is not valid UTF-8 / JSON, or when
        the top-level JSON value is not an object (callers use ``.get()`` on
        the result).
    """
    try:
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            loaded = json.load(f)
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        # Anything else is a programming error and is allowed to propagate.
        return {}

    return loaded if isinstance(loaded, dict) else {}
|
| 37 |
+
|
| 38 |
+
def split_approved_parts(value):
    """Split a comma-separated cell value into trimmed, non-empty parts.

    Args:
        value: Cell content; anything ``pd.isna`` treats as missing yields
            an empty list, everything else is converted with ``str``.

    Returns:
        The parts in their original order, whitespace-stripped, with blank
        entries removed.
    """
    if pd.isna(value):
        return []

    pieces = []
    for chunk in str(value).split(","):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
|
| 42 |
+
|
| 43 |
+
def ensure_manual_bucket(manual_refs, official_refs, column_name):
    """Return the manual-reference container for *column_name*, creating it if absent.

    A newly created container mirrors the official one: a dict when the
    official entry for the column is a dict, otherwise a list. The new
    container is stored in *manual_refs* before being returned.
    """
    try:
        return manual_refs[column_name]
    except KeyError:
        pass

    official_is_mapping = isinstance(official_refs.get(column_name), dict)
    fresh_bucket = {} if official_is_mapping else []
    manual_refs[column_name] = fresh_bucket
    return fresh_bucket
|
| 47 |
+
|
| 48 |
+
def remember_approved_value(manual_refs, official_refs, column_name, approved_value):
    """Record the parts of an approved value in the manual references.

    The value is split with ``split_approved_parts``. A part is skipped when
    ``ref_contains`` reports it in the official references for the column or
    in the manual bucket. Remaining parts are appended to a list bucket, or
    stored under their ``normalize_ref`` key in a dict bucket.

    Returns:
        The number of parts added to the manual bucket.
    """
    bucket = ensure_manual_bucket(manual_refs, official_refs, column_name)
    newly_added = 0

    for candidate in split_approved_parts(approved_value):
        already_known = (
            ref_contains(official_refs.get(column_name, []), candidate)
            or ref_contains(bucket, candidate)
        )
        if already_known:
            continue

        if isinstance(bucket, list):
            bucket.append(candidate)
        elif isinstance(bucket, dict):
            bucket[normalize_ref(candidate)] = candidate
        else:
            # Unsupported container type: nothing is stored or counted.
            continue
        newly_added += 1

    return newly_added
|
| 66 |
+
|
| 67 |
+
if __name__ == "__main__":
    # Script entry point: write the reviewer's Human_Override values from the
    # blueprint back into the master workbook, then record approved values
    # (human overrides and accepted AI suggestions) in the manual references.
    args = parse_args()

    print("Loading Master Data, Blueprint, and Memory Files...")
    if not os.path.exists(args.blueprint):
        print(f"Error: No blueprint found at {args.blueprint} to apply.")
        # Exit with a non-zero status so callers can detect the failure.
        raise SystemExit(1)

    bp_df = pd.read_excel(args.blueprint)
    required_columns = {
        "Row_Index",
        "Column",
        "Original_Raw_Text",
        "AI_Suggested_Match",
        "Human_Override",
    }
    missing_columns = required_columns - set(bp_df.columns)
    if missing_columns:
        print(f"Error: Blueprint is missing required columns: {sorted(missing_columns)}")
        raise SystemExit(1)

    # Load the target Excel workbook
    wb = openpyxl.load_workbook(args.input)
    if args.sheet not in wb.sheetnames:
        print(f"Error: No '{args.sheet}' sheet found in {args.input}.")
        raise SystemExit(1)

    sheet = wb[args.sheet]
    # Header text (row 1, stripped) -> 1-based column index; empty headers are skipped.
    col_name_to_idx = {
        str(sheet.cell(row=1, column=c).value).strip(): c
        for c in range(1, sheet.max_column + 1)
        if sheet.cell(row=1, column=c).value
    }

    # Load the memory dictionaries using the synced CLI path
    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)

    changes_made = 0
    memory_additions = 0

    print("Applying manual overrides and updating memory...")
    for _, row in bp_df.iterrows():
        human_val = str(row['Human_Override']).strip() if pd.notna(row['Human_Override']) else ""

        # A human override wins; otherwise fall back to the AI suggestion.
        if human_val:
            approved_val = human_val
        elif pd.notna(row['AI_Suggested_Match']):
            approved_val = str(row['AI_Suggested_Match']).strip()
        else:
            approved_val = ""

        # "Confidence" is not one of the required columns, so read it
        # defensively instead of raising KeyError when it is absent.
        confidence_raw = row.get('Confidence')
        confidence = str(confidence_raw).strip().upper() if pd.notna(confidence_raw) else ""

        if not approved_val or approved_val in {"UNKNOWN", "LLM_Failed"}:
            continue

        raw_col = str(row['Column']).strip()

        if human_val:
            # 1. Update the Excel File
            try:
                excel_row = int(row['Row_Index'])
            except (TypeError, ValueError):
                print(f"Skipping override with invalid Row_Index: {row['Row_Index']}")
                continue

            if raw_col not in col_name_to_idx:
                print(f"Skipping override: column '{raw_col}' was not found in {args.sheet}.")
                continue

            if excel_row < 1 or excel_row > sheet.max_row:
                print(f"Skipping override: row {excel_row} is outside {args.sheet}.")
                continue

            col_idx = col_name_to_idx[raw_col]
            sheet.cell(row=excel_row, column=col_idx).value = human_val
            changes_made += 1

        # 2. Update Manual References for human overrides and accepted AI suggestions.
        # Degree values are never stored in the manual references.
        if raw_col == "Degree":
            continue

        # An AI suggestion the reviewer did not override is only remembered
        # when its confidence is not LOW.
        if not human_val and confidence == "LOW":
            continue

        memory_additions += remember_approved_value(
            manual_refs=manual_refs,
            official_refs=official_refs,
            column_name=raw_col,
            approved_value=approved_val,
        )

    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)

    # Save Excel
    wb.save(args.input)

    # Save JSONs
    # Make sure the data directory exists before dumping
    manual_refs_dir = os.path.dirname(args.manual_refs)
    if manual_refs_dir:
        os.makedirs(manual_refs_dir, exist_ok=True)
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    print(f"Success! {changes_made} corrections injected into {args.input}, sheet '{args.sheet}'.")
    print(f"Memory updated: {memory_additions} new approved values added.")
    print(f"Memory cleaned: {memory_pruned} duplicate manual reference values skipped/removed.")
    print(f"Memory updated: Human overrides dumped to {args.manual_refs}")
|
main.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import json
|
| 6 |
+
import openpyxl
|
| 7 |
+
from openpyxl.styles import PatternFill, Font
|
| 8 |
+
from openpyxl.utils import get_column_letter
|
| 9 |
+
from openpyxl.worksheet.datavalidation import DataValidation
|
| 10 |
+
from openpyxl.workbook.defined_name import DefinedName
|
| 11 |
+
|
| 12 |
+
# Import our new modular architecture
|
| 13 |
+
from src.config import parse_cli_args, GROQ_API_KEY, AVAILABLE_MODELS, DEFAULT_SIMILARITY_THRESHOLD
|
| 14 |
+
from src.llm_router import GroqRouter
|
| 15 |
+
from src.data_pipeline import process_column, cluster_degrees_by_institution
|
| 16 |
+
from src.utils import prune_manual_refs_against_official
|
| 17 |
+
|
| 18 |
+
# --- 1. CONFIGURATION ---
|
| 19 |
+
# Workbook columns handled by the pipeline, in processing order, each mapped to
# the regex handed to process_column as `split_pattern`. The "Degree" entry is
# routed to cluster_degrees_by_institution instead, which is not given the
# pattern in this file.
# NOTE(review): "Institution" uses a character class, so a literal "|" is also
# a separator there, whereas the alternation patterns on the other columns do
# not match "|" — confirm this difference is intended.
COLUMNS_CONFIG = {
    "Country": r',|;|\n|/',
    "Institution": r'[,/;|\n]',
    "Continent": r',|;|\n|/',
    "City": r',|;|\n|/',
    "Level": r'\n|;',
    "Language": r',|;|\n|/',
    "Tags": r',|;|\n|/',
    "Degree": r'\n|;'
}

# Shared cache dict passed as `master_cache` to every column-processing call.
master_cache = {}
|
| 31 |
+
|
| 32 |
+
def load_json_safe(filepath):
    """Return the JSON value stored in *filepath*.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark is
    tolerated. Nothing is caught here: I/O and JSON decoding errors
    propagate to the caller.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as source:
        document = json.load(source)
    return document
|
| 35 |
+
|
| 36 |
+
def validate_official_refs(official_refs):
    """Ensure every configured column except "Degree" has official references.

    A column fails the check when its entry in *official_refs* is absent, is
    neither a list nor a dict, or is empty.

    Raises:
        ValueError: naming every failing column, in COLUMNS_CONFIG order.
    """
    def _is_usable(ref_data):
        return isinstance(ref_data, (list, dict)) and len(ref_data) != 0

    missing = [
        column_name
        for column_name in COLUMNS_CONFIG
        if column_name != "Degree" and not _is_usable(official_refs.get(column_name))
    ]

    if not missing:
        return

    raise ValueError(
        "Official references are missing or empty for: "
        + ", ".join(missing)
        + ". Refusing to run because this would send too many values to Groq."
    )
|
| 52 |
+
|
| 53 |
+
def inject_searchable_dropdowns(blueprint_path, master_unique_lists):
    """Add per-category dropdown lists to the blueprint workbook.

    A hidden "Reference_Lists" sheet receives one column per category: the
    header is the category name with spaces replaced by underscores, followed
    by the category's non-empty string items in sorted order. Each non-empty
    column is exposed as a defined name with that same header. A list data
    validation is then attached to the "Human_Override" column of the active
    sheet; its formula resolves the defined name from the row's "Column"
    cell via INDIRECT. The workbook is saved back to *blueprint_path*.

    Args:
        blueprint_path: Path of the blueprint workbook to modify in place.
        master_unique_lists: Mapping of category name to an iterable of
            candidate values.
    """
    print("Injecting static searchable dropdowns into Blueprint...")
    workbook = openpyxl.load_workbook(blueprint_path)
    review_sheet = workbook.active

    # 1. Build the reference sheet, one column per category.
    lists_sheet = workbook.create_sheet(title="Reference_Lists")

    for position, (column_name, unique_items) in enumerate(master_unique_lists.items(), start=1):
        range_name = column_name.replace(" ", "_")
        lists_sheet.cell(row=1, column=position, value=range_name)

        # Keep only non-empty strings and alphabetize them for the reviewer.
        choices = [item for item in unique_items if item and isinstance(item, str)]
        choices.sort()

        next_row = 2
        for choice in choices:
            lists_sheet.cell(row=next_row, column=position, value=choice)
            next_row += 1

        # 2. Register a named range only when there is something to point at.
        if not choices:
            continue
        col_letter = get_column_letter(position)
        last_row = len(choices) + 1
        workbook.defined_names.add(
            DefinedName(
                name=range_name,
                attr_text=f"Reference_Lists!${col_letter}$2:${col_letter}${last_row}",
            )
        )

    # 3. Find the letters of the "Column" and "Human_Override" header cells.
    category_letter = None
    override_letter = None
    for header_cell in review_sheet[1]:
        label = header_cell.value
        if label == "Column":
            category_letter = get_column_letter(header_cell.column)
        elif label == "Human_Override":
            override_letter = get_column_letter(header_cell.column)

    # 4. Attach the validation to every data row of the override column.
    if category_letter and override_letter:
        validation = DataValidation(
            type="list",
            formula1=f'=INDIRECT(SUBSTITUTE(${category_letter}2, " ", "_"))',
            allowBlank=True,
            # Error popups stay off so reviewers can still type a free-text override.
            showErrorMessage=False,
        )
        validation.add(f"{override_letter}2:{override_letter}{review_sheet.max_row}")
        review_sheet.add_data_validation(validation)

    lists_sheet.sheet_state = 'hidden'
    workbook.save(blueprint_path)
    print("Dropdowns successfully injected!")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
if __name__ == "__main__":
    # Script entry point: clean the configured columns of the source sheet,
    # export a review blueprint, and write the cleaned values to a copy of
    # the source sheet inside the same workbook.

    # --- 2. INITIALIZATION ---
    args = parse_cli_args()
    source_sheet_name = args.sheet
    output_sheet_name = args.output_sheet
    available_models = [m.strip() for m in args.models.split(",") if m.strip()] if args.models else AVAILABLE_MODELS

    # Fail fast: the output sheet is deleted and recreated from the source
    # sheet, so the two names must differ. Checking here avoids running the
    # whole embedding / LLM pipeline before discovering the problem.
    if source_sheet_name == output_sheet_name:
        raise ValueError("Output sheet name cannot match the source sheet name.")

    print("Loading AI Model (this may take a few seconds)...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Initialize our LLM Router
    router = GroqRouter(api_key=GROQ_API_KEY, available_models=available_models)

    if not os.path.exists(args.refs):
        raise FileNotFoundError(f"Official references file not found: {args.refs}")

    if not os.path.exists(args.manual_refs):
        # Start from an empty manual-reference file. A bare file name has an
        # empty dirname, which os.makedirs rejects, so only create the
        # directory when there is one.
        manual_refs_dir = os.path.dirname(args.manual_refs)
        if manual_refs_dir:
            os.makedirs(manual_refs_dir, exist_ok=True)
        with open(args.manual_refs, 'w', encoding='utf-8') as f:
            json.dump({}, f)

    official_refs = load_json_safe(args.refs)
    manual_refs = load_json_safe(args.manual_refs)
    validate_official_refs(official_refs)
    memory_pruned = prune_manual_refs_against_official(manual_refs, official_refs)
    if memory_pruned:
        print(f"[INFO] Removed {memory_pruned} manual reference duplicate(s) already covered by official refs.")

    print(f"Loading Excel dataset from {args.input}, sheet '{source_sheet_name}'...")
    # Spreadsheet row 2 (index 1) is skipped, so the first data record comes
    # from spreadsheet row 3; the write-back below relies on that offset.
    data = pd.read_excel(args.input, sheet_name=source_sheet_name, skiprows=[1])

    # Initialize the global Blueprint Logger
    blueprint_records = []

    # --- 3. EXECUTE BATCH PIPELINE ---
    for col, pattern in COLUMNS_CONFIG.items():
        if col == "Degree":
            # Prefer the cleaned institution column when an earlier pass produced it.
            inst_col = 'Cleaned_Institution' if 'Cleaned_Institution' in data.columns else 'Institution'
            data = cluster_degrees_by_institution(
                df=data, degree_col=col, inst_col=inst_col, model=model,
                master_cache=master_cache, blueprint_data=blueprint_records,
                threshold=DEFAULT_SIMILARITY_THRESHOLD
            )
        else:
            data = process_column(
                df=data, column_name=col, model=model, groq_router=router,
                official_refs=official_refs, manual_refs=manual_refs, master_cache=master_cache,
                split_pattern=pattern, blueprint_data=blueprint_records
            )

    # --- 4. EXPORT RESULTS ---
    print("\nSaving all memory files...")
    with open(args.manual_refs, 'w', encoding='utf-8') as f:
        json.dump(manual_refs, f, indent=4, ensure_ascii=False)

    # 4a. Export the Blueprint for Human Review
    if blueprint_records:
        bp_df = pd.DataFrame(blueprint_records)
        bp_df.to_excel(args.blueprint, index=False)

        # --- Format the Blueprint Visually ---
        bp_wb = openpyxl.load_workbook(args.blueprint)
        bp_sheet = bp_wb.active

        header_fill = PatternFill(start_color="1F4E78", end_color="1F4E78", fill_type="solid")
        header_font = Font(color="FFFFFF", bold=True)
        high_fill = PatternFill(start_color="C6EFCE", end_color="C6EFCE", fill_type="solid")
        med_fill = PatternFill(start_color="FFEB9C", end_color="FFEB9C", fill_type="solid")
        low_fill = PatternFill(start_color="FFC7CE", end_color="FFC7CE", fill_type="solid")

        # Style the header row and remember where the Confidence column is.
        conf_col_idx = None
        for col_idx in range(1, bp_sheet.max_column + 1):
            cell = bp_sheet.cell(row=1, column=col_idx)
            cell.fill = header_fill
            cell.font = header_font
            if cell.value == "Confidence":
                conf_col_idx = col_idx
            bp_sheet.column_dimensions[get_column_letter(col_idx)].width = 30

        # Colour each Confidence cell according to its level.
        if conf_col_idx:
            for row_idx in range(2, bp_sheet.max_row + 1):
                cell = bp_sheet.cell(row=row_idx, column=conf_col_idx)
                val = str(cell.value).upper()
                if "HIGH" in val:
                    cell.fill = high_fill
                elif "MEDIUM" in val:
                    cell.fill = med_fill
                elif "LOW" in val:
                    cell.fill = low_fill

        bp_wb.save(args.blueprint)
        print(f"[!] Saved and formatted {len(bp_df)} rows requiring review to {args.blueprint}")

        # --- NEW: Build master lists and inject dropdowns ---
        def extract_uniques(ref_data):
            """Helper to extract names whether the memory file is a list or a dict"""
            if isinstance(ref_data, dict):
                return list(ref_data.values())
            elif isinstance(ref_data, list):
                return ref_data
            return []

        master_lists = {}
        for category in COLUMNS_CONFIG:
            off_items = extract_uniques(official_refs.get(category, []))
            man_items = extract_uniques(manual_refs.get(category, []))
            # Merge, deduplicate, and remove blanks
            master_lists[category] = list({x for x in (off_items + man_items) if x})

        inject_searchable_dropdowns(args.blueprint, master_lists)

    else:
        print("[!] No blueprint generated. All matches were HIGH confidence!")

    # 4b. Inject Cleaned Data to Mastermap
    print("\nOpening original Excel file to preserve formatting...")
    wb = openpyxl.load_workbook(args.input)
    new_sheet_name = output_sheet_name

    source_sheet = wb[source_sheet_name]

    # Replace any output sheet left over from a previous run with a fresh copy.
    if new_sheet_name in wb.sheetnames:
        del wb[new_sheet_name]
    new_sheet = wb.copy_worksheet(source_sheet)
    new_sheet.title = new_sheet_name

    # Header text (row 1) -> 1-based column index; empty headers are skipped.
    col_name_to_idx = {
        new_sheet.cell(row=1, column=c).value: c
        for c in range(1, new_sheet.max_column + 1)
        if new_sheet.cell(row=1, column=c).value
    }

    for row_idx, (_, row_data) in enumerate(data.iterrows()):
        # DataFrame position 0 corresponds to spreadsheet row 3 (header row
        # plus the skipped row 2).
        excel_row = row_idx + 3
        for col_name in COLUMNS_CONFIG:
            cleaned_col_name = f"Cleaned_{col_name}"
            if cleaned_col_name in data.columns and col_name in col_name_to_idx:
                new_value = row_data[cleaned_col_name]
                new_sheet.cell(row=excel_row, column=col_name_to_idx[col_name]).value = None if pd.isna(new_value) else new_value

    wb.save(args.input)
    print(f"\nSuccess! Initial pass saved. Please review {args.blueprint}.")
|
newest_model.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
from typing import Any
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Endpoint queried by fetch_groq_models for the list of models.
GROQ_MODELS_URL = "https://api.groq.com/openai/v1/models"

# Models selected from Groq's rate-limit table for this workload: relevant
# chat/reasoning models with at least 1K requests per day.
# List order is the fallback priority used by rank_model (earlier = preferred).
# NOTE(review): the two "llama-prompt-guard" entries look like classifier
# models rather than chat models — confirm they belong in a chat fallback list.
PREFERRED_PRODUCTION_CHAT_MODELS = [
    "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b",
    "qwen/qwen3-32b",
    "llama-3.1-8b-instant",
    "openai/gpt-oss-20b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
    "meta-llama/llama-prompt-guard-2-22m",
    "meta-llama/llama-prompt-guard-2-86m"
]

# A model whose lowercase id contains any of these substrings is rejected by
# is_active_chat_model.
EXCLUDED_MODEL_TERMS = [
    "whisper",
    "tts",
    "audio",
    "moderation",
    "embed",
    "compound",
    "safeguard"
]

# Lowercased preferred ids, as a set for membership tests.
PREFERRED_MODEL_IDS = {model_id.lower() for model_id in PREFERRED_PRODUCTION_CHAT_MODELS}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def fetch_groq_models(api_key: str) -> list[dict[str, Any]]:
    """Fetch the model list from ``GROQ_MODELS_URL``.

    Args:
        api_key: Key sent as a bearer token in the ``Authorization`` header.

    Returns:
        The ``"data"`` entry of the JSON response, or an empty list when the
        response has no such key.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error status.
    """
    reply = requests.get(
        GROQ_MODELS_URL,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        timeout=30,
    )
    reply.raise_for_status()
    payload = reply.json()
    return payload.get("data", [])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def is_active_chat_model(model: dict[str, Any]) -> bool:
    """Tell whether *model* is an acceptable chat model for this tool.

    A model is accepted when it has a non-empty id, its ``"active"`` field is
    not explicitly ``False``, its lowercase id is in ``PREFERRED_MODEL_IDS``,
    and that id contains none of the ``EXCLUDED_MODEL_TERMS`` substrings.
    """
    identifier = str(model.get("id", "")).lower()

    if not identifier or model.get("active") is False:
        return False
    if identifier not in PREFERRED_MODEL_IDS:
        return False

    for banned_term in EXCLUDED_MODEL_TERMS:
        if banned_term in identifier:
            return False
    return True
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def rank_model(model: dict[str, Any]) -> tuple[int, int, int, str]:
    """Build the sort key used for the "stable" ordering (lower sorts first).

    The key is ``(preferred_rank, -created, -context_window, id)``:
    ``preferred_rank`` is the model's case-insensitive position in
    ``PREFERRED_PRODUCTION_CHAT_MODELS``, or the list length when the id is
    not listed; newer creation times and larger context windows then sort
    earlier, with the id as the final tie-breaker.
    """
    identifier = str(model.get("id", ""))
    needle = identifier.lower()

    # Ids absent from the preferred list rank after every listed one.
    unlisted_rank = len(PREFERRED_PRODUCTION_CHAT_MODELS)
    preferred_rank = next(
        (
            position
            for position, known_id in enumerate(PREFERRED_PRODUCTION_CHAT_MODELS)
            if known_id.lower() == needle
        ),
        unlisted_rank,
    )

    created_at = int(model.get("created") or 0)
    window_size = int(model.get("context_window") or 0)

    return (preferred_rank, -created_at, -window_size, identifier)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def select_groq_chat_models(limit: int = len(PREFERRED_PRODUCTION_CHAT_MODELS), strategy: str = "stable") -> list[str]:
    """Return the ids of the accepted Groq chat models, best first.

    Args:
        limit: Maximum number of ids to return (applied as a slice).
        strategy: ``"latest"`` orders by newest creation time, then largest
            context window, then id; any other value orders by ``rank_model``.

    Returns:
        Model ids in the chosen order, truncated to *limit*.

    Raises:
        RuntimeError: when the ``GROQ_API_KEY`` environment variable is unset
            or empty.
    """
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError("GROQ_API_KEY is missing. Add it to .env first.")

    def _newest_first(model):
        return (
            -int(model.get("created") or 0),
            -int(model.get("context_window") or 0),
            str(model.get("id", "")),
        )

    candidates = filter(is_active_chat_model, fetch_groq_models(api_key))
    sort_key = _newest_first if strategy == "latest" else rank_model
    ordered = sorted(candidates, key=sort_key)

    return [entry["id"] for entry in ordered[:limit]]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def main() -> None:
    """Command-line entry point: print a recommended ``GROQ_MODEL`` value.

    Accepts ``--limit`` and ``--strategy``, selects models with
    ``select_groq_chat_models``, and prints the comma-joined ids followed by
    the numbered fallback order.

    Raises:
        RuntimeError: when no model is selected.
    """
    cli = argparse.ArgumentParser(description="Select currently available Groq chat models.")
    cli.add_argument(
        "--limit",
        type=int,
        default=len(PREFERRED_PRODUCTION_CHAT_MODELS),
        help="Number of fallback models to print.",
    )
    cli.add_argument(
        "--strategy",
        choices=["stable", "latest"],
        default="stable",
        help="stable prefers known production chat models; latest picks newest active chat models.",
    )
    options = cli.parse_args()

    selected = select_groq_chat_models(limit=options.limit, strategy=options.strategy)
    if not selected:
        raise RuntimeError("No active Groq chat models were returned by the Models API.")

    report_lines = [
        "Recommended GROQ_MODEL value:",
        f"GROQ_MODEL={','.join(selected)}",
        "\nSelected fallback order:",
    ]
    report_lines.extend(
        f"{position}. {model_id}" for position, model_id in enumerate(selected, start=1)
    )
    for line in report_lines:
        print(line)


if __name__ == "__main__":
    main()
|
refdata/manual_references.json
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Country": [
|
| 3 |
+
"Hong Kong"
|
| 4 |
+
],
|
| 5 |
+
"Institution": [
|
| 6 |
+
"Centre Interdisciplinaire de Formation à la Fonction Personnel",
|
| 7 |
+
"Faculté de médecine Paris-Descartes",
|
| 8 |
+
"Politecnico di Milano",
|
| 9 |
+
"INSTN",
|
| 10 |
+
"Université Paris-Est Marne-la-Vallée",
|
| 11 |
+
"CEMS",
|
| 12 |
+
"Conservatoire National Supérieur d'Art Dramatique (CNSAD)",
|
| 13 |
+
"EDC Paris Business School",
|
| 14 |
+
"EFREI",
|
| 15 |
+
"Catolica Lisbon School of Business & Economics",
|
| 16 |
+
"Ecole des Mines Paris - Université PSL",
|
| 17 |
+
"École d'urbanisme de Paris",
|
| 18 |
+
"ESMOD Paris",
|
| 19 |
+
"HEAD Genève (Haute école d'art et de design)",
|
| 20 |
+
"MODART International",
|
| 21 |
+
"University of Mumbai",
|
| 22 |
+
"Oxford Brookes University",
|
| 23 |
+
"Shanghai Advanced Institute of Finance",
|
| 24 |
+
"The University of Hong Kong (HKU)",
|
| 25 |
+
"Université de Montpellier Paul Valéry",
|
| 26 |
+
"IAE Paris",
|
| 27 |
+
"The University of Sydney Business School",
|
| 28 |
+
"Korea University Business School",
|
| 29 |
+
"EHL Business School",
|
| 30 |
+
"Technische Universität Dortmund"
|
| 31 |
+
],
|
| 32 |
+
"Continent": [],
|
| 33 |
+
"City": [
|
| 34 |
+
"Champs-sur-Marne",
|
| 35 |
+
"Croix",
|
| 36 |
+
"Hanoi",
|
| 37 |
+
"Pantin",
|
| 38 |
+
"Versailles",
|
| 39 |
+
"Mumbai",
|
| 40 |
+
"Oxford",
|
| 41 |
+
"Suresnes",
|
| 42 |
+
"Aubervilliers",
|
| 43 |
+
"Dortmund",
|
| 44 |
+
"Mannheim"
|
| 45 |
+
],
|
| 46 |
+
"Level": {},
|
| 47 |
+
"Language": {},
|
| 48 |
+
"Tags": [
|
| 49 |
+
"Sociology",
|
| 50 |
+
"Fashion Management",
|
| 51 |
+
"Geoeconomy",
|
| 52 |
+
"Humanitarian",
|
| 53 |
+
"Sustainable Finance",
|
| 54 |
+
"Fashion",
|
| 55 |
+
"Financial Risks",
|
| 56 |
+
"Digital Creativity",
|
| 57 |
+
"International Marketing",
|
| 58 |
+
"Sales Management",
|
| 59 |
+
"Data Management",
|
| 60 |
+
"Journalism",
|
| 61 |
+
"Arts & Culture",
|
| 62 |
+
"Environmental Governance",
|
| 63 |
+
"Neurology",
|
| 64 |
+
"Dentist",
|
| 65 |
+
"Financial Control",
|
| 66 |
+
"Biodiversity",
|
| 67 |
+
"Languages",
|
| 68 |
+
"Financial Engineering",
|
| 69 |
+
"Biology"
|
| 70 |
+
]
|
| 71 |
+
}
|
refdata/official_references.json
ADDED
|
@@ -0,0 +1,650 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Country": [
|
| 3 |
+
"Afghanistan",
|
| 4 |
+
"Albania",
|
| 5 |
+
"Algeria",
|
| 6 |
+
"Andorra",
|
| 7 |
+
"Angola",
|
| 8 |
+
"Antigua and Barbuda",
|
| 9 |
+
"Argentina",
|
| 10 |
+
"Armenia",
|
| 11 |
+
"Australia",
|
| 12 |
+
"Austria",
|
| 13 |
+
"Azerbaijan",
|
| 14 |
+
"Bahamas",
|
| 15 |
+
"Bahrain",
|
| 16 |
+
"Bangladesh",
|
| 17 |
+
"Barbados",
|
| 18 |
+
"Belarus",
|
| 19 |
+
"Belgium",
|
| 20 |
+
"Belize",
|
| 21 |
+
"Benin",
|
| 22 |
+
"Bhutan",
|
| 23 |
+
"Bolivia",
|
| 24 |
+
"Bosnia and Herzegovina",
|
| 25 |
+
"Botswana",
|
| 26 |
+
"Brazil",
|
| 27 |
+
"Brunei",
|
| 28 |
+
"Bulgaria",
|
| 29 |
+
"Burkina Faso",
|
| 30 |
+
"Burundi",
|
| 31 |
+
"Cabo Verde",
|
| 32 |
+
"Cambodia",
|
| 33 |
+
"Cameroon",
|
| 34 |
+
"Canada",
|
| 35 |
+
"Central African Republic",
|
| 36 |
+
"Chad",
|
| 37 |
+
"Chile",
|
| 38 |
+
"China",
|
| 39 |
+
"Colombia",
|
| 40 |
+
"Comoros",
|
| 41 |
+
"Congo",
|
| 42 |
+
"Costa Rica",
|
| 43 |
+
"Croatia",
|
| 44 |
+
"Cuba",
|
| 45 |
+
"Cyprus",
|
| 46 |
+
"Czech Republic",
|
| 47 |
+
"Democratic Republic of the Congo",
|
| 48 |
+
"Denmark",
|
| 49 |
+
"Djibouti",
|
| 50 |
+
"Dominica",
|
| 51 |
+
"Dominican Republic",
|
| 52 |
+
"Ecuador",
|
| 53 |
+
"Egypt",
|
| 54 |
+
"El Salvador",
|
| 55 |
+
"Equatorial Guinea",
|
| 56 |
+
"Eritrea",
|
| 57 |
+
"Estonia",
|
| 58 |
+
"Eswatini",
|
| 59 |
+
"Ethiopia",
|
| 60 |
+
"Fiji",
|
| 61 |
+
"Finland",
|
| 62 |
+
"France",
|
| 63 |
+
"Gabon",
|
| 64 |
+
"Gambia",
|
| 65 |
+
"Georgia",
|
| 66 |
+
"Germany",
|
| 67 |
+
"Ghana",
|
| 68 |
+
"Greece",
|
| 69 |
+
"Grenada",
|
| 70 |
+
"Guatemala",
|
| 71 |
+
"Guinea",
|
| 72 |
+
"Guinea-Bissau",
|
| 73 |
+
"Guyana",
|
| 74 |
+
"Haiti",
|
| 75 |
+
"Honduras",
|
| 76 |
+
"Hungary",
|
| 77 |
+
"Iceland",
|
| 78 |
+
"India",
|
| 79 |
+
"Indonesia",
|
| 80 |
+
"Iran",
|
| 81 |
+
"Iraq",
|
| 82 |
+
"Ireland",
|
| 83 |
+
"Israel",
|
| 84 |
+
"Italy",
|
| 85 |
+
"Ivory Coast",
|
| 86 |
+
"Jamaica",
|
| 87 |
+
"Japan",
|
| 88 |
+
"Jordan",
|
| 89 |
+
"Kazakhstan",
|
| 90 |
+
"Kenya",
|
| 91 |
+
"Kiribati",
|
| 92 |
+
"Kosovo",
|
| 93 |
+
"Kuwait",
|
| 94 |
+
"Kyrgyzstan",
|
| 95 |
+
"Laos",
|
| 96 |
+
"Latvia",
|
| 97 |
+
"Lebanon",
|
| 98 |
+
"Lesotho",
|
| 99 |
+
"Liberia",
|
| 100 |
+
"Libya",
|
| 101 |
+
"Liechtenstein",
|
| 102 |
+
"Lithuania",
|
| 103 |
+
"Luxembourg",
|
| 104 |
+
"Madagascar",
|
| 105 |
+
"Malawi",
|
| 106 |
+
"Malaysia",
|
| 107 |
+
"Maldives",
|
| 108 |
+
"Mali",
|
| 109 |
+
"Malta",
|
| 110 |
+
"Marshall Islands",
|
| 111 |
+
"Mauritania",
|
| 112 |
+
"Mauritius",
|
| 113 |
+
"Mexico",
|
| 114 |
+
"Micronesia",
|
| 115 |
+
"Moldova",
|
| 116 |
+
"Monaco",
|
| 117 |
+
"Mongolia",
|
| 118 |
+
"Montenegro",
|
| 119 |
+
"Morocco",
|
| 120 |
+
"Mozambique",
|
| 121 |
+
"Myanmar",
|
| 122 |
+
"Namibia",
|
| 123 |
+
"Nauru",
|
| 124 |
+
"Nepal",
|
| 125 |
+
"Netherlands",
|
| 126 |
+
"New Zealand",
|
| 127 |
+
"Nicaragua",
|
| 128 |
+
"Niger",
|
| 129 |
+
"Nigeria",
|
| 130 |
+
"North Korea",
|
| 131 |
+
"North Macedonia",
|
| 132 |
+
"Norway",
|
| 133 |
+
"Oman",
|
| 134 |
+
"Pakistan",
|
| 135 |
+
"Palau",
|
| 136 |
+
"Palestine",
|
| 137 |
+
"Panama",
|
| 138 |
+
"Papua New Guinea",
|
| 139 |
+
"Paraguay",
|
| 140 |
+
"Peru",
|
| 141 |
+
"Philippines",
|
| 142 |
+
"Poland",
|
| 143 |
+
"Portugal",
|
| 144 |
+
"Qatar",
|
| 145 |
+
"Romania",
|
| 146 |
+
"Russia",
|
| 147 |
+
"Rwanda",
|
| 148 |
+
"Saint Kitts and Nevis",
|
| 149 |
+
"Saint Lucia",
|
| 150 |
+
"Saint Vincent and the Grenadines",
|
| 151 |
+
"Samoa",
|
| 152 |
+
"San Marino",
|
| 153 |
+
"Sao Tome and Principe",
|
| 154 |
+
"Saudi Arabia",
|
| 155 |
+
"Senegal",
|
| 156 |
+
"Serbia",
|
| 157 |
+
"Seychelles",
|
| 158 |
+
"Sierra Leone",
|
| 159 |
+
"Singapore",
|
| 160 |
+
"Slovakia",
|
| 161 |
+
"Slovenia",
|
| 162 |
+
"Solomon Islands",
|
| 163 |
+
"Somalia",
|
| 164 |
+
"South Africa",
|
| 165 |
+
"South Korea",
|
| 166 |
+
"South Sudan",
|
| 167 |
+
"Spain",
|
| 168 |
+
"Sri Lanka",
|
| 169 |
+
"Sudan",
|
| 170 |
+
"Suriname",
|
| 171 |
+
"Sweden",
|
| 172 |
+
"Switzerland",
|
| 173 |
+
"Syria",
|
| 174 |
+
"Taiwan",
|
| 175 |
+
"Tajikistan",
|
| 176 |
+
"Tanzania",
|
| 177 |
+
"Thailand",
|
| 178 |
+
"Timor-Leste",
|
| 179 |
+
"Togo",
|
| 180 |
+
"Tonga",
|
| 181 |
+
"Trinidad and Tobago",
|
| 182 |
+
"Tunisia",
|
| 183 |
+
"Turkey",
|
| 184 |
+
"Turkmenistan",
|
| 185 |
+
"Tuvalu",
|
| 186 |
+
"Uganda",
|
| 187 |
+
"Ukraine",
|
| 188 |
+
"United Arab Emirates",
|
| 189 |
+
"United Kingdom",
|
| 190 |
+
"United States",
|
| 191 |
+
"Uruguay",
|
| 192 |
+
"Uzbekistan",
|
| 193 |
+
"Vanuatu",
|
| 194 |
+
"Vatican City",
|
| 195 |
+
"Venezuela",
|
| 196 |
+
"Vietnam",
|
| 197 |
+
"Yemen",
|
| 198 |
+
"Zambia",
|
| 199 |
+
"Zimbabwe"
|
| 200 |
+
],
|
| 201 |
+
"Institution": [
|
| 202 |
+
"AgroParisTech (Institut des sciences et industries du vivant et de l'environnement)",
|
| 203 |
+
"Aix Marseille Université (AMU)",
|
| 204 |
+
"Alliance Manchester Business School (The University of Manchester)",
|
| 205 |
+
"American University",
|
| 206 |
+
"Audencia Business School",
|
| 207 |
+
"Babson College",
|
| 208 |
+
"Bayes Business School (formerly Cass Business School)",
|
| 209 |
+
"Berklee College of Music",
|
| 210 |
+
"Bocconi University (Università Commerciale Luigi Bocconi)",
|
| 211 |
+
"Brandeis University",
|
| 212 |
+
"Bucharest University of Economic Studies (ASE)",
|
| 213 |
+
"Burgundy School of Business (BSB)",
|
| 214 |
+
"Cambridge Judge Business School",
|
| 215 |
+
"CELSA Sorbonne Université",
|
| 216 |
+
"CEMS (The Global Alliance in Management Education)",
|
| 217 |
+
"CentraleSupélec",
|
| 218 |
+
"CFA Dordogne Périgueux",
|
| 219 |
+
"Charité Universitätsmedizin Berlin",
|
| 220 |
+
"Charles University of Prague",
|
| 221 |
+
"CIFFOP (Université Paris 1 Panthéon Sorbonne)",
|
| 222 |
+
"Columbia University",
|
| 223 |
+
"Conservatoire National des Arts et Métiers (Le Cnam)",
|
| 224 |
+
"Conservatoire National Supérieur d'Art Dramatique (CNSAD PSL)",
|
| 225 |
+
"Copenhagen Business School (CBS)",
|
| 226 |
+
"Cornell University",
|
| 227 |
+
"Cours Florent",
|
| 228 |
+
"Coventry University",
|
| 229 |
+
"CY Cergy Paris Université",
|
| 230 |
+
"Dublin City University (DCU)",
|
| 231 |
+
"EAC (École d'Art et de Culture)",
|
| 232 |
+
"EAE Business School",
|
| 233 |
+
"École 42",
|
| 234 |
+
"École des Hautes Études en Sciences Sociales (EHESS)",
|
| 235 |
+
"École Hôtelière de Paris CFA Médéric",
|
| 236 |
+
"École Polytechnique",
|
| 237 |
+
"EDHEC Business School (École des Hautes Études Commerciales du Nord)",
|
| 238 |
+
"EFAP (École des Nouveaux Métiers de la Communication)",
|
| 239 |
+
"EM Strasbourg Business School",
|
| 240 |
+
"EMIC (École de Management des Industries Créatives)",
|
| 241 |
+
"Emlyon Business School (EM Lyon)",
|
| 242 |
+
"Epitech Digital (École pour l'informatique et les nouvelles technologies)",
|
| 243 |
+
"Erasmus University Rotterdam",
|
| 244 |
+
"ESADE Business School (Ramon Llull University)",
|
| 245 |
+
"ESCP Business School (École Supérieure de Commerce de Paris)",
|
| 246 |
+
"ESG Group",
|
| 247 |
+
"ESI Business School",
|
| 248 |
+
"ESIC Business & Marketing School",
|
| 249 |
+
"ESPI (École Supérieure des Professions Immobilières)",
|
| 250 |
+
"ESSCA School of Management (École Supérieure des Sciences Commerciales d'Angers)",
|
| 251 |
+
"ESSEC Business School",
|
| 252 |
+
"ESTICE International Business School",
|
| 253 |
+
"FERRANDI Paris",
|
| 254 |
+
"Fordham University",
|
| 255 |
+
"Frankfurt School of Finance & Management",
|
| 256 |
+
"Freie Universität Berlin",
|
| 257 |
+
"Fu Jen Catholic University",
|
| 258 |
+
"Fudan University",
|
| 259 |
+
"George Brown College",
|
| 260 |
+
"Glion Institute of Higher Education",
|
| 261 |
+
"GOBELINS Paris (l'école de l'image)",
|
| 262 |
+
"Graduate School of Management (GSOM) St. Petersburg University",
|
| 263 |
+
"Grenoble École de Management (GEM)",
|
| 264 |
+
"Hanyang University",
|
| 265 |
+
"Haute École de Joaillerie (HEJ)",
|
| 266 |
+
"HEC Lausanne (Université de Lausanne)",
|
| 267 |
+
"HEC Montréal (École des Hautes Études Commerciales de Montréal)",
|
| 268 |
+
"HEC Paris (École des Hautes Études Commerciales de Paris)",
|
| 269 |
+
"HEIP (Hautes Études Internationales et Politiques)",
|
| 270 |
+
"Hertie School",
|
| 271 |
+
"Hong Kong University of Science and Technology (HKUST)",
|
| 272 |
+
"Humboldt Universität zu Berlin",
|
| 273 |
+
"IAE Aix Marseille Graduate School of Management",
|
| 274 |
+
"IAE Lille University School of Management",
|
| 275 |
+
"IAE Paris Sorbonne Business School",
|
| 276 |
+
"IAE Paris Est (formerly IAE Gustave Eiffel)",
|
| 277 |
+
"IAE Versailles (Université Paris Saclay)",
|
| 278 |
+
"IAE Lyon School of Management",
|
| 279 |
+
"IAE Creteil",
|
| 280 |
+
"ICART (L'école du management de la culture et du marché de l'art)",
|
| 281 |
+
"IE Business School (IE University)",
|
| 282 |
+
"Institut d'études politiques (IEP)",
|
| 283 |
+
"IESEG School of Management (Institut d'Économie Scientifique et de Gestion)",
|
| 284 |
+
"IFP School (Institut Français du Pétrole)",
|
| 285 |
+
"IGS RH (Institut de Gestion Sociale Ressources Humaines)",
|
| 286 |
+
"IIM Digital School (Institut de l'Internet et du Multimédia)",
|
| 287 |
+
"Imperial College Business School",
|
| 288 |
+
"Imperial College London",
|
| 289 |
+
"IMPGT Aix Marseille Université (Institut de Management Public et Gouvernance Territoriale)",
|
| 290 |
+
"IMT Atlantique (École Nationale Supérieure des Mines de Nantes et Télécom Bretagne)",
|
| 291 |
+
"INSEAD (Institut Européen d'Administration des Affaires)",
|
| 292 |
+
"INSEEC (Institut des Hautes Études Économiques et Commerciales)",
|
| 293 |
+
"Institut Catholique de Paris (ICP)",
|
| 294 |
+
"Institut Français de la Mode (IFM)",
|
| 295 |
+
"Institut Léonard de Vinci (ILV)",
|
| 296 |
+
"Institut Lyfe (formerly Institut Paul Bocuse)",
|
| 297 |
+
"Institut Magellan",
|
| 298 |
+
"Institut Polytechnique de Paris (IP Paris)",
|
| 299 |
+
"INSTN (Institut National des Sciences et Techniques Nucléaires)",
|
| 300 |
+
"International University of Monaco (IUM)",
|
| 301 |
+
"IQS School of Management (Universitat Ramon Llull)",
|
| 302 |
+
"IRIS Sup' (Institut de Relations Internationales et Stratégiques)",
|
| 303 |
+
"ISAE SUPAERO (Institut Supérieur de l'Aéronautique et de l'Espace)",
|
| 304 |
+
"ISC Paris Business School (Institut Supérieur du Commerce de Paris)",
|
| 305 |
+
"ISCOM (Institut Supérieur de Communication et Publicité)",
|
| 306 |
+
"ISIPCA (Institut Supérieur International du Parfum de la Cosmétique et de l'Aromatique alimentaire)",
|
| 307 |
+
"IULM University (Libera Università di Lingue e Comunicazione)",
|
| 308 |
+
"Ivey Business School (Western University)",
|
| 309 |
+
"Kedge Business School",
|
| 310 |
+
"King's College London",
|
| 311 |
+
"KU Leuven",
|
| 312 |
+
"Les Roches Global Hospitality Education",
|
| 313 |
+
"London Business School (LBS)",
|
| 314 |
+
"London College of Fashion (University of the Arts London)",
|
| 315 |
+
"London Film School",
|
| 316 |
+
"London School of Economics and Political Science (LSE)",
|
| 317 |
+
"Loughborough University London",
|
| 318 |
+
"Loyola Marymount University",
|
| 319 |
+
"LUISS Guido Carli (Libera Università Internazionale degli Studi Sociali)",
|
| 320 |
+
"Lund University",
|
| 321 |
+
"MBA ESG",
|
| 322 |
+
"MGIMO University (Moscow State Institute of International Relations)",
|
| 323 |
+
"Monash University",
|
| 324 |
+
"National University of Singapore (NUS)",
|
| 325 |
+
"NEOMA Business School",
|
| 326 |
+
"New York University (NYU)",
|
| 327 |
+
"Newcastle University",
|
| 328 |
+
"Northumbria University",
|
| 329 |
+
"Nova School of Business and Economics",
|
| 330 |
+
"Paris Executive Business School (PEBS)",
|
| 331 |
+
"Paris School of Business (PSB)",
|
| 332 |
+
"Penninghen (École de direction artistique et d'architecture intérieure)",
|
| 333 |
+
"Plekhanov Russian University of Economics",
|
| 334 |
+
"POLIMI Graduate School of Management",
|
| 335 |
+
"Polytech Nice Sophia (Université Côte d'Azur)",
|
| 336 |
+
"Prague University of Economics and Business (VSE)",
|
| 337 |
+
"Queen's University",
|
| 338 |
+
"Renmin University of China",
|
| 339 |
+
"Rotterdam School of Management Erasmus University (RSM)",
|
| 340 |
+
"Royal College of Art (RCA)",
|
| 341 |
+
"Ruhr University Bochum",
|
| 342 |
+
"School of International and Public Affairs (SIPA) at Columbia University",
|
| 343 |
+
"Sciences Po (Institut d'études politiques de Paris)",
|
| 344 |
+
"Sciences Po Grenoble UGA",
|
| 345 |
+
"SDA Bocconi School of Management",
|
| 346 |
+
"Seoul National University",
|
| 347 |
+
"Shanghai Jiao Tong University",
|
| 348 |
+
"Singapore Management University (SMU)",
|
| 349 |
+
"SKEMA Business School (School of Knowledge Economy and Management)",
|
| 350 |
+
"Smith School of Business at Queen's University",
|
| 351 |
+
"Sophia University",
|
| 352 |
+
"Sorbonne Paris Nord University",
|
| 353 |
+
"Sorbonne Université",
|
| 354 |
+
"Sotheby's Institute of Art",
|
| 355 |
+
"Stockholm School of Economics (SSE)",
|
| 356 |
+
"Sup de Luxe (Institut Supérieur de Marketing du Luxe)",
|
| 357 |
+
"TBS Education (Toulouse Business School)",
|
| 358 |
+
"Technische Universität Berlin (TU Berlin)",
|
| 359 |
+
"Télécom Paris (École Nationale Supérieure des Télécommunications)",
|
| 360 |
+
"The University of Edinburgh",
|
| 361 |
+
"The University of Melbourne",
|
| 362 |
+
"The University of Sydney (USYD)",
|
| 363 |
+
"Trinity College Dublin",
|
| 364 |
+
"Tsinghua University",
|
| 365 |
+
"UCD Michael Smurfit Graduate Business School",
|
| 366 |
+
"Universidad Carlos III de Madrid (UC3M)",
|
| 367 |
+
"Universidad Europea de Madrid (UEM)",
|
| 368 |
+
"Universidade Católica Portuguesa",
|
| 369 |
+
"Université Côte d'Azur",
|
| 370 |
+
"Université de Strasbourg",
|
| 371 |
+
"Université Gustave Eiffel",
|
| 372 |
+
"Université Jean Moulin Lyon 3",
|
| 373 |
+
"Université Laval",
|
| 374 |
+
"Université Libre de Bruxelles (ULB)",
|
| 375 |
+
"Université Paris 1 Panthéon Sorbonne",
|
| 376 |
+
"Université Paris 8 Vincennes Saint Denis",
|
| 377 |
+
"Université Paris Cité",
|
| 378 |
+
"Université Paris Dauphine PSL",
|
| 379 |
+
"Université Paris Nanterre",
|
| 380 |
+
"Université Paris Panthéon Assas",
|
| 381 |
+
"Université Paris Saclay",
|
| 382 |
+
"Université Sorbonne Nouvelle (Paris 3)",
|
| 383 |
+
"Université Toulouse Capitole",
|
| 384 |
+
"University College Dublin (UCD)",
|
| 385 |
+
"University College London (UCL)",
|
| 386 |
+
"Université de Nice",
|
| 387 |
+
"University of Amsterdam (UvA)",
|
| 388 |
+
"University of California Los Angeles (UCLA)",
|
| 389 |
+
"University of Delaware",
|
| 390 |
+
"University of Freiburg (Albert Ludwigs Universität Freiburg)",
|
| 391 |
+
"University of Fribourg",
|
| 392 |
+
"University of Glasgow",
|
| 393 |
+
"University of Groningen",
|
| 394 |
+
"University of London",
|
| 395 |
+
"University of Mannheim",
|
| 396 |
+
"University of Pennsylvania (UPenn)",
|
| 397 |
+
"University of Rochester",
|
| 398 |
+
"University of Seoul",
|
| 399 |
+
"University of South Carolina",
|
| 400 |
+
"University of Southern California (USC)",
|
| 401 |
+
"University of St. Gallen (HSG)",
|
| 402 |
+
"University of the Arts London (UAL)",
|
| 403 |
+
"University of Trento",
|
| 404 |
+
"University of Warwick",
|
| 405 |
+
"Warwick Business School (WBS)",
|
| 406 |
+
"Washington University in St. Louis",
|
| 407 |
+
"WHU Otto Beisheim School of Management",
|
| 408 |
+
"WU (Vienna University of Economics and Business)",
|
| 409 |
+
"Xi'an Jiaotong Liverpool University (XJTLU)",
|
| 410 |
+
"Zhejiang A&F University (ZAFU)"
|
| 411 |
+
],
|
| 412 |
+
"Continent": [
|
| 413 |
+
"Africa",
|
| 414 |
+
"America",
|
| 415 |
+
"Asia",
|
| 416 |
+
"Australia",
|
| 417 |
+
"Europe",
|
| 418 |
+
"Middle East",
|
| 419 |
+
"North America",
|
| 420 |
+
"Oceania",
|
| 421 |
+
"South America"
|
| 422 |
+
],
|
| 423 |
+
"City": [
|
| 424 |
+
"Abu Dhabi",
|
| 425 |
+
"Aix-en-Provence",
|
| 426 |
+
"Amsterdam",
|
| 427 |
+
"Barcelona",
|
| 428 |
+
"Beijing",
|
| 429 |
+
"Belo Horizonte",
|
| 430 |
+
"Berlin",
|
| 431 |
+
"Bochum",
|
| 432 |
+
"Bordeaux",
|
| 433 |
+
"Boulogne-Billancourt",
|
| 434 |
+
"Brussels",
|
| 435 |
+
"Bucharest",
|
| 436 |
+
"Bulle",
|
| 437 |
+
"Cambridge",
|
| 438 |
+
"Carcavelos",
|
| 439 |
+
"Casablanca",
|
| 440 |
+
"Cergy",
|
| 441 |
+
"Cergy-Pontoise",
|
| 442 |
+
"Clayton",
|
| 443 |
+
"Copenhagen",
|
| 444 |
+
"Courbevoie",
|
| 445 |
+
"Coventry",
|
| 446 |
+
"Creteil",
|
| 447 |
+
"Dublin",
|
| 448 |
+
"Ecully",
|
| 449 |
+
"Edinburgh",
|
| 450 |
+
"Fontainebleau",
|
| 451 |
+
"Frankfurt",
|
| 452 |
+
"Fribourg",
|
| 453 |
+
"Gif-sur-Yvette",
|
| 454 |
+
"Glasgow",
|
| 455 |
+
"Grenoble",
|
| 456 |
+
"Groningen",
|
| 457 |
+
"Ho Chi Minh",
|
| 458 |
+
"Hong Kong",
|
| 459 |
+
"Jouy-en-Josas",
|
| 460 |
+
"Kingston",
|
| 461 |
+
"Lancaster",
|
| 462 |
+
"Las Vegas",
|
| 463 |
+
"Lausanne",
|
| 464 |
+
"Leuven",
|
| 465 |
+
"Lille",
|
| 466 |
+
"Lisbon",
|
| 467 |
+
"London",
|
| 468 |
+
"Los Angeles",
|
| 469 |
+
"Lund",
|
| 470 |
+
"Lyon",
|
| 471 |
+
"Madrid",
|
| 472 |
+
"Malaga",
|
| 473 |
+
"Manchester",
|
| 474 |
+
"Marne-la-Vallée",
|
| 475 |
+
"Marseille",
|
| 476 |
+
"Melbourne",
|
| 477 |
+
"Milan",
|
| 478 |
+
"Monaco",
|
| 479 |
+
"Montpellier",
|
| 480 |
+
"Montreal",
|
| 481 |
+
"Moscow",
|
| 482 |
+
"Munich",
|
| 483 |
+
"Nantes",
|
| 484 |
+
"New York",
|
| 485 |
+
"Newark",
|
| 486 |
+
"Newcastle",
|
| 487 |
+
"Newcastle upon Tyne",
|
| 488 |
+
"Nice",
|
| 489 |
+
"North Carolina",
|
| 490 |
+
"Oulu",
|
| 491 |
+
"Palaiseau",
|
| 492 |
+
"Paris",
|
| 493 |
+
"Périgueux",
|
| 494 |
+
"Philadelphia",
|
| 495 |
+
"Prague",
|
| 496 |
+
"Puteaux",
|
| 497 |
+
"Quebec",
|
| 498 |
+
"Rabat",
|
| 499 |
+
"Reims",
|
| 500 |
+
"Rochester",
|
| 501 |
+
"Rome",
|
| 502 |
+
"Rotterdam",
|
| 503 |
+
"Roubaix",
|
| 504 |
+
"Rouen",
|
| 505 |
+
"Saint Aubin",
|
| 506 |
+
"Saint Denis",
|
| 507 |
+
"Saint-Petersbourg",
|
| 508 |
+
"Seoul",
|
| 509 |
+
"Serris",
|
| 510 |
+
"Shanghai",
|
| 511 |
+
"Shenzhen",
|
| 512 |
+
"Singapore",
|
| 513 |
+
"Sophia Antipolis",
|
| 514 |
+
"St Gallen",
|
| 515 |
+
"Stockholm",
|
| 516 |
+
"Strasbourg",
|
| 517 |
+
"Suzhou",
|
| 518 |
+
"Sydney",
|
| 519 |
+
"Taipei",
|
| 520 |
+
"Tokyo",
|
| 521 |
+
"Toronto",
|
| 522 |
+
"Toulouse",
|
| 523 |
+
"Tunis",
|
| 524 |
+
"Turin",
|
| 525 |
+
"Valencia",
|
| 526 |
+
"Vallendar",
|
| 527 |
+
"Velizy",
|
| 528 |
+
"Vienne",
|
| 529 |
+
"Villetaneuse",
|
| 530 |
+
"Waltham",
|
| 531 |
+
"Warsaw",
|
| 532 |
+
"Washington",
|
| 533 |
+
"Wellesley",
|
| 534 |
+
"Zhejiang"
|
| 535 |
+
],
|
| 536 |
+
"Level": {
|
| 537 |
+
"Master": "Master 2 (BAC+5) / Master Degree",
|
| 538 |
+
"Specialised Master": "Mastère Spécialisé/Advanced Master",
|
| 539 |
+
"Mastere": "Mastère Spécialisé/Advanced Master",
|
| 540 |
+
"MBA": "MBA",
|
| 541 |
+
"Doctorat": "Doctorat / PhD",
|
| 542 |
+
"Preparationdunconcours": "CAP",
|
| 543 |
+
"Maitrise": "Master 1 (BAC+4) / Graduate",
|
| 544 |
+
"Licence": "Licence (BAC+3) / pengraduate",
|
| 545 |
+
"Chineselanguage": "Autre",
|
| 546 |
+
"Other kind of studies": "Autre",
|
| 547 |
+
"Maîtrise": "Master 1 (BAC+4) / Graduate",
|
| 548 |
+
"Preparation for a competitive examination": "Preparation for a competitive exam",
|
| 549 |
+
"L3": "Licence (BAC+3) / pengraduate",
|
| 550 |
+
"Bachelor": "Licence (BAC+3) / pengraduate"
|
| 551 |
+
},
|
| 552 |
+
"Language": {
|
| 553 |
+
"French": "French",
|
| 554 |
+
"Français": "French",
|
| 555 |
+
"Francais": "French",
|
| 556 |
+
"FR": "French",
|
| 557 |
+
"English": "English",
|
| 558 |
+
"Anglais": "English",
|
| 559 |
+
"EN": "English",
|
| 560 |
+
"German": "German",
|
| 561 |
+
"Deutsch": "German",
|
| 562 |
+
"Allemand": "German",
|
| 563 |
+
"DE": "German",
|
| 564 |
+
"Spanish": "Spanish",
|
| 565 |
+
"Español": "Spanish",
|
| 566 |
+
"Espagnol": "Spanish",
|
| 567 |
+
"ES": "Spanish",
|
| 568 |
+
"Chinese": "Chinese",
|
| 569 |
+
"Mandarin": "Chinese",
|
| 570 |
+
"Cantonese": "Chinese",
|
| 571 |
+
"Chinois": "Chinese",
|
| 572 |
+
"CN": "Chinese",
|
| 573 |
+
"Italian": "Italian",
|
| 574 |
+
"Italiano": "Italian",
|
| 575 |
+
"Italien": "Italian",
|
| 576 |
+
"IT": "Italian",
|
| 577 |
+
"Portuguese": "Portuguese",
|
| 578 |
+
"Português": "Portuguese",
|
| 579 |
+
"Portugues": "Portuguese",
|
| 580 |
+
"Portugais": "Portuguese",
|
| 581 |
+
"PT": "Portuguese",
|
| 582 |
+
"Japanese": "Japanese",
|
| 583 |
+
"Nihongo": "Japanese",
|
| 584 |
+
"Japonais": "Japanese",
|
| 585 |
+
"JP": "Japanese",
|
| 586 |
+
"Korean": "Korean",
|
| 587 |
+
"Hangul": "Korean",
|
| 588 |
+
"Coréen": "Korean",
|
| 589 |
+
"Coreen": "Korean",
|
| 590 |
+
"KR": "Korean",
|
| 591 |
+
"Arabic": "Arabic",
|
| 592 |
+
"Arabe": "Arabic",
|
| 593 |
+
"AR": "Arabic",
|
| 594 |
+
"Russian": "Russian",
|
| 595 |
+
"Russe": "Russian",
|
| 596 |
+
"RU": "Russian",
|
| 597 |
+
"Dutch": "Dutch",
|
| 598 |
+
"Nederlands": "Dutch",
|
| 599 |
+
"Néerlandais": "Dutch",
|
| 600 |
+
"Neerlandais": "Dutch",
|
| 601 |
+
"NL": "Dutch",
|
| 602 |
+
"Hindi": "Hindi",
|
| 603 |
+
"Indien": "Hindi"
|
| 604 |
+
},
|
| 605 |
+
"Tags": [
|
| 606 |
+
"Accounting",
|
| 607 |
+
"Audit & Control",
|
| 608 |
+
"Architecture & Urban Planning",
|
| 609 |
+
"Arts & Cultural Management",
|
| 610 |
+
"Banking & Insurance",
|
| 611 |
+
"Business Administration & Management",
|
| 612 |
+
"Communication & Media",
|
| 613 |
+
"Consulting & Professional Services",
|
| 614 |
+
"Cybersecurity",
|
| 615 |
+
"Data Science & Analytics",
|
| 616 |
+
"Design Management",
|
| 617 |
+
"Digital Marketing",
|
| 618 |
+
"Digital Transformation & Tech Management",
|
| 619 |
+
"Economics",
|
| 620 |
+
"Economics & Social Sciences",
|
| 621 |
+
"Energy Management",
|
| 622 |
+
"Engineering Management",
|
| 623 |
+
"Entrepreneurship & Innovation",
|
| 624 |
+
"Finance & Corporate Finance",
|
| 625 |
+
"Healthcare Management",
|
| 626 |
+
"Hospitality & Culinary Management",
|
| 627 |
+
"Human Resources (HR)",
|
| 628 |
+
"Humanities & Social Sciences",
|
| 629 |
+
"Information Systems & IT Management",
|
| 630 |
+
"International Business",
|
| 631 |
+
"International Security & Defense",
|
| 632 |
+
"Law",
|
| 633 |
+
"Luxury Management",
|
| 634 |
+
"Marketing",
|
| 635 |
+
"Marketing Research & Insights",
|
| 636 |
+
"NGO & Social Impact",
|
| 637 |
+
"Organizational Psychology",
|
| 638 |
+
"Political Science & International Relations",
|
| 639 |
+
"Public & Organizational Management",
|
| 640 |
+
"Real Estate & Asset Management",
|
| 641 |
+
"Real Estate & Urban Planning",
|
| 642 |
+
"Sales & Business Development",
|
| 643 |
+
"Sports Management",
|
| 644 |
+
"Strategy & Strategic Management",
|
| 645 |
+
"Supply Chain & Purchasing",
|
| 646 |
+
"Sustainability & Environmental Management",
|
| 647 |
+
"Tourism & Leisure",
|
| 648 |
+
"Video Games & Creative Tech"
|
| 649 |
+
]
|
| 650 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
openpyxl
|
| 3 |
+
sentence_transformers
|
| 4 |
+
torch
|
| 5 |
+
tqdm
|
| 6 |
+
requests
|
| 7 |
+
python-dotenv
|
| 8 |
+
tenacity
|
| 9 |
+
flask
|
| 10 |
+
gunicorn
|
| 11 |
+
huggingface_hub
|
src/config.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
+
# Load environment variables (python-dotenv) before any os.getenv lookups.
load_dotenv()

# --- ENVIRONMENT VARIABLES to be set up in .env ---
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
RAW_MODELS = os.getenv("GROQ_MODEL", "")

# Split the comma-separated GROQ_MODEL value into trimmed, non-empty names.
AVAILABLE_MODELS = [
    name
    for name in (piece.strip() for piece in RAW_MODELS.split(","))
    if name
]

# --- PIPELINE THRESHOLDS & CONSTANTS ---
DEFAULT_SIMILARITY_THRESHOLD = 0.93
TOP_K_CANDIDATES = 5

# Default sheet names and the directories bare file names are resolved into.
SHEET_NAME = "Data"
DATA_DIR = "data"
REFDATA_DIR = "refdata"
DEFAULT_OUTPUT_SHEET_NAME = "Cleaned_Data"

# Default file names used by the command-line options.
DEFAULT_BLUEPRINT_FILE = "Blueprint.xlsx"
DEFAULT_OFFICIAL_REFS_FILE = "official_references.json"
DEFAULT_MANUAL_REFS_FILE = "manual_references.json"
|
| 27 |
+
|
| 28 |
+
def _resolve_in_dir(file_arg, base_dir):
    """Return *file_arg* joined onto *base_dir* when it is a bare file name.

    Args:
        file_arg: File name or path as supplied by the caller. Falsy values
            (``None``, ``""``) are returned unchanged.
        base_dir: Directory that bare file names are placed under.

    Returns:
        ``file_arg`` untouched when it is falsy, absolute, or already has a
        directory component; otherwise ``os.path.join(base_dir, file_arg)``.
    """
    if not file_arg:
        return file_arg
    # An absolute path or any directory component means the caller already
    # chose a location, so it must not be re-rooted.
    if os.path.isabs(file_arg) or os.path.dirname(file_arg):
        return file_arg
    return os.path.join(base_dir, file_arg)


def resolve_data_path(file_arg):
    """Resolve bare file names into the project data directory."""
    return _resolve_in_dir(file_arg, DATA_DIR)


def resolve_ref_path(file_arg):
    """Resolve bare reference file names into the project refdata directory."""
    return _resolve_in_dir(file_arg, REFDATA_DIR)
|
| 43 |
+
|
| 44 |
+
def parse_cli_args(argv=None):
    """Parse the pipeline's command-line options and resolve file locations.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``, which is what the
            function did before this parameter existed.

    Returns:
        argparse.Namespace with ``input``, ``blueprint``, ``refs``,
        ``manual_refs``, ``sheet``, ``output_sheet`` and ``models``. The
        ``input`` and ``blueprint`` values have been passed through
        ``resolve_data_path``; ``refs`` and ``manual_refs`` through
        ``resolve_ref_path``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid, for
            example when the required ``--input`` option is missing.
    """
    parser = argparse.ArgumentParser(description="MasterMap Data Normalization Pipeline")
    parser.add_argument("--input", required=True, help="Raw input Excel file name inside data/")
    parser.add_argument("--blueprint", default=DEFAULT_BLUEPRINT_FILE, help="Blueprint Excel file name inside data/")
    parser.add_argument("--refs", default=DEFAULT_OFFICIAL_REFS_FILE, help="Official references JSON file name inside refdata/")
    parser.add_argument("--manual_refs", default=DEFAULT_MANUAL_REFS_FILE, help="Manual overrides JSON file name inside refdata/")
    parser.add_argument("--sheet", default=SHEET_NAME, help="Source sheet name to clean")
    parser.add_argument("--output_sheet", default=DEFAULT_OUTPUT_SHEET_NAME, help="Output sheet name to create")
    parser.add_argument("--models", default="", help="Comma-separated Groq models to use in fallback order")

    args = parser.parse_args(argv)

    # Bare file names are re-rooted under their default directories; explicit
    # paths are left as given by the resolve_* helpers.
    args.input = resolve_data_path(args.input)
    args.blueprint = resolve_data_path(args.blueprint)
    args.refs = resolve_ref_path(args.refs)
    args.manual_refs = resolve_ref_path(args.manual_refs)
    return args
|
src/data_pipeline.py
ADDED
|
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import torch
|
| 3 |
+
import re
|
| 4 |
+
from collections import Counter
|
| 5 |
+
from sentence_transformers import util
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
# Import our pure text manipulation functions
|
| 9 |
+
from src.utils import (
|
| 10 |
+
clean_degree_text,
|
| 11 |
+
normalize_text,
|
| 12 |
+
strip_degrees_for_search,
|
| 13 |
+
smart_format
|
| 14 |
+
)
|
| 15 |
+
from src.config import TOP_K_CANDIDATES, DEFAULT_SIMILARITY_THRESHOLD
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
# ML & CLUSTERING ENGINE
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
|
| 20 |
+
def self_cluster_degrees(raw_degrees_list, model, school_cache, threshold=0.93):
    """Cluster near-duplicate degree names belonging to one institution.

    Every string in ``raw_degrees_list`` is cleaned with ``clean_degree_text``.
    The distinct cleaned names are embedded with ``model`` and greedily merged,
    most frequent first, whenever their cosine similarity reaches ``threshold``.

    Args:
        raw_degrees_list: Raw degree values; non-string entries are ignored.
        model: Encoder exposing ``encode(list, convert_to_tensor=True)``.
        school_cache: Mutable dict of earlier merge decisions keyed by
            ``"<a>|||<b>"`` (the two names in sorted order). New decisions are
            written into it.
        threshold: Minimum cosine similarity for two names to be merged.

    Returns:
        Dict mapping each raw string to ``(final_value, source, confidence)``.
        Merged entries are reported as ``"Auto-Merge"`` / ``"MEDIUM"``,
        untouched ones as ``"Degree_Formatter"`` / ``"HIGH"``.
    """
    # Clean each raw string once; the same result feeds both the frequency
    # count and the raw -> clean lookup.
    cleaned_pairs = [
        (raw, clean_degree_text(raw))
        for raw in raw_degrees_list
        if isinstance(raw, str)
    ]
    raw_to_clean = dict(cleaned_pairs)
    clean_counts = Counter(clean for _, clean in cleaned_pairs)
    unique_cleans = [deg for deg, _ in clean_counts.most_common() if deg]

    if len(unique_cleans) <= 1:
        # Nothing to compare against: every value keeps its cleaned form.
        return {
            raw: (clean, "Degree_Formatter", "HIGH")
            for raw, clean in raw_to_clean.items()
        }

    embeddings = model.encode(unique_cleans, convert_to_tensor=True)
    clean_to_clustered = {}
    merge_info = {}  # merged name -> similarity label shown in the Blueprint

    for i, current_deg in enumerate(unique_cleans):
        if current_deg in clean_to_clustered:
            continue
        clean_to_clustered[current_deg] = current_deg

        if i + 1 >= len(unique_cleans):
            continue

        cos_scores = util.cos_sim(embeddings[i], embeddings[i + 1:])[0]
        for j, score in enumerate(cos_scores):
            target_deg = unique_cleans[i + 1 + j]
            similarity = score.item()
            if not (similarity >= threshold) or target_deg in clean_to_clustered:
                continue

            pair_key = f"{min(current_deg, target_deg)}|||{max(current_deg, target_deg)}"

            # school_cache is only a runtime speed-up for repeated pairs.
            cached_action = school_cache.get(pair_key)

            if cached_action:
                # Record the label under the name that is actually merged
                # away, so the final lookup below can find it in both
                # directions.
                if cached_action == current_deg:
                    clean_to_clustered[target_deg] = current_deg
                    merge_info[target_deg] = "Cached (Runtime)"
                elif cached_action == target_deg:
                    clean_to_clustered[current_deg] = target_deg
                    merge_info[current_deg] = "Cached (Runtime)"
            else:
                clean_to_clustered[target_deg] = current_deg
                school_cache[pair_key] = current_deg
                merge_info[target_deg] = f"{similarity*100:.1f}%"

    raw_to_meta = {}
    for raw, clean in raw_to_clean.items():
        final_val = clean
        was_merged = False
        sim_str = ""

        # Follow the merge chain to its representative; keep the label of the
        # first hop that has one.
        while final_val in clean_to_clustered and clean_to_clustered[final_val] != final_val:
            if not sim_str:
                sim_str = merge_info.get(final_val, "")
            final_val = clean_to_clustered[final_val]
            was_merged = True

        if was_merged:
            conf = "MEDIUM"
            src = f"Auto-Merge ({sim_str})" if sim_str else "Auto-Merge"
        else:
            conf = "HIGH"
            src = "Degree_Formatter"

        raw_to_meta[raw] = (final_val, src, conf)

    return raw_to_meta
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def cluster_degrees_by_institution(df, degree_col, inst_col, model, master_cache, blueprint_data, threshold=0.93):
    """Cluster degree names school by school and write ``Cleaned_<degree_col>``.

    For each distinct institution the non-null degree values are passed to
    ``self_cluster_degrees``; its per-school decisions are stored under
    ``master_cache["Degree_Decisions"]``. Each row then receives its clustered
    value, and every row whose text changed or whose confidence is not HIGH is
    appended to ``blueprint_data``. Returns the same ``df``.
    """
    print(f"\n[INFO] Auto-Clustering '{degree_col}'. (Merges will be logged to Blueprint...)")
    cleaned_col_name = f'Cleaned_{degree_col}'
    df[cleaned_col_name] = df[degree_col].copy()
    unique_schools = df[inst_col].dropna().unique()

    degree_decisions = master_cache.setdefault("Degree_Decisions", {})
    school_mappings = {}

    # Pass 1: the embedding-based clustering is the slow part, so show progress.
    for school in tqdm(unique_schools, desc=f"Mapping {degree_col}s by Institution"):
        rows_for_school = (df[inst_col] == school) & (df[degree_col].notna())
        raw_degs = df.loc[rows_for_school, degree_col].astype(str).tolist()
        if raw_degs:
            school_cache = degree_decisions.setdefault(school, {})
            school_mappings[school] = self_cluster_degrees(raw_degs, model, school_cache, threshold)

    # Pass 2: write results back row by row and log anything worth reviewing.
    for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Applying & Logging {degree_col}s"):
        degree_cell = row[degree_col]
        school = row[inst_col]
        raw_deg = str(degree_cell)

        if pd.isna(degree_cell) or school not in school_mappings:
            continue

        mapping_data = school_mappings[school].get(raw_deg)
        if not mapping_data:
            continue

        final_val, src, conf = mapping_data
        df.at[idx, cleaned_col_name] = final_val

        text_unchanged = str(raw_deg).strip() == final_val.strip()
        if text_unchanged and conf == "HIGH":
            continue

        blueprint_data.append({
            "Row_Index": idx + 3,
            "Column": degree_col,
            "Original_Raw_Text": raw_deg,
            "AI_Suggested_Match": final_val,
            "Human_Override": "",
            "Confidence": conf,
            "Match_Source": src
        })
    return df
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def get_deterministic_match(value, combined_valid_targets):
    """Return the first target that ``value`` matches by rule, else ``None``.

    Two passes are made over ``combined_valid_targets`` using their
    ``normalize_text`` forms:

    1. the normalised value is a prefix of the target, followed by a word
       boundary or one of whitespace, ``(``, ``/``, ``\\`` or ``-``;
    2. the target contains the value, upper-cased, inside parentheses
       (for example an acronym such as ``"(MIT)"``).

    A value that normalises to nothing never matches.
    """
    val_clean = normalize_text(value)
    if not val_clean:
        # With an empty needle the prefix regex degenerates to "^(\b|...)",
        # which matches almost any target and would return an arbitrary hit.
        return None

    prefix_pattern = re.compile(rf"^{re.escape(val_clean)}(\b|[\s\(\/\\\-])")

    # Normalise lazily during the first pass and keep the results so the
    # second pass does not repeat the work.
    normalized_targets = []
    for target in combined_valid_targets:
        target_clean = normalize_text(target)
        if prefix_pattern.search(target_clean):
            return target
        normalized_targets.append((target, target_clean))

    acronym_token = f"({val_clean.upper()})"
    for target, target_clean in normalized_targets:
        if acronym_token in target_clean.upper():
            return target
    return None
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def get_top_candidates(model, value, combined_valid_targets, reference_embeddings, k=5):
    """Return up to ``k`` targets most similar to ``value`` by cosine similarity.

    ``value`` is embedded with ``model`` and compared against
    ``reference_embeddings``; the returned items are taken from
    ``combined_valid_targets`` at the top-scoring positions. An empty target
    list yields ``[]``.
    """
    if not combined_valid_targets:
        return []

    query_vector = model.encode(value, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_vector, reference_embeddings)[0]
    limit = min(k, len(combined_valid_targets))
    _, best_positions = torch.topk(scores, limit)
    return [combined_valid_targets[position] for position in best_positions]
|
| 147 |
+
|
| 148 |
+
def get_dict_exact_match(value, combined_dict):
    """Return the canonical name whose alias or own text equals ``value``.

    Comparison is done on ``normalize_text`` forms. All aliases (keys) are
    checked before any canonical value; ``None`` means nothing matched.
    """
    needle = normalize_text(value)

    alias_hits = (
        canonical
        for alias, canonical in combined_dict.items()
        if normalize_text(alias) == needle
    )
    canonical_hits = (
        canonical
        for canonical in combined_dict.values()
        if normalize_text(canonical) == needle
    )

    # Both generators are lazy, so canonical values are only scanned when no
    # alias matched.
    for hits in (alias_hits, canonical_hits):
        for canonical in hits:
            return canonical
    return None
|
| 160 |
+
|
| 161 |
+
def get_dict_rule_match(value, combined_dict):
    """Rule-match ``value`` against a reference dict's aliases, then its values.

    A hit on an alias returns that alias's canonical name; otherwise a hit on
    one of the distinct canonical names returns that name. ``None`` when
    neither matches.
    """
    alias_names = list(combined_dict)
    distinct_canonicals = list(dict.fromkeys(combined_dict.values()))

    matched_alias = get_deterministic_match(value, alias_names)
    if matched_alias:
        return combined_dict[matched_alias]

    return get_deterministic_match(value, distinct_canonicals) or None
|
| 174 |
+
|
| 175 |
+
def as_reference_list(ref_data):
    """Coerce reference data to a list of target names.

    A list is returned as-is, a dict contributes its distinct values in
    first-seen order, and anything else becomes an empty list.
    """
    if isinstance(ref_data, dict):
        return list(dict.fromkeys(ref_data.values()))
    return ref_data if isinstance(ref_data, list) else []
|
| 181 |
+
|
| 182 |
+
def as_reference_dict(ref_data):
    """Coerce reference data to an alias -> canonical dict.

    A dict is returned as-is, a list maps each of its string items to itself
    (non-strings are dropped), and anything else becomes an empty dict.
    """
    if isinstance(ref_data, list):
        names = [item for item in ref_data if isinstance(item, str)]
        return dict(zip(names, names))
    return ref_data if isinstance(ref_data, dict) else {}
|
| 188 |
+
|
| 189 |
+
def update_match_postfix(progress, source_counts):
    """Show the exact / rule / LLM match tallies on the progress bar.

    The postfix is set without forcing an immediate redraw.
    """
    tracked_sources = ("Exact_Match", "Rule_Match", "LLM_Judged")
    tallies = {label: source_counts[label] for label in tracked_sources}
    progress.set_postfix(tallies, refresh=False)
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def match_cache_key(column_name, value):
    """Build the cache key shared by the matching and reconstruction passes.

    For the "Institution" and "Degree" columns degree titles are stripped
    first; in every case trailing periods are removed and the result is run
    through ``normalize_text``.
    """
    strips_degrees = column_name in ["Institution", "Degree"]
    searchable = strip_degrees_for_search(value) if strips_degrees else value
    return normalize_text(str(searchable).rstrip("."))
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def append_unique_cleaned_part(cleaned_parts, value):
    """Append the comma-separated pieces of ``value`` not yet in ``cleaned_parts``.

    Pieces are compared by their ``normalize_text`` form against every
    comma-separated piece already held in ``cleaned_parts``. The list is
    extended in place; the return value tells whether anything was added.
    """
    existing_keys = (
        normalize_text(piece.strip())
        for entry in cleaned_parts
        for piece in str(entry).split(",")
    )
    seen = {key for key in existing_keys if key}

    size_before = len(cleaned_parts)
    for piece in str(value).split(","):
        candidate = piece.strip()
        if not candidate:
            continue

        key = normalize_text(candidate)
        if key not in seen:
            seen.add(key)
            cleaned_parts.append(candidate)

    return len(cleaned_parts) > size_before
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# ---------------------------------------------------------------------------
|
| 230 |
+
# CORE DATA PIPELINE
|
| 231 |
+
# ---------------------------------------------------------------------------
|
| 232 |
+
|
| 233 |
+
_CONFIDENCE_RANK = {"LOW": 0, "MEDIUM": 1, "HIGH": 2}


def _weakest_confidence(current, candidate):
    """Return whichever confidence label ranks lower (unknown labels rank as LOW)."""
    if _CONFIDENCE_RANK.get(candidate, 0) < _CONFIDENCE_RANK.get(current, 0):
        return candidate
    return current


def process_column(df, column_name, model, groq_router, official_refs, manual_refs, master_cache, split_pattern, blueprint_data):
    """Normalise every entity in ``df[column_name]`` and log changes to the blueprint.

    Cells are split on ``split_pattern`` (case-insensitively). Each distinct
    entity is resolved by the first stage that succeeds: the in-memory cache,
    an exact reference match, a deterministic rule match, then the LLM judge.
    Cells are rebuilt from the resolved entities into ``Cleaned_<column_name>``,
    and every cell that changed or is not HIGH confidence is appended to
    ``blueprint_data``.

    Args:
        df: DataFrame to clean; modified in place.
        column_name: Column to process. A missing column is a no-op.
        model: Encoder used to embed references and raw values.
        groq_router: Object whose ``ask_judge(word, candidates, column_name)``
            returns ``(value, source, confidence)``.
        official_refs: Per-column reference data (list, or alias -> canonical dict).
        manual_refs: Per-column manual additions in the same shapes.
        master_cache: Per-column dict of remembered ``cache key -> value`` results.
        split_pattern: Regular expression string separating entities in a cell.
        blueprint_data: List that receives one dict per logged cell.

    Returns:
        The same ``df``.

    Raises:
        ValueError: If no references are loaded for the column.
    """
    if column_name not in df.columns:
        return df

    core_data = official_refs.get(column_name, [])
    added_data = manual_refs.get(column_name, [])
    if column_name not in master_cache:
        master_cache[column_name] = {}

    detailed_cache = {}
    is_dict_mode = isinstance(core_data, dict)

    def get_updated_embeddings():
        if is_dict_mode:
            c_dict = {**as_reference_dict(core_data), **as_reference_dict(added_data)}
            c_keys = list(c_dict.keys())
            # dict.fromkeys keeps first-seen order, so candidate ranking is
            # reproducible between runs (a set is not).
            u_vals = list(dict.fromkeys(c_dict.values()))
            k_emb = model.encode(c_keys, convert_to_tensor=True) if c_keys else None
            v_emb = model.encode(u_vals, convert_to_tensor=True) if u_vals else None
            return c_dict, c_keys, k_emb, u_vals, v_emb

        comb = as_reference_list(core_data) + as_reference_list(added_data)
        comb = list(dict.fromkeys(item for item in comb if isinstance(item, str) and item.strip()))
        emb = model.encode(comb, convert_to_tensor=True) if comb else None
        return None, comb, emb, None, None

    combined_dict, combined_valid_targets, reference_embeddings, unique_values, value_embeddings = get_updated_embeddings()

    if is_dict_mode and not combined_dict:
        raise ValueError(f"No dictionary references loaded for '{column_name}'. Refusing to call Groq for every value.")
    if not is_dict_mode and not combined_valid_targets:
        raise ValueError(f"No list references loaded for '{column_name}'. Refusing to call Groq for every value.")

    # One splitter for analysis, LLM verification and reconstruction, so a
    # cell is always cut into the same parts that were analysed and cached.
    splitter = re.compile(split_pattern, flags=re.IGNORECASE)

    def split_entities(text):
        return [part.strip() for part in splitter.split(str(text)) if part.strip()]

    uniques = set()
    for cell in df[column_name].dropna():
        uniques.update(split_entities(cell))

    print(f"\n[INFO] Analyzing {len(uniques)} unique entities in '{column_name}'...")
    source_counts = Counter()

    progress = tqdm(sorted(uniques, key=normalize_text), desc=f"Cleaning {column_name}")
    for word in progress:
        word_clean = match_cache_key(column_name, word)

        # 1. Check Memory Cache
        if word_clean in master_cache[column_name]:
            detailed_cache[word_clean] = {"val": master_cache[column_name][word_clean], "src": "Memory_Cache", "conf": "HIGH"}
            source_counts["Memory_Cache"] += 1
            update_match_postfix(progress, source_counts)
            continue

        # 2. Check Exact Targets
        if is_dict_mode:
            exact = get_dict_exact_match(word, combined_dict)
        elif combined_valid_targets:
            wanted = normalize_text(word_clean)
            exact = next((k for k in combined_valid_targets if normalize_text(k) == wanted), None)
        else:
            exact = None

        if exact:
            detailed_cache[word_clean] = {"val": exact, "src": "Exact_Match", "conf": "HIGH"}
            source_counts["Exact_Match"] += 1
            update_match_postfix(progress, source_counts)
            continue

        # 3. Deterministic / Rule Match
        if is_dict_mode:
            suggested_match = get_dict_rule_match(word, combined_dict)
        else:
            suggested_match = get_deterministic_match(word, combined_valid_targets) if combined_valid_targets else None

        if suggested_match:
            detailed_cache[word_clean] = {"val": suggested_match, "src": "Rule_Match", "conf": "HIGH"}
            source_counts["Rule_Match"] += 1
            update_match_postfix(progress, source_counts)
            continue

        # 4. LLM API Match
        if is_dict_mode:
            cand_keys = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)
            cand_vals = get_top_candidates(model, word, unique_values, value_embeddings)
            candidates = list(dict.fromkeys(cand_keys + cand_vals))[:TOP_K_CANDIDATES]
        else:
            candidates = get_top_candidates(model, word, combined_valid_targets, reference_embeddings)

        ans_val, src, conf = groq_router.ask_judge(word, candidates, column_name)
        source_counts[src] += 1
        update_match_postfix(progress, source_counts)

        # The router reports failures in the *source* field and hands back the
        # raw word as the value, so the source must be inspected here. The
        # checks on conf / ans_val are retained for routers that signal
        # failure there.
        llm_failed = (
            "API_Error" in str(src)
            or "API_Error" in str(conf)
            or src in ("LLM_Failed", "No_API_Key")
            or ans_val in ("UNKNOWN", "LLM_Failed")
        )

        # Empty pieces (e.g. from a trailing comma) carry no entity; drop them.
        llm_parts = [] if llm_failed else [p.strip() for p in str(ans_val).split(",") if p.strip()]

        if llm_parts:
            corrected_parts = []
            all_matched = True  # stays True only if every piece exists in the refs

            for part in llm_parts:
                if is_dict_mode:
                    verified = get_dict_exact_match(part, combined_dict) or get_dict_rule_match(part, combined_dict)
                else:
                    # Case-insensitive exact check first, then the rule matcher.
                    verified = next((c for c in candidates if c.lower() == part.lower()), None)
                    if not verified:
                        verified = get_deterministic_match(part, candidates)

                if verified:
                    corrected_parts.append(verified)
                else:
                    # Not found in the references: keep the LLM's wording but
                    # remember that it could not be verified.
                    corrected_parts.append(part)
                    all_matched = False

            # Remove duplicates while preserving order, then glue back together.
            unique_parts = list(dict.fromkeys(corrected_parts))
            ans_val = ", ".join(unique_parts)

            raw_parts_for_check = split_entities(word) or [word]
            raw_lookup_keys = {normalize_text(part) for part in raw_parts_for_check}
            verified_lookup_keys = {normalize_text(part) for part in unique_parts}

            # Only upgrade when the LLM answer was verified against the refs
            # and also matches the original text directly. Otherwise it is
            # still an LLM judgment, even if the answer exists in the refs.
            if all_matched and verified_lookup_keys.issubset(raw_lookup_keys):
                conf = "HIGH"
                src = "LLM_Rule_Verified"
        elif not llm_failed:
            # The model replied with nothing usable: keep the raw text and
            # flag it for review instead of storing an empty value.
            ans_val, src, conf = word, "LLM_Failed", "LOW"

        detailed_cache[word_clean] = {"val": ans_val, "src": src, "conf": conf}

    # Reconstruct cells and capture low/medium confidence matches for the Blueprint
    cleaned_column = f'Cleaned_{column_name}'
    for idx, row in df.iterrows():
        cell_val = row[column_name]
        if pd.isna(cell_val):
            continue

        raw_parts = split_entities(cell_val)
        cleaned_parts = []
        lowest_conf = "HIGH"
        cell_sources = []

        i = 0
        while i < len(raw_parts):
            curr = raw_parts[i]

            # Check for combined pairs (e.g., "University of, Manchester" split by mistake)
            if i + 1 < len(raw_parts):
                combo_clean = match_cache_key(column_name, f"{curr}, {raw_parts[i+1]}")
                combo_hit = detailed_cache.get(combo_clean)
                if combo_hit is not None:
                    append_unique_cleaned_part(cleaned_parts, combo_hit["val"])
                    lowest_conf = _weakest_confidence(lowest_conf, combo_hit["conf"])
                    cell_sources.append(combo_hit["src"])
                    i += 2
                    continue

            p_clean = match_cache_key(column_name, curr)
            if len(p_clean) <= 1:
                i += 1
                continue

            cache_hit = detailed_cache.get(p_clean)
            if cache_hit:
                append_unique_cleaned_part(cleaned_parts, cache_hit["val"])
                # Keep the weakest label seen so a later MEDIUM part cannot
                # hide an earlier LOW one.
                lowest_conf = _weakest_confidence(lowest_conf, cache_hit["conf"])
                cell_sources.append(cache_hit["src"])
            else:
                append_unique_cleaned_part(cleaned_parts, smart_format(curr))
                lowest_conf = "LOW"
                cell_sources.append("Fallback_Format")
            i += 1

        final_stitched_val = ", ".join(cleaned_parts)
        df.at[idx, cleaned_column] = final_stitched_val

        # Log EVERY change made to the Excel file, plus any low/medium confidence guesses
        if str(cell_val).strip() != final_stitched_val.strip() or lowest_conf != "HIGH":
            blueprint_data.append({
                "Row_Index": idx + 3,
                "Column": column_name,
                "Original_Raw_Text": cell_val,
                "AI_Suggested_Match": final_stitched_val,
                "Human_Override": "",
                # First-seen order keeps the label stable between runs.
                "Match_Source": " | ".join(dict.fromkeys(cell_sources)),
                "Confidence": lowest_conf
            })

    return df
|
src/llm_router.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import time
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
|
| 5 |
+
|
| 6 |
+
# Raised when the Groq API answers HTTP 429, so the request can be retried and the router can move on to the next model.
class RateLimitException(Exception): pass
|
| 7 |
+
|
| 8 |
+
class GroqRouter:
    """Ask Groq chat models to normalise a raw value, with model fallback.

    Models are tried in the order given. When one stays rate limited after
    its retries, the router moves permanently to the next model; once the
    list is exhausted every later call returns an error tuple immediately.
    """

    API_URL = "https://api.groq.com/openai/v1/chat/completions"

    def __init__(self, api_key, available_models):
        self.api_key = api_key
        self.available_models = available_models
        self.current_model_index = 0
        self.last_printed_model = None

    @staticmethod
    def _specific_rules(column_name):
        """Return the column-specific rule text inserted into the user prompt."""
        if column_name in ["Institution", "Degree"]:
            return (
                "- Split distinct separate schools or global alliances with a comma (e.g., 'Harvard & MIT' -> 'Harvard University, MIT').\n"
                "- Treat alliances like CEMS as distinct institutions! NEVER delete them.\n"
                "- Do NOT split a Business School and its Parent University. They are ONE entity.\n"
                "- Do NOT split a school and its acronym/alias. Keep them as one entity.\n"
                "- Watch out for joint programs! If a hyphen separates TWO truly distinct institutions, split them.\n"
                "- Do NOT split single names with hyphens (e.g., 'Paris-Descartes').\n"
                "- Strip out ONLY standard degree titles (e.g., 'MSc', 'MBA', 'BBA', 'Ph.D.')."
            )
        if column_name in ["Country", "Continent"]:
            return (
                f"- MUST translate the {column_name} name to its standard ENGLISH equivalent (e.g., 'Espagne' -> 'Spain', 'Milano' -> 'Milan').\n"
                f"- If the input contains multiple distinct {column_name}s, MUST split them into a comma-separated list.\n"
                f"- If it is a single {column_name}, return exactly ONE official English name."
            )
        return (
            "- If the input contains multiple distinct entities, MUST split them into a comma-separated list.\n"
            "- If it is a single entity, return exactly ONE official name. No aliases."
        )

    @classmethod
    def _build_payload(cls, word, candidates, column_name):
        """Build the chat-completion request body (without the model name)."""
        system_msg = (
            "You are a data normalization expert. Your task is to map raw input to the MOST ACCURATE "
            "official name from a provided list of candidates or your own world knowledge. "
            "Respond ONLY with the final string. No conversational text, no quotes."
        )

        user_msg = (
            f"Column: {column_name}\n"
            f"Raw Input: '{word}'\n"
            f"Database Candidates: {candidates}\n\n"
            f"Rules:\n"
            f"1. Choose the best match from the Database Candidates if possible.\n"
            f"2. If an entity from the Raw Input does NOT exist in the Candidates list, DO NOT map it to a different candidate just because they share a city or country. Instead, return its cleaned original name exactly as written.\n"
            f"{cls._specific_rules(column_name)}\n"
            f"3. If gibberish, return 'UNKNOWN'."
        )

        return {
            "messages": [{"role": "system", "content": system_msg}, {"role": "user", "content": user_msg}],
            "temperature": 0.0,
            "max_tokens": 50
        }

    @staticmethod
    def _interpret_answer(word, content):
        """Turn the model's reply text into a ``(value, source, confidence)`` tuple.

        A missing or blank reply, or any spelling of UNKNOWN (case-insensitive,
        optional trailing period), is reported as ``LLM_Failed`` with the raw
        ``word`` as the value, so an empty string is never returned as a match.
        """
        answer = content.strip().strip('"\'') if isinstance(content, str) else ""
        if not answer or answer.rstrip(".").upper() == "UNKNOWN":
            return (word, "LLM_Failed", "LOW")
        return (answer, "LLM_Judged", "MEDIUM")

    def ask_judge(self, word, candidates, column_name):
        """Ask the active model for the best normalised form of ``word``.

        Args:
            word: Raw text to normalise.
            candidates: Reference names offered to the model as likely matches.
            column_name: Column being cleaned; selects the prompt rules.

        Returns:
            ``(value, source, confidence)``. On success the source is
            ``"LLM_Judged"`` with ``"MEDIUM"`` confidence. On any failure the
            value is ``word`` itself, the confidence is ``"LOW"`` and the
            source names the failure (``"API_Error_*"``, ``"No_API_Key"`` or
            ``"LLM_Failed"``).
        """
        if self.current_model_index >= len(self.available_models):
            return (word, "API_Error_All_Models_Dead", "LOW")

        if not self.api_key:
            return (word, "No_API_Key", "LOW")

        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        payload = self._build_payload(word, candidates, column_name)

        # Only rate limiting is retried (with exponential back-off); every
        # other outcome is handed straight back to the loop below.
        @retry(
            retry=retry_if_exception_type(RateLimitException),
            wait=wait_exponential(multiplier=2, min=2, max=30),
            stop=stop_after_attempt(5),
            reraise=True
        )
        def fire_request():
            res = requests.post(self.API_URL, headers=headers, json=payload, timeout=30)

            if res.status_code == 429:
                raise RateLimitException("Model Limits Exhausted")

            return res

        while self.current_model_index < len(self.available_models):
            active_model = self.available_models[self.current_model_index]
            payload["model"] = active_model

            if active_model != self.last_printed_model:
                tqdm.write(f"\n[INFO] Routing requests to model: {active_model}")
                self.last_printed_model = active_model

            try:
                time.sleep(0.3)  # small fixed pause between calls
                response = fire_request()

                if response.status_code == 200:
                    content = response.json()["choices"][0]["message"]["content"]
                    return self._interpret_answer(word, content)

                tqdm.write(f"  [!] Groq API Error: {response.status_code} for '{word}'")
                return (word, f"API_Error_{response.status_code}", "LOW")

            except RateLimitException:
                # Still limited after all retries: give up on this model for
                # the rest of the run and try the next one.
                tqdm.write(f"  [!] Limits exhausted for {active_model}!")

                self.current_model_index += 1

                if self.current_model_index < len(self.available_models):
                    next_model = self.available_models[self.current_model_index]
                    tqdm.write(f"  [!] Switching to fallback model: {next_model}...")
                else:
                    tqdm.write(f"  [!] FATAL: ALL {len(self.available_models)} MODELS DEAD. Skipping remainder of dataset.")
                    return (word, "API_Error_All_Models_Dead", "LOW")

            except Exception as e:
                # Covers timeouts, connection errors and malformed response
                # bodies alike; the historical label is kept for callers.
                tqdm.write(f"  [!] Request failed for '{word}' on {active_model}: {e}")
                return (word, "API_Error_Timeout", "LOW")

        return (word, "API_Error_Unhandled", "LOW")
|
src/process_runner.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import codecs
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import signal
|
| 5 |
+
import subprocess
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Registry of running subprocesses keyed by job id: filled by stream_process(), read by stop_process().
ACTIVE_PROCESSES = {}
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def stop_process(job_id: str) -> bool:
    """Ask the subprocess registered under ``job_id`` to stop.

    Returns ``False`` when no such job is registered or it has already
    exited, ``True`` once a stop signal has been sent. On Windows a
    CTRL_BREAK event is sent; elsewhere the process is terminated. If
    signalling fails, ``terminate()`` is used as the fallback.
    """
    process = ACTIVE_PROCESSES.get(job_id)
    if not (process and process.poll() is None):
        return False

    try:
        if os.name != "nt":
            process.terminate()
        else:
            process.send_signal(signal.CTRL_BREAK_EVENT)
    except Exception:
        process.terminate()

    return True
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def stream_process(command, cwd: Path, job_id=None):
    """Run ``command`` and yield its output as Server-Sent Event strings.

    stdout and stderr are merged, decoded incrementally as UTF-8 (invalid
    bytes are replaced) and emitted as ``data:`` events carrying a
    JSON-encoded text chunk. After the process ends, a final ``data:`` event
    reports the exit code, followed by an ``event: done`` (exit code 0) or
    ``event: failed`` marker.

    Args:
        command: Command passed to ``subprocess.Popen``.
        cwd: Working directory for the child process.
        job_id: Optional key under which the process is registered in
            ``ACTIVE_PROCESSES`` while it runs.

    Yields:
        SSE-formatted strings, each terminated by a blank line.
    """
    env = os.environ.copy()
    env["PYTHONUNBUFFERED"] = "1"
    popen_kwargs = {
        "cwd": cwd,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.STDOUT,
        "bufsize": 0,
        "env": env,
    }
    if os.name == "nt":
        # A separate process group is required for CTRL_BREAK_EVENT delivery.
        popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP

    process = subprocess.Popen(
        command,
        **popen_kwargs,
    )
    if job_id:
        ACTIVE_PROCESSES[job_id] = process
    try:
        stdout = process.stdout
        if stdout is None:
            raise RuntimeError("subprocess was started without a stdout pipe")

        decoder = codecs.getincrementaldecoder("utf-8")("replace")
        while True:
            # With bufsize=0 the pipe is a raw stream: read() returns as soon
            # as any bytes are available (up to the limit), so output stays
            # live without one syscall and one event per byte.
            raw_chunk = stdout.read(4096)
            if not raw_chunk:
                break  # EOF: the child closed its end of the pipe
            chunk = decoder.decode(raw_chunk)
            if chunk:  # may be empty while a multi-byte character is incomplete
                yield f"data: {json.dumps(chunk)}\n\n"

        exit_code = process.wait()
        trailing_chunk = decoder.decode(b"", final=True)
        if trailing_chunk:
            yield f"data: {json.dumps(trailing_chunk)}\n\n"
        yield f"data: {json.dumps(chr(10) + f'Process exited with code {exit_code}' + chr(10))}\n\n"
        event_name = "done" if exit_code == 0 else "failed"
        yield f"event: {event_name}\ndata: {{}}\n\n"
    finally:
        if job_id:
            ACTIVE_PROCESSES.pop(job_id, None)
        # Release the pipe even when the consumer abandons the generator early.
        if process.stdout is not None:
            process.stdout.close()
|
src/utils.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import re
|
| 4 |
+
import unicodedata
|
| 5 |
+
|
| 6 |
+
# "Ph.D." gets its own branch so its trailing dot is consumed after the word
# boundary; placing the optional dot before the boundary can never match it.
_DEGREE_PATTERN = re.compile(
    r"\b(?:Ph\.?D\b\.?"
    r"|(?:MSc|MBA|BBA|BSc|BA|MA|BS|MS|EMBA|Master|Bachelor|Masters|Bachelors|Licence)\b)",
    re.IGNORECASE,
)


def strip_degrees_for_search(text):
    """Remove degree keywords (MSc, MBA, Ph.D., ...) from *text*.

    Non-string input is returned unchanged. Whitespace is collapsed and
    leading/trailing separator characters are trimmed. If stripping would
    leave nothing, the original text (whitespace-trimmed) is returned instead.
    """
    if not isinstance(text, str):
        return text
    cleaned = _DEGREE_PATTERN.sub('', text)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip(' -.,&/|')
    if not cleaned:
        # The text was nothing but a degree keyword; keep it.
        return text.strip()
    return cleaned
|
| 14 |
+
|
| 15 |
+
def smart_format(text):
    """Title-case *text* while keeping known acronyms and elisions readable.

    Non-string input is returned unchanged. Known acronyms (MBA, HR, IT, ...)
    are upper-cased, PhD/BSc/MSc get their conventional mixed case, and the
    letter after a leading L'/D'/Qu' elision is lower-cased.
    """
    if not isinstance(text, str):
        return text
    res = text.title()
    # str.title() capitalises the letter after an apostrophe, turning the
    # possessive "master's" into "Master'S"; undo that.
    res = re.sub(r"(?<=\w)'S\b", "'s", res)
    acronyms = ['Ma', 'Ba', 'Mba', 'Bba', 'Hr', 'It', 'Bs', 'Ms', 'Phd', 'Bsc', 'Msc', 'Llm', 'Pge', 'Cems']
    for ac in acronyms:
        res = re.sub(rf'\b{ac}\b', lambda m: m.group(0).upper(), res)
    res = res.replace("PHD", "PhD").replace("BSC", "BSc").replace("MSC", "MSc")
    res = re.sub(r"\b(L|D|Qu)'([A-Z])", lambda m: f"{m.group(1)}'{m.group(2).lower()}", res)
    return res.strip()
|
| 24 |
+
|
| 25 |
+
def clean_degree_text(text):
    """Normalise a degree label and return it formatted by ``smart_format``.

    Non-string input yields an empty string. The words "and" / "et" become
    "&", characters other than word characters, whitespace, ``-``, ``&``,
    ``+`` and ``'`` are replaced by spaces, and whitespace is collapsed.
    """
    if not isinstance(text, str):
        return ""
    cleaned = text
    for connective in (r'\band\b', r'\bet\b'):
        cleaned = re.sub(connective, '&', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'[^\w\s\-&\+\']', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return smart_format(cleaned)
|
| 32 |
+
|
| 33 |
+
def normalize_text(text):
    """Return *text* lower-cased, trimmed, and with combining marks removed.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    decomposed = unicodedata.normalize('NFD', text)
    # Category 'Mn' covers the combining marks produced by NFD decomposition.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).strip().lower()
|
| 37 |
+
|
| 38 |
+
def normalize_ref(value):
    """Return the normalised comparison key for a reference *value*.

    The value is converted with ``str()`` first, so any object is accepted.
    """
    as_text = str(value)
    return normalize_text(as_text)
|
| 40 |
+
|
| 41 |
+
def iter_ref_values(ref_data):
    """Yield every string held in a reference bucket.

    For a dict, string keys are yielded first and string values second.
    For a list, its string items are yielded. Any other type yields nothing.
    """
    if isinstance(ref_data, dict):
        sources = (ref_data.keys(), ref_data.values())
    elif isinstance(ref_data, list):
        sources = (ref_data,)
    else:
        return
    for source in sources:
        for item in source:
            if isinstance(item, str):
                yield item
|
| 47 |
+
|
| 48 |
+
def ref_contains(ref_data, value):
    """Return True if *value* matches any string in *ref_data* after normalisation."""
    target = normalize_ref(value)
    for candidate in iter_ref_values(ref_data):
        if normalize_ref(candidate) == target:
            return True
    return False
|
| 51 |
+
|
| 52 |
+
def prune_manual_refs_against_official(manual_refs, official_refs):
    """Drop manual reference entries that are invalid, duplicated, or official.

    *manual_refs* is modified in place: each list or dict bucket is replaced
    by a pruned copy. Buckets of any other type are left untouched.

    An entry is removed when it is a non-string list item, normalises to an
    empty key, repeats an earlier entry in the same bucket, or already appears
    in the official bucket for the same column. Dict buckets are de-duplicated
    by value (falling back to the alias when the value is not a string) and
    their keys are stored normalised.

    Returns:
        The number of entries removed across all buckets.
    """
    removed_count = 0

    for column_name, manual_bucket in list(manual_refs.items()):
        official_bucket = official_refs.get(column_name, [])
        # Normalise the official bucket once per column rather than once per
        # manual value; set membership then replaces a scan of the bucket.
        official_keys = {normalize_ref(item) for item in iter_ref_values(official_bucket)}

        if isinstance(manual_bucket, list):
            kept = []
            seen = set()
            for value in manual_bucket:
                if not isinstance(value, str):
                    removed_count += 1
                    continue

                key = normalize_ref(value)
                if not key or key in seen or key in official_keys:
                    removed_count += 1
                    continue

                seen.add(key)
                kept.append(value)
            manual_refs[column_name] = kept

        elif isinstance(manual_bucket, dict):
            kept = {}
            seen_values = set()
            for alias, value in manual_bucket.items():
                # Compare on the mapped value when it is text, else on the alias.
                candidate = value if isinstance(value, str) else alias
                key = normalize_ref(candidate)
                if not key or key in seen_values or key in official_keys:
                    removed_count += 1
                    continue

                seen_values.add(key)
                kept[normalize_ref(alias)] = value
            manual_refs[column_name] = kept

    return removed_count
|
| 98 |
+
|
| 99 |
+
MANUAL_REFERENCES_REPO_PATH = "refdata/manual_references.json"
|
| 100 |
+
|
| 101 |
+
def reference_sync_status():
    """Report whether manual references can be synced to the hosting Space.

    Reads the ``SPACE_ID`` and ``HF_TOKEN`` environment variables and returns
    a dict with ``enabled`` (bool), ``space_id`` (str) and ``reason`` (an
    explanation when disabled, otherwise an empty string).
    """
    space_id = os.getenv("SPACE_ID", "")
    token_present = bool(os.getenv("HF_TOKEN"))

    if not space_id:
        reason = "Reference sync is only available on Hugging Face Spaces."
    elif not token_present:
        reason = "HF_TOKEN secret is missing from this Space."
    else:
        reason = ""

    return {
        "enabled": not reason,
        "space_id": space_id,
        "reason": reason,
    }
|
| 124 |
+
|
| 125 |
+
def save_manual_references_to_hub(app_root: Path):
    """Upload the manual references file to the current Space repository.

    Args:
        app_root: Directory that contains ``MANUAL_REFERENCES_REPO_PATH``.

    Returns:
        A dict with ``space_id``, ``path`` (the in-repo path) and
        ``commit_url`` (the string form of what ``upload_file`` returned).

    Raises:
        RuntimeError: If reference sync is disabled or ``huggingface_hub``
            cannot be imported.
        FileNotFoundError: If the manual references file does not exist.
    """
    status = reference_sync_status()
    if not status["enabled"]:
        raise RuntimeError(status["reason"])
    target_space = status["space_id"]

    manual_refs_path = app_root / MANUAL_REFERENCES_REPO_PATH
    if not manual_refs_path.is_file():
        raise FileNotFoundError(f"Manual references file not found: {manual_refs_path}")

    # Imported lazily so the dependency is only needed when syncing.
    try:
        from huggingface_hub import HfApi
    except ImportError as exc:
        raise RuntimeError("huggingface_hub is not installed.") from exc

    upload_args = {
        "path_or_fileobj": str(manual_refs_path),
        "path_in_repo": MANUAL_REFERENCES_REPO_PATH,
        "repo_id": target_space,
        "repo_type": "space",
        "commit_message": "Update manual references",
    }
    commit_info = HfApi(token=os.environ["HF_TOKEN"]).upload_file(**upload_args)

    return {
        "space_id": target_space,
        "path": MANUAL_REFERENCES_REPO_PATH,
        "commit_url": str(commit_info),
    }
|
src/workbook_io.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import uuid
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import openpyxl
|
| 5 |
+
from werkzeug.utils import secure_filename
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
ALLOWED_EXCEL_EXTENSIONS = (".xlsx", ".xlsm")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def save_uploaded_excel(uploaded, upload_dir: Path):
    """Save an uploaded Excel file under a unique name inside *upload_dir*.

    Args:
        uploaded: Upload object exposing ``filename`` and ``save(path)``.
        upload_dir: Directory the file is written to.

    Returns:
        ``(saved_filename, destination)``: the generated file name and its path.

    Raises:
        ValueError: If nothing was uploaded or the sanitised name does not end
            with one of ``ALLOWED_EXCEL_EXTENSIONS``.
    """
    if not (uploaded and uploaded.filename):
        raise ValueError("No file uploaded.")

    safe_name = secure_filename(uploaded.filename)
    if not safe_name.lower().endswith(ALLOWED_EXCEL_EXTENSIONS):
        raise ValueError("Upload an .xlsx or .xlsm file.")

    safe_path = Path(safe_name)
    # A short random tag keeps repeated uploads of the same file apart.
    unique_tag = uuid.uuid4().hex[:8]
    saved_filename = f"{safe_path.stem}_{unique_tag}{safe_path.suffix}"
    destination = upload_dir / saved_filename
    uploaded.save(destination)
    return saved_filename, destination
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def read_workbook_sheets(path: Path) -> list[str]:
    """Return the sheet names of the workbook at *path*.

    The workbook is opened read-only and is always closed before returning.
    """
    workbook = openpyxl.load_workbook(path, read_only=True, data_only=False)
    try:
        sheet_names = workbook.sheetnames
    finally:
        workbook.close()
    return sheet_names
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def resolve_allowed_path(raw_path: str, app_root: Path, allowed_roots: list[Path]) -> Path:
    """Resolve *raw_path* and ensure it lies inside one of *allowed_roots*.

    Relative paths are interpreted against *app_root*. Both the candidate and
    the roots are fully resolved before comparison.

    Raises:
        ValueError: If *raw_path* is empty or resolves outside every allowed root.
    """
    if not raw_path:
        raise ValueError("Path is required.")

    candidate = Path(raw_path)
    anchored = candidate if candidate.is_absolute() else app_root / candidate
    resolved = anchored.resolve()

    bases = [root.resolve() for root in allowed_roots]
    for base in bases:
        if resolved == base or resolved.is_relative_to(base):
            return resolved

    raise ValueError("Path is outside the application data directory.")
|
ui/static/app.js
ADDED
|
@@ -0,0 +1,463 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Page configuration read from window.MASTERMAP_CONFIG; every field is optional.
const config = window.MASTERMAP_CONFIG || {};
const cleanPath = config.cleanPath || "";
const defaultOutputSheet = config.defaultOutputSheet || "Cleaned_Data";
// Paths of the workbook / blueprint used by "Apply Blueprint" (updated after uploads).
let applyWorkbookPath = config.applyWorkbookPath || "";
let applyBlueprintPath = config.applyBlueprintPath || "";

// Shorthand for the element lookups below.
const byId = id => document.getElementById(id);

// Form controls.
const sheetSelect = byId("sheetSelect");
const applySheetSelect = byId("applySheetSelect");
const outputSheet = byId("outputSheet");
const models = byId("models");

// Buttons.
const fetchModels = byId("fetchModels");
const runButton = byId("runButton");
const applyButton = byId("applyButton");
const saveReferencesButton = byId("saveReferencesButton");

// Upload forms and their file inputs.
const applyWorkbookForm = byId("applyWorkbookForm");
const applyBlueprintForm = byId("applyBlueprintForm");
const applyWorkbookInput = byId("applyWorkbookInput");
const applyBlueprintInput = byId("applyBlueprintInput");

// Status lines and file labels.
const runStatus = byId("runStatus");
const applyStatus = byId("applyStatus");
const referencesStatus = byId("referencesStatus");
const applyWorkbookFile = byId("applyWorkbookFile");
const applyBlueprintFile = byId("applyBlueprintFile");

// Log, progress and result panels.
const cleanLogs = byId("cleanLogs");
const applyLogs = byId("applyLogs");
const cleanProgressPanel = byId("cleanProgressPanel");
const cleanProgressSummary = byId("cleanProgressSummary");
const cleanProgressList = byId("cleanProgressList");
const cleanResult = byId("cleanResult");
const applyResult = byId("applyResult");

// Mutable state for the cleaning run.
let cleanRawLogText = "";
let cleanLiveLine = "";
let cleanProgressOrder = [];
let cleanProgressByColumn = {};

// Mutable state for the apply run.
let applyRawLogText = "";
let applyLiveLine = "";

// Currently streaming cleaning job, if any.
let activeRunStream = null;
let activeRunJobId = "";
let stopRequested = false;
|
| 39 |
+
|
| 40 |
+
/**
 * Copy the current UI selections into hidden inputs on an upload form, then
 * submit it. The two "apply" forms are uploaded via fetch; any other form is
 * submitted natively.
 *
 * @param {string} formId   id of the form element to submit
 * @param {string} statusId id of the element that shows upload status
 * @param {Event} [event]   triggering event; its default action is cancelled
 */
function submitUploadForm(formId, statusId, event) {
  if (event) event.preventDefault();

  const form = document.getElementById(formId);
  const status = document.getElementById(statusId);

  const hiddenFields = {
    clean_selected_sheet: sheetSelect ? sheetSelect.value : "",
    output_sheet: outputSheet ? outputSheet.value : "",
    models: models ? models.value : "",
    apply_selected_sheet: applySheetSelect ? applySheetSelect.value : ""
  };
  for (const [name, value] of Object.entries(hiddenFields)) {
    let input = form.querySelector(`input[name="${name}"]`);
    if (!input) {
      // Create the hidden input on first use so the value travels with the form.
      input = document.createElement("input");
      input.type = "hidden";
      input.name = name;
      form.appendChild(input);
    }
    input.value = value;
  }
  status.textContent = "Loading...";

  const isApplyUpload = formId === "applyWorkbookForm" || formId === "applyBlueprintForm";
  if (!isApplyUpload) {
    form.submit();
    return;
  }
  uploadApplyFile(form, status, formId);
}
|
| 70 |
+
|
| 71 |
+
/**
 * POST an apply-workbook or apply-blueprint form with fetch and update the
 * page from the JSON response: status text, file pill, sheet list, and the
 * enabled state of the Apply button.
 *
 * @param {HTMLFormElement} form   form whose action and fields are posted
 * @param {HTMLElement} status     element that shows the outcome
 * @param {string} formId          "applyWorkbookForm" or "applyBlueprintForm"
 */
async function uploadApplyFile(form, status, formId) {
  const filePill = filename =>
    filename ? `<div class="file-pill">${escapeHtml(filename)}</div>` : "";

  try {
    const request = {
      method: "POST",
      body: new FormData(form),
      headers: { "Accept": "application/json" }
    };
    const res = await fetch(form.action, request);
    const data = await res.json();

    if (!res.ok) {
      status.textContent = data.error || "Upload failed.";
      status.classList.add("error");
      return;
    }

    status.classList.remove("error");
    status.textContent = data.message || "Loaded.";

    switch (formId) {
      case "applyWorkbookForm":
        applyWorkbookPath = data.apply_workbook_path || "";
        applyWorkbookFile.innerHTML = filePill(data.apply_workbook_filename);
        break;
      case "applyBlueprintForm":
        applyBlueprintPath = data.apply_blueprint_path || "";
        applyBlueprintFile.innerHTML = filePill(data.apply_blueprint_filename);
        break;
    }

    if (data.apply_sheets) {
      setApplySheets(data.apply_sheets, data.apply_selected_sheet);
    }
    // Apply needs a workbook, a blueprint and a selected sheet.
    const readyToApply = applyWorkbookPath && applyBlueprintPath && applySheetSelect.value;
    applyButton.disabled = !readyToApply;
  } catch (error) {
    // Network failure or a non-JSON response body.
    status.textContent = "Upload failed.";
    status.classList.add("error");
  }
}
|
| 108 |
+
|
| 109 |
+
/** Reset the cleaning log, per-column progress and result panel to their empty state. */
function clearCleanOutput() {
  // In-memory state.
  cleanRawLogText = "";
  cleanLiveLine = "";
  cleanProgressOrder = [];
  cleanProgressByColumn = {};

  // Log view.
  cleanLogs.textContent = "";

  // Progress panel.
  cleanProgressPanel.classList.remove("active");
  cleanProgressSummary.textContent = "Waiting...";
  cleanProgressList.innerHTML = "";

  // Result panel.
  cleanResult.classList.remove("active");
  cleanResult.innerHTML = "";
}
|
| 121 |
+
|
| 122 |
+
/** Reset the apply log and result panel to their empty state. */
function clearApplyOutput() {
  // In-memory state.
  applyRawLogText = "";
  applyLiveLine = "";

  // Log view and result panel.
  applyLogs.textContent = "";
  applyResult.classList.remove("active");
  applyResult.innerHTML = "";
}
|
| 129 |
+
|
| 130 |
+
/**
 * Parse one progress-bar line of the form
 * "Cleaning <column>: <pct>%|<bar>| <current>/<total> [<parts, ...>]".
 *
 * ANSI colour sequences are stripped first. The first two comma-separated
 * parts inside the brackets are reported as timing; the rest as metrics.
 *
 * @param {string} line raw log line
 * @returns {{columnName: string, percent: number, current: string, total: string,
 *            timing: string, metrics: string} | null} null if the line does not match
 */
function parseProgressLine(line) {
  const withoutAnsi = line.replace(/\x1b\[[0-9;]*m/g, "").trim();
  const match = withoutAnsi.match(/^Cleaning\s+(.+?):\s+(\d+)%\|.*?\|\s+(\d+)\/(\d+)\s+\[([^\]]*)\]/);
  if (!match) return null;

  const [, rawColumn, rawPercent, current, total, bracket] = match;
  const parts = bracket
    .split(",")
    .map(part => part.trim())
    .filter(Boolean);

  const timingText = parts.slice(0, 2).join(", ");
  // A "?" means the progress bar has no estimate yet.
  const timing = timingText.includes("?") ? "estimating..." : timingText;
  const metrics = parts.slice(2).join(", ");

  return {
    columnName: rawColumn.trim(),
    percent: Number(rawPercent),
    current,
    total,
    timing,
    metrics
  };
}
|
| 147 |
+
|
| 148 |
+
/**
 * Escape the five HTML-significant characters so a value can be placed
 * inside markup assigned to innerHTML.
 *
 * @param {*} value any value; converted with String() first
 * @returns {string} text with & < > " ' replaced by HTML entities
 */
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  return String(value).replace(/[&<>"']/g, char => entities[char]);
}
|
| 157 |
+
|
| 158 |
+
/**
 * Redraw the per-column progress list from cleanProgressOrder and
 * cleanProgressByColumn, and update the column-count summary.
 */
function renderCleanProgressRows() {
  const columnCount = cleanProgressOrder.length;
  cleanProgressPanel.classList.add("active");
  cleanProgressSummary.textContent = `${columnCount} column${columnCount === 1 ? "" : "s"}`;

  const rows = [];
  for (const columnName of cleanProgressOrder) {
    const item = cleanProgressByColumn[columnName];
    // Clamp so a stray value cannot overflow the bar.
    const percent = Math.max(0, Math.min(100, item.percent));
    const meta = `${item.current}/${item.total}${item.timing ? " | " + item.timing : ""}${item.metrics ? " | " + item.metrics : ""}`;
    rows.push(`
      <div class="progress-row">
        <div class="progress-top">
          <strong>${escapeHtml(columnName)}</strong>
          <span>${percent}%</span>
        </div>
        <div class="progress-track">
          <div class="progress-fill" style="width:${percent}%"></div>
        </div>
        <div class="progress-meta">${escapeHtml(meta)}</div>
      </div>
    `);
  }
  cleanProgressList.innerHTML = rows.join("");
}
|
| 179 |
+
|
| 180 |
+
/**
 * Record a progress line for its column and redraw the progress list.
 *
 * @param {string} line raw log line
 * @returns {boolean} true if the line was a progress line, false otherwise
 */
function renderCleanProgressLine(line) {
  const progress = parseProgressLine(line);
  if (progress === null) return false;

  const { columnName } = progress;
  const isNewColumn = !cleanProgressByColumn[columnName];
  if (isNewColumn) {
    // Remember first-seen order so rows keep a stable position.
    cleanProgressOrder.push(columnName);
  }
  cleanProgressByColumn[columnName] = progress;
  renderCleanProgressRows();
  return true;
}
|
| 191 |
+
|
| 192 |
+
/**
 * Append a streamed chunk to the cleaning log and feed completed lines to the
 * progress renderer. Carriage returns are stored as newlines in the log and
 * end the current line just like a newline does.
 *
 * @param {string} chunk decoded text from the run stream
 */
function appendCleanLogChunk(chunk) {
  for (const ch of chunk) {
    const endsLine = ch === "\r" || ch === "\n";
    cleanRawLogText += ch === "\r" ? "\n" : ch;
    if (endsLine) {
      renderCleanProgressLine(cleanLiveLine);
      cleanLiveLine = "";
    } else {
      cleanLiveLine += ch;
    }
  }

  // Skip the unfinished line only when it looks like a progress bar still being written.
  const looksLikePartialBar = cleanLiveLine.startsWith("Cleaning ") && cleanLiveLine.includes("|");
  if (!looksLikePartialBar) {
    renderCleanProgressLine(cleanLiveLine);
  }

  cleanLogs.textContent = cleanRawLogText;
  cleanLogs.scrollTop = cleanLogs.scrollHeight;
}
|
| 212 |
+
|
| 213 |
+
/**
 * Show the apply result panel once the log contains the
 * "corrections injected" and/or "new approved values added" summary lines.
 */
function renderApplySummary() {
  const changed = applyRawLogText.match(/Success!\s+(\d+)\s+corrections injected/i);
  const added = applyRawLogText.match(/Memory updated:\s+(\d+)\s+new approved values added/i);
  if (!(changed || added)) return;

  // A missing summary line counts as zero.
  const changedCount = changed ? changed[1] : "0";
  const addedCount = added ? added[1] : "0";
  const changedPlural = changed && changed[1] === "1" ? "" : "s";
  const addedPlural = added && added[1] === "1" ? "" : "s";

  applyResult.classList.add("active");
  applyResult.innerHTML = `
    <strong>Blueprint applied</strong>
    <div class="status">${changedCount} workbook row value${changedPlural} updated from human overrides.</div>
    <div class="status">${addedCount} new unique reference value${addedPlural} added to manual references.</div>
    <a class="download-link" href="/download-applied-workbook">Download Cleaned Workbook</a>
  `;
}
|
| 226 |
+
|
| 227 |
+
/**
 * Append a streamed chunk to the apply log (carriage returns stored as
 * newlines), keep the view scrolled to the bottom, and refresh the summary.
 *
 * @param {string} chunk decoded text from the apply stream
 */
function appendApplyLogChunk(chunk) {
  for (const ch of chunk) {
    const isCarriageReturn = ch === "\r";
    applyRawLogText += isCarriageReturn ? "\n" : ch;
    // Track the current unfinished line; either line terminator resets it.
    applyLiveLine = isCarriageReturn || ch === "\n" ? "" : applyLiveLine + ch;
  }
  applyLogs.textContent = applyRawLogText;
  applyLogs.scrollTop = applyLogs.scrollHeight;
  renderApplySummary();
}
|
| 240 |
+
|
| 241 |
+
/** Remove the log and result entries this page stores in localStorage. */
function clearPersistedOutputs() {
  const persistedKeys = [
    "mastermap.cleanRawLogText",
    "mastermap.applyRawLogText",
    "mastermap.cleanResultHtml",
    "mastermap.applyResultHtml",
    "mastermap.cleanResultActive",
    "mastermap.applyResultActive"
  ];
  for (const key of persistedKeys) {
    localStorage.removeItem(key);
  }
}
|
| 251 |
+
|
| 252 |
+
/** Clear the active-run state and put the run button back into its "Run Cleaning" form. */
function setRunButtonIdle() {
  // Run bookkeeping.
  stopRequested = false;
  activeRunJobId = "";
  activeRunStream = null;

  // Button appearance.
  runButton.classList.remove("danger");
  runButton.textContent = "Run Cleaning";
  runButton.disabled = false;
}
|
| 260 |
+
|
| 261 |
+
/**
 * Ask the server to stop the active cleaning job. The button stays disabled
 * while the request is in flight; the run stream's own events return it to
 * the idle state once the process has ended.
 */
async function stopActiveRun() {
  if (!activeRunJobId) return;
  stopRequested = true;
  runButton.disabled = true;
  runButton.textContent = "Stopping...";
  runStatus.textContent = "Stopping run...";
  try {
    await fetch(`/stop?job_id=${encodeURIComponent(activeRunJobId)}`, { method: "POST" });
  } catch (error) {
    // The stop request never reached the server. If the run is still
    // streaming, let the user try again instead of leaving the button
    // disabled on "Stopping...".
    if (activeRunStream) {
      stopRequested = false;
      runButton.disabled = false;
      runButton.textContent = "Stop Cleaning";
      runStatus.textContent = "Could not stop run.";
    }
  }
}
|
| 269 |
+
|
| 270 |
+
/**
 * Select a sheet in the apply dropdown, adding it as an option first if it
 * is not listed yet, and enable the dropdown.
 *
 * @param {string} sheetName sheet to select; empty values are ignored
 */
function selectApplySheet(sheetName) {
  if (!sheetName) return;

  let alreadyListed = false;
  for (const option of applySheetSelect.options) {
    if (option.value === sheetName) {
      alreadyListed = true;
      break;
    }
  }
  if (!alreadyListed) {
    const option = document.createElement("option");
    option.value = sheetName;
    option.textContent = sheetName;
    applySheetSelect.appendChild(option);
  }

  applySheetSelect.value = sheetName;
  applySheetSelect.disabled = false;
}
|
| 282 |
+
|
| 283 |
+
/**
 * Replace the apply dropdown's options with the given sheet names.
 *
 * The preferred sheet (or, failing that, the previous selection) is selected
 * when it is in the list; otherwise the first sheet is selected. The
 * dropdown is disabled when the list is empty.
 *
 * @param {string[]} sheets           sheet names to list
 * @param {string} [preferredSheet]   sheet to select if present
 */
function setApplySheets(sheets, preferredSheet) {
  // Read the previous selection before the options are wiped.
  const wanted = preferredSheet || applySheetSelect.value;

  applySheetSelect.innerHTML = "";
  for (const sheetName of sheets) {
    const option = document.createElement("option");
    option.value = sheetName;
    option.textContent = sheetName;
    applySheetSelect.appendChild(option);
  }

  const hasSheets = sheets.length > 0;
  if (wanted && sheets.includes(wanted)) {
    applySheetSelect.value = wanted;
  } else if (hasSheets) {
    applySheetSelect.value = sheets[0];
  }
  applySheetSelect.disabled = !hasSheets;
}
|
| 299 |
+
|
| 300 |
+
/**
 * Reload the apply dropdown from the server's sheet list for the current
 * apply workbook. Without a workbook path, or when the request is not OK,
 * only the preferred sheet is selected.
 *
 * @param {string} [preferredSheet] sheet to select afterwards
 */
async function refreshApplySheets(preferredSheet) {
  if (!applyWorkbookPath) {
    return selectApplySheet(preferredSheet);
  }

  const query = new URLSearchParams({ path: applyWorkbookPath }).toString();
  const res = await fetch(`/sheets?${query}`);
  if (res.ok) {
    const data = await res.json();
    setApplySheets(data.sheets || [], preferredSheet);
  } else {
    selectApplySheet(preferredSheet);
  }
}
|
| 311 |
+
|
| 312 |
+
/**
 * Query /references/status and enable or disable the save button to match.
 *
 * @param {boolean} [updateText=true] also write the outcome into the
 *   references status line; pass false to keep the current message
 */
async function refreshReferenceSyncStatus(updateText = true) {
  let enabled = false;
  let message = "Reference sync status unavailable.";
  try {
    const res = await fetch("/references/status");
    const data = await res.json();
    enabled = data.enabled;
    message = enabled
      ? `Ready to save manual references to ${data.space_id}.`
      : data.reason || "Reference sync is unavailable.";
    saveReferencesButton.disabled = !enabled;
  } catch (error) {
    // Status could not be fetched or parsed; keep saving disabled.
    saveReferencesButton.disabled = true;
    message = "Reference sync status unavailable.";
  }
  if (updateText) {
    referencesStatus.textContent = message;
  }
}
|
| 329 |
+
|
| 330 |
+
// Startup: drop stale persisted output and load the reference-sync state.
clearPersistedOutputs();
refreshReferenceSyncStatus();

// Each apply upload starts on form submit and as soon as a file is chosen.
const applyUploads = [
  { form: applyWorkbookForm, input: applyWorkbookInput, formId: "applyWorkbookForm", statusId: "applyWorkbookUploadStatus" },
  { form: applyBlueprintForm, input: applyBlueprintInput, formId: "applyBlueprintForm", statusId: "applyBlueprintUploadStatus" }
];
for (const { form, input, formId, statusId } of applyUploads) {
  const startUpload = event => {
    submitUploadForm(formId, statusId, event);
  };
  form.addEventListener("submit", startUpload);
  input.addEventListener("change", startUpload);
}
|
| 345 |
+
|
| 346 |
+
// Fetch the model list from the server and put it into the models field.
fetchModels.addEventListener("click", async () => {
  fetchModels.disabled = true;
  runStatus.textContent = "Fetching Groq models...";
  try {
    const res = await fetch("/models");
    const data = await res.json();
    if (!res.ok) {
      runStatus.textContent = data.error || "Could not fetch models";
      return;
    }
    models.value = data.models.join(",");
    runStatus.textContent = "Model list updated.";
  } catch (error) {
    // Network failure or an unexpected response body.
    runStatus.textContent = "Could not fetch models";
  } finally {
    // Always re-enable, so one failed request cannot lock the button.
    fetchModels.disabled = false;
  }
});
|
| 359 |
+
|
| 360 |
+
// Save manual references via the server and report the outcome.
saveReferencesButton.addEventListener("click", async () => {
  saveReferencesButton.disabled = true;
  referencesStatus.textContent = "Saving manual references...";
  try {
    const res = await fetch("/references/save", { method: "POST" });
    const data = await res.json();
    if (!res.ok) {
      // Keep this message visible: the refresh in `finally` runs with text
      // updates off, so it will not overwrite the error.
      referencesStatus.textContent = data.error || "Could not save manual references.";
      return;
    }
    referencesStatus.textContent = data.message || "Manual references saved.";
  } catch (error) {
    referencesStatus.textContent = "Could not save manual references.";
  } finally {
    // Restore the button's enabled state without touching the status text.
    await refreshReferenceSyncStatus(false);
  }
});
|
| 378 |
+
|
| 379 |
+
// Start a cleaning run, or stop the active one if a run is already streaming.
runButton.addEventListener("click", () => {
  if (activeRunStream) {
    stopActiveRun();
    return;
  }
  if (!cleanPath || !sheetSelect.value) return;

  clearCleanOutput();
  // Fall back to a timestamp where crypto.randomUUID is unavailable.
  activeRunJobId = window.crypto && window.crypto.randomUUID ? window.crypto.randomUUID() : String(Date.now());
  stopRequested = false;
  runButton.disabled = false;
  runButton.textContent = "Stop Cleaning";
  runButton.classList.add("danger");
  runStatus.textContent = "Running...";

  const params = new URLSearchParams({
    job_id: activeRunJobId,
    input: cleanPath,
    sheet: sheetSelect.value,
    output_sheet: outputSheet.value || defaultOutputSheet,
    models: models.value.trim()
  });
  const stream = new EventSource(`/run?${params.toString()}`);
  activeRunStream = stream;
  stream.onmessage = event => appendCleanLogChunk(JSON.parse(event.data));

  stream.addEventListener("done", async () => {
    stream.close();
    try {
      if (stopRequested) {
        runStatus.textContent = "Stopped.";
        return;
      }
      applyBlueprintPath = "data/Blueprint.xlsx";
      const targetSheet = outputSheet.value || defaultOutputSheet;
      await refreshApplySheets(targetSheet);
      applyButton.disabled = !(applyWorkbookPath && applyBlueprintPath && applySheetSelect.value);
      applyStatus.textContent = "Generated blueprint is ready for Apply Blueprint.";
      cleanResult.classList.add("active");
      cleanResult.innerHTML = `
        <strong>Blueprint generated</strong>
        <div class="status">Blueprint saved at data/Blueprint.xlsx</div>
        <a class="download-link" href="/download-blueprint">Download Blueprint</a>
        <a class="download-link" href="/download-cleaned-workbook">Download Cleaned Workbook</a>
      `;
      runStatus.textContent = "Finished.";
    } finally {
      // Reset the button even if refreshing the sheet list threw.
      setRunButtonIdle();
    }
  });

  stream.addEventListener("failed", () => {
    stream.close();
    // A stopped process exits non-zero, so a user-requested stop arrives here.
    // Read the flag before setRunButtonIdle() clears it.
    const wasStopped = stopRequested;
    setRunButtonIdle();
    runStatus.textContent = wasStopped ? "Stopped." : "Run failed. Check logs.";
  });

  stream.addEventListener("error", () => {
    stream.close();
    setRunButtonIdle();
    runStatus.textContent = "Run stopped. Check logs.";
  });
});
|
| 434 |
+
|
| 435 |
+
// Apply the blueprint to the selected workbook sheet and stream the log.
applyButton.addEventListener("click", () => {
  const ready = applyWorkbookPath && applyBlueprintPath && applySheetSelect.value;
  if (!ready) return;

  clearApplyOutput();
  applyButton.disabled = true;
  applyStatus.textContent = "Applying blueprint...";

  const query = new URLSearchParams({
    input: applyWorkbookPath,
    blueprint: applyBlueprintPath,
    sheet: applySheetSelect.value
  }).toString();
  const stream = new EventSource(`/apply?${query}`);

  // Every terminal event closes the stream and re-enables the button.
  const finish = statusText => {
    stream.close();
    applyButton.disabled = false;
    applyStatus.textContent = statusText;
  };

  stream.onmessage = event => appendApplyLogChunk(JSON.parse(event.data));
  stream.addEventListener("done", () => {
    finish("Finished.");
    renderApplySummary();
  });
  stream.addEventListener("failed", () => finish("Apply failed. Check logs."));
  stream.addEventListener("error", () => finish("Apply stopped. Check logs."));
});
|
ui/static/styles.css
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--bg: #f6f7f9;
|
| 3 |
+
--panel: #ffffff;
|
| 4 |
+
--text: #17202a;
|
| 5 |
+
--muted: #627386;
|
| 6 |
+
--line: #d9e0e7;
|
| 7 |
+
--accent: #2457c5;
|
| 8 |
+
--accent-dark: #1d459c;
|
| 9 |
+
--danger: #b42318;
|
| 10 |
+
}
|
| 11 |
+
* { box-sizing: border-box; }
|
| 12 |
+
body {
|
| 13 |
+
margin: 0;
|
| 14 |
+
font-family: Arial, sans-serif;
|
| 15 |
+
color: var(--text);
|
| 16 |
+
background: var(--bg);
|
| 17 |
+
}
|
| 18 |
+
header {
|
| 19 |
+
padding: 20px 28px;
|
| 20 |
+
background: var(--panel);
|
| 21 |
+
border-bottom: 1px solid var(--line);
|
| 22 |
+
}
|
| 23 |
+
h1 { margin: 0; font-size: 22px; }
|
| 24 |
+
h2 { font-size: 16px; margin: 0 0 12px; }
|
| 25 |
+
main {
|
| 26 |
+
display: grid;
|
| 27 |
+
grid-template-columns: 380px 1fr;
|
| 28 |
+
gap: 18px;
|
| 29 |
+
padding: 18px 28px 28px;
|
| 30 |
+
}
|
| 31 |
+
section {
|
| 32 |
+
background: var(--panel);
|
| 33 |
+
border: 1px solid var(--line);
|
| 34 |
+
border-radius: 8px;
|
| 35 |
+
padding: 16px;
|
| 36 |
+
}
|
| 37 |
+
label {
|
| 38 |
+
display: block;
|
| 39 |
+
margin: 14px 0 6px;
|
| 40 |
+
font-weight: 700;
|
| 41 |
+
font-size: 13px;
|
| 42 |
+
}
|
| 43 |
+
input, select, textarea, button {
|
| 44 |
+
width: 100%;
|
| 45 |
+
font: inherit;
|
| 46 |
+
border: 1px solid var(--line);
|
| 47 |
+
border-radius: 6px;
|
| 48 |
+
padding: 10px 11px;
|
| 49 |
+
background: #fff;
|
| 50 |
+
}
|
| 51 |
+
textarea {
|
| 52 |
+
min-height: 86px;
|
| 53 |
+
resize: vertical;
|
| 54 |
+
font-family: Consolas, monospace;
|
| 55 |
+
font-size: 13px;
|
| 56 |
+
}
|
| 57 |
+
button {
|
| 58 |
+
cursor: pointer;
|
| 59 |
+
background: var(--accent);
|
| 60 |
+
color: white;
|
| 61 |
+
border-color: var(--accent);
|
| 62 |
+
font-weight: 700;
|
| 63 |
+
margin-top: 10px;
|
| 64 |
+
}
|
| 65 |
+
button:hover { background: var(--accent-dark); }
|
| 66 |
+
button.secondary {
|
| 67 |
+
background: white;
|
| 68 |
+
color: var(--accent);
|
| 69 |
+
}
|
| 70 |
+
button.danger {
|
| 71 |
+
background: white;
|
| 72 |
+
color: var(--danger);
|
| 73 |
+
border-color: var(--danger);
|
| 74 |
+
}
|
| 75 |
+
button.danger:hover { background: #fff1f0; }
|
| 76 |
+
button:disabled {
|
| 77 |
+
opacity: 0.55;
|
| 78 |
+
cursor: not-allowed;
|
| 79 |
+
}
|
| 80 |
+
.dropbox {
|
| 81 |
+
border: 2px dashed #aeb9c7;
|
| 82 |
+
border-radius: 8px;
|
| 83 |
+
background: #fbfcfe;
|
| 84 |
+
padding: 14px;
|
| 85 |
+
color: var(--muted);
|
| 86 |
+
}
|
| 87 |
+
.dropbox strong {
|
| 88 |
+
display: block;
|
| 89 |
+
color: var(--text);
|
| 90 |
+
margin-bottom: 8px;
|
| 91 |
+
}
|
| 92 |
+
.dropbox input {
|
| 93 |
+
background: white;
|
| 94 |
+
cursor: pointer;
|
| 95 |
+
}
|
| 96 |
+
.file-pill {
|
| 97 |
+
border: 1px solid var(--line);
|
| 98 |
+
border-radius: 6px;
|
| 99 |
+
padding: 10px 11px;
|
| 100 |
+
background: #f8fafc;
|
| 101 |
+
color: var(--text);
|
| 102 |
+
word-break: break-word;
|
| 103 |
+
margin-bottom: 8px;
|
| 104 |
+
}
|
| 105 |
+
.status {
|
| 106 |
+
margin-top: 10px;
|
| 107 |
+
color: var(--muted);
|
| 108 |
+
font-size: 13px;
|
| 109 |
+
min-height: 18px;
|
| 110 |
+
}
|
| 111 |
+
.status.error { color: var(--danger); }
|
| 112 |
+
.progress-panel {
|
| 113 |
+
display: none;
|
| 114 |
+
margin-bottom: 12px;
|
| 115 |
+
background: #ffffff;
|
| 116 |
+
border: 1px solid var(--line);
|
| 117 |
+
border-radius: 8px;
|
| 118 |
+
padding: 12px;
|
| 119 |
+
}
|
| 120 |
+
.progress-panel.active {
|
| 121 |
+
display: block;
|
| 122 |
+
}
|
| 123 |
+
.progress-heading {
|
| 124 |
+
display: flex;
|
| 125 |
+
justify-content: space-between;
|
| 126 |
+
gap: 12px;
|
| 127 |
+
align-items: center;
|
| 128 |
+
font-size: 13px;
|
| 129 |
+
font-weight: 700;
|
| 130 |
+
margin-bottom: 10px;
|
| 131 |
+
}
|
| 132 |
+
.progress-list {
|
| 133 |
+
display: grid;
|
| 134 |
+
gap: 10px;
|
| 135 |
+
}
|
| 136 |
+
.progress-row {
|
| 137 |
+
border: 1px solid var(--line);
|
| 138 |
+
border-radius: 6px;
|
| 139 |
+
padding: 10px;
|
| 140 |
+
background: #f8fafc;
|
| 141 |
+
}
|
| 142 |
+
.progress-top {
|
| 143 |
+
display: flex;
|
| 144 |
+
justify-content: space-between;
|
| 145 |
+
gap: 12px;
|
| 146 |
+
font-size: 13px;
|
| 147 |
+
margin-bottom: 8px;
|
| 148 |
+
}
|
| 149 |
+
.progress-track {
|
| 150 |
+
height: 10px;
|
| 151 |
+
background: #e8edf3;
|
| 152 |
+
border-radius: 999px;
|
| 153 |
+
overflow: hidden;
|
| 154 |
+
}
|
| 155 |
+
.progress-fill {
|
| 156 |
+
height: 100%;
|
| 157 |
+
width: 0%;
|
| 158 |
+
background: var(--accent);
|
| 159 |
+
}
|
| 160 |
+
.progress-meta {
|
| 161 |
+
margin-top: 8px;
|
| 162 |
+
color: var(--muted);
|
| 163 |
+
font-size: 12px;
|
| 164 |
+
line-height: 1.4;
|
| 165 |
+
}
|
| 166 |
+
.blueprint-result {
|
| 167 |
+
display: none;
|
| 168 |
+
margin-bottom: 12px;
|
| 169 |
+
border: 1px solid var(--line);
|
| 170 |
+
border-radius: 8px;
|
| 171 |
+
background: #f8fafc;
|
| 172 |
+
padding: 12px;
|
| 173 |
+
font-size: 13px;
|
| 174 |
+
}
|
| 175 |
+
.blueprint-result.active {
|
| 176 |
+
display: block;
|
| 177 |
+
}
|
| 178 |
+
.download-link {
|
| 179 |
+
display: inline-block;
|
| 180 |
+
color: white;
|
| 181 |
+
background: var(--accent);
|
| 182 |
+
border-radius: 6px;
|
| 183 |
+
padding: 9px 11px;
|
| 184 |
+
text-decoration: none;
|
| 185 |
+
font-weight: 700;
|
| 186 |
+
margin-top: 8px;
|
| 187 |
+
}
|
| 188 |
+
.download-link:hover {
|
| 189 |
+
background: var(--accent-dark);
|
| 190 |
+
}
|
| 191 |
+
.raw-logs {
|
| 192 |
+
border: 1px solid var(--line);
|
| 193 |
+
border-radius: 8px;
|
| 194 |
+
background: #ffffff;
|
| 195 |
+
padding: 10px;
|
| 196 |
+
}
|
| 197 |
+
.raw-logs summary {
|
| 198 |
+
cursor: pointer;
|
| 199 |
+
font-size: 13px;
|
| 200 |
+
font-weight: 700;
|
| 201 |
+
}
|
| 202 |
+
.output-block {
|
| 203 |
+
margin-bottom: 14px;
|
| 204 |
+
}
|
| 205 |
+
.output-block h2 {
|
| 206 |
+
margin-bottom: 10px;
|
| 207 |
+
}
|
| 208 |
+
.log-screen {
|
| 209 |
+
height: 420px;
|
| 210 |
+
margin-top: 10px;
|
| 211 |
+
overflow: auto;
|
| 212 |
+
background: #0f172a;
|
| 213 |
+
color: #dbeafe;
|
| 214 |
+
border-radius: 8px;
|
| 215 |
+
padding: 14px;
|
| 216 |
+
white-space: pre-wrap;
|
| 217 |
+
font-family: Consolas, monospace;
|
| 218 |
+
font-size: 13px;
|
| 219 |
+
line-height: 1.45;
|
| 220 |
+
}
|
| 221 |
+
hr {
|
| 222 |
+
border: 0;
|
| 223 |
+
border-top: 1px solid var(--line);
|
| 224 |
+
margin: 18px 0;
|
| 225 |
+
}
|
ui/templates/index.html
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>MasterMap Cleaner</title>
|
| 7 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
|
| 8 |
+
|
| 9 |
+
</head>
|
| 10 |
+
<body>
|
| 11 |
+
<header>
|
| 12 |
+
<h1>MasterMap Cleaner</h1>
|
| 13 |
+
</header>
|
| 14 |
+
<main>
|
| 15 |
+
<section>
|
| 16 |
+
<h2>Dataset to Clean</h2>
|
| 17 |
+
{% if clean_filename %}
|
| 18 |
+
<div class="file-pill">{{ clean_filename }}</div>
|
| 19 |
+
<form action="/remove-clean" method="post">
|
| 20 |
+
<button class="danger" type="submit">Remove File</button>
|
| 21 |
+
</form>
|
| 22 |
+
{% else %}
|
| 23 |
+
<form id="cleanUploadForm" action="/prepare-clean" method="post" enctype="multipart/form-data">
|
| 24 |
+
<div class="dropbox">
|
| 25 |
+
<strong>Drop or select an Excel file</strong>
|
| 26 |
+
<input name="file" type="file" accept=".xlsx,.xlsm" required onchange="submitUploadForm('cleanUploadForm', 'cleanUploadStatus')">
|
| 27 |
+
</div>
|
| 28 |
+
<div id="cleanUploadStatus" class="status"></div>
|
| 29 |
+
</form>
|
| 30 |
+
{% endif %}
|
| 31 |
+
|
| 32 |
+
<label for="sheetSelect">Source Sheet</label>
|
| 33 |
+
<select id="sheetSelect" {% if not clean_sheets %}disabled{% endif %}>
|
| 34 |
+
{% for sheet in clean_sheets %}
|
| 35 |
+
<option value="{{ sheet }}" {% if sheet == clean_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
|
| 36 |
+
{% endfor %}
|
| 37 |
+
</select>
|
| 38 |
+
|
| 39 |
+
<label for="outputSheet">Output Sheet</label>
|
| 40 |
+
<input id="outputSheet" value="{{ output_sheet }}">
|
| 41 |
+
|
| 42 |
+
<label for="models">Groq Models</label>
|
| 43 |
+
<textarea id="models">{{ models or default_models }}</textarea>
|
| 44 |
+
<button id="fetchModels" class="secondary" type="button">Fetch Available Models</button>
|
| 45 |
+
|
| 46 |
+
<button id="runButton" type="button" {% if not clean_path %}disabled{% endif %}>Run Cleaning</button>
|
| 47 |
+
<div id="runStatus" class="status">{{ message }}</div>
|
| 48 |
+
|
| 49 |
+
<hr>
|
| 50 |
+
|
| 51 |
+
<h2>Apply Blueprint</h2>
|
| 52 |
+
<label>Workbook</label>
|
| 53 |
+
<div id="applyWorkbookFile">
|
| 54 |
+
{% if apply_workbook_filename %}
|
| 55 |
+
<div class="file-pill">{{ apply_workbook_filename }}</div>
|
| 56 |
+
{% endif %}
|
| 57 |
+
</div>
|
| 58 |
+
<form id="applyWorkbookForm" action="/prepare-apply-workbook" method="post" enctype="multipart/form-data" onsubmit="return false;">
|
| 59 |
+
<div class="dropbox">
|
| 60 |
+
<strong>Drop or select workbook</strong>
|
| 61 |
+
<input id="applyWorkbookInput" name="file" type="file" accept=".xlsx,.xlsm" required>
|
| 62 |
+
</div>
|
| 63 |
+
<div id="applyWorkbookUploadStatus" class="status"></div>
|
| 64 |
+
</form>
|
| 65 |
+
|
| 66 |
+
<label for="applySheetSelect">Sheet To Update</label>
|
| 67 |
+
<select id="applySheetSelect" {% if not apply_sheets %}disabled{% endif %}>
|
| 68 |
+
{% for sheet in apply_sheets %}
|
| 69 |
+
<option value="{{ sheet }}" {% if sheet == apply_selected_sheet %}selected{% endif %}>{{ sheet }}</option>
|
| 70 |
+
{% endfor %}
|
| 71 |
+
</select>
|
| 72 |
+
|
| 73 |
+
<label>Blueprint</label>
|
| 74 |
+
<div id="applyBlueprintFile">
|
| 75 |
+
{% if apply_blueprint_filename %}
|
| 76 |
+
<div class="file-pill">{{ apply_blueprint_filename }}</div>
|
| 77 |
+
{% endif %}
|
| 78 |
+
</div>
|
| 79 |
+
<form id="applyBlueprintForm" action="/prepare-apply-blueprint" method="post" enctype="multipart/form-data" onsubmit="return false;">
|
| 80 |
+
<div class="dropbox">
|
| 81 |
+
<strong>Drop or select blueprint</strong>
|
| 82 |
+
<input id="applyBlueprintInput" name="file" type="file" accept=".xlsx,.xlsm" required>
|
| 83 |
+
</div>
|
| 84 |
+
<div id="applyBlueprintUploadStatus" class="status"></div>
|
| 85 |
+
</form>
|
| 86 |
+
|
| 87 |
+
<button id="applyButton" type="button" {% if not can_apply %}disabled{% endif %}>Apply Blueprint</button>
|
| 88 |
+
<div id="applyStatus" class="status error">{{ error }}</div>
|
| 89 |
+
|
| 90 |
+
<hr>
|
| 91 |
+
|
| 92 |
+
<h2>References</h2>
|
| 93 |
+
<button id="saveReferencesButton" class="secondary" type="button" disabled>Save Manual References</button>
|
| 94 |
+
<div id="referencesStatus" class="status"></div>
|
| 95 |
+
</section>
|
| 96 |
+
|
| 97 |
+
<section>
|
| 98 |
+
<div class="output-block">
|
| 99 |
+
<h2>Cleaning Output</h2>
|
| 100 |
+
<div id="cleanProgressPanel" class="progress-panel">
|
| 101 |
+
<div class="progress-heading">
|
| 102 |
+
<span>Cleaning Progress</span>
|
| 103 |
+
<span id="cleanProgressSummary">Waiting...</span>
|
| 104 |
+
</div>
|
| 105 |
+
<div id="cleanProgressList" class="progress-list"></div>
|
| 106 |
+
</div>
|
| 107 |
+
<div id="cleanResult" class="blueprint-result"></div>
|
| 108 |
+
<details id="cleanRawLogDetails" class="raw-logs">
|
| 109 |
+
<summary>Raw cleaning logs</summary>
|
| 110 |
+
<div id="cleanLogs" class="log-screen"></div>
|
| 111 |
+
</details>
|
| 112 |
+
</div>
|
| 113 |
+
|
| 114 |
+
<div class="output-block">
|
| 115 |
+
<h2>Apply Output</h2>
|
| 116 |
+
<div id="applyResult" class="blueprint-result"></div>
|
| 117 |
+
<details id="applyRawLogDetails" class="raw-logs">
|
| 118 |
+
<summary>Raw apply logs</summary>
|
| 119 |
+
<div id="applyLogs" class="log-screen"></div>
|
| 120 |
+
</details>
|
| 121 |
+
</div>
|
| 122 |
+
</section>
|
| 123 |
+
</main>
|
| 124 |
+
|
| 125 |
+
<script>
|
| 126 |
+
window.MASTERMAP_CONFIG = {
|
| 127 |
+
cleanPath: {{ clean_path|tojson }},
|
| 128 |
+
applyWorkbookPath: {{ apply_workbook_path|tojson }},
|
| 129 |
+
applyBlueprintPath: {{ apply_blueprint_path|tojson }},
|
| 130 |
+
defaultOutputSheet: {{ default_output_sheet|tojson }}
|
| 131 |
+
};
|
| 132 |
+
</script>
|
| 133 |
+
<script src="{{ url_for('static', filename='app.js') }}"></script>
|
| 134 |
+
|
| 135 |
+
</body>
|
| 136 |
+
</html>
|
ui_app.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import uuid
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from flask import Flask, Response, jsonify, render_template, request, send_file
|
| 6 |
+
|
| 7 |
+
from newest_model import PREFERRED_PRODUCTION_CHAT_MODELS, select_groq_chat_models
|
| 8 |
+
from src.config import AVAILABLE_MODELS, DATA_DIR, DEFAULT_OUTPUT_SHEET_NAME
|
| 9 |
+
from src.process_runner import stop_process, stream_process
|
| 10 |
+
from src.utils import reference_sync_status, save_manual_references_to_hub
|
| 11 |
+
from src.workbook_io import read_workbook_sheets, resolve_allowed_path, save_uploaded_excel
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Directory containing this module; every other path below is derived from it.
APP_ROOT = Path(__file__).resolve().parent
# Uploads are written here; the directory is created eagerly at import time.
UPLOAD_DIR = APP_ROOT / DATA_DIR / "uploads"
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
# Roots passed to resolve_allowed_path whenever a path is resolved.
ALLOWED_FILE_ROOTS = [APP_ROOT / DATA_DIR]

app = Flask(
    __name__,
    static_folder=str(APP_ROOT / "ui" / "static"),
    template_folder=str(APP_ROOT / "ui" / "templates"),
)
# 100 MiB cap on incoming request bodies.
app.config["MAX_CONTENT_LENGTH"] = 100 * 1024 * 1024

# Module-level UI state: a single dict read and mutated by the request handlers.
STATE = dict(
    clean_path="",
    clean_filename="",
    clean_sheets=[],
    clean_selected_sheet="",
    output_sheet=DEFAULT_OUTPUT_SHEET_NAME,
    models="",
    apply_workbook_path="",
    apply_workbook_filename="",
    apply_sheets=[],
    apply_selected_sheet="",
    apply_blueprint_path="",
    apply_blueprint_filename="",
)
|
| 40 |
+
@app.after_request
def prevent_browser_cache(response):
    """Set cache-disabling headers on every outgoing response and return it."""
    no_cache_headers = (
        ("Cache-Control", "no-store, no-cache, must-revalidate, max-age=0"),
        ("Pragma", "no-cache"),
        ("Expires", "0"),
    )
    for header_name, header_value in no_cache_headers:
        response.headers[header_name] = header_value
    return response
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def default_models() -> str:
    """Return the comma-separated model list used when the user supplied none.

    Entries of AVAILABLE_MODELS are kept, in their original order and casing,
    when their lower-cased id matches a lower-cased entry of
    PREFERRED_PRODUCTION_CHAT_MODELS. If nothing matches, the preferred list
    itself is joined and returned.
    """
    wanted = {name.lower() for name in PREFERRED_PRODUCTION_CHAT_MODELS}
    matches = []
    for candidate in AVAILABLE_MODELS:
        if candidate.lower() in wanted:
            matches.append(candidate)
    if matches:
        return ",".join(matches)
    return ",".join(PREFERRED_PRODUCTION_CHAT_MODELS)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def render_page(message: str = "", error: str = ""):
    """Render index.html from the current STATE plus a status message and error.

    Before rendering, the selected clean/apply sheets are re-validated against
    their sheet lists via pick_sheet (only when the corresponding list is
    non-empty).
    """
    sheet_slots = (
        ("clean_sheets", "clean_selected_sheet"),
        ("apply_sheets", "apply_selected_sheet"),
    )
    for sheets_key, selected_key in sheet_slots:
        available = STATE[sheets_key]
        if available:
            STATE[selected_key] = pick_sheet(available, STATE[selected_key])

    context = dict(STATE)
    context.update(
        default_output_sheet=DEFAULT_OUTPUT_SHEET_NAME,
        default_models=default_models(),
        can_apply=can_apply_blueprint(),
        message=message,
        error=error,
    )
    return render_template("index.html", **context)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def can_apply_blueprint() -> bool:
    """Return True when a workbook, a blueprint, a sheet list and a selected sheet are all set."""
    required_keys = (
        "apply_workbook_path",
        "apply_blueprint_path",
        "apply_sheets",
        "apply_selected_sheet",
    )
    return all(STATE[key] for key in required_keys)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def wants_json_response() -> bool:
    """Return True when the request's Accept header mentions application/json."""
    accept_header = request.headers.get("Accept", "")
    return "application/json" in accept_header
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def ui_state_payload(message: str = "", error: str = ""):
    """Build the JSON-serialisable snapshot of the apply-related STATE.

    The result contains the given message and error, every apply_* entry of
    STATE, and the current can_apply flag.
    """
    payload = {"message": message, "error": error}
    mirrored_keys = (
        "apply_workbook_path",
        "apply_workbook_filename",
        "apply_sheets",
        "apply_selected_sheet",
        "apply_blueprint_path",
        "apply_blueprint_filename",
    )
    for key in mirrored_keys:
        payload[key] = STATE[key]
    payload["can_apply"] = can_apply_blueprint()
    return payload
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def pick_sheet(sheets, preferred_sheet=None):
    """Choose which sheet of *sheets* should be selected.

    Preference order: *preferred_sheet* when it is truthy and present, then the
    configured output sheet from STATE when present, then the first sheet.
    Returns "" when *sheets* is empty and nothing matched.
    """
    if preferred_sheet and preferred_sheet in sheets:
        return preferred_sheet

    configured = STATE["output_sheet"]
    if configured in sheets:
        return configured

    if not sheets:
        return ""
    return sheets[0]
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def update_ui_state_from_form(form):
    """Copy submitted form fields into STATE, keeping the old value for empty fields.

    The output sheet additionally falls back to DEFAULT_OUTPUT_SHEET_NAME when
    both the form field and the stored value are empty.
    """

    def _submitted_or_current(key):
        # A missing or empty form value leaves the stored value in place.
        return form.get(key) or STATE[key]

    STATE["clean_selected_sheet"] = _submitted_or_current("clean_selected_sheet")
    STATE["output_sheet"] = _submitted_or_current("output_sheet") or DEFAULT_OUTPUT_SHEET_NAME
    STATE["models"] = _submitted_or_current("models")
    STATE["apply_selected_sheet"] = _submitted_or_current("apply_selected_sheet")
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@app.route("/")
def index():
    """Serve the main page with no status message and no error."""
    page = render_page()
    return page
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@app.route("/prepare-clean", methods=["POST"])
def prepare_clean():
    """Store an uploaded workbook as both the cleaning input and the apply workbook.

    Any failure while saving or reading the upload is shown on the re-rendered
    page as an error.
    """
    try:
        update_ui_state_from_form(request.form)
        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
        sheets = read_workbook_sheets(path)
    except Exception as exc:
        return render_page(error=str(exc))

    form = request.form
    saved_path = str(path)

    # The uploaded workbook becomes the dataset to clean ...
    STATE["clean_path"] = saved_path
    STATE["clean_filename"] = filename
    STATE["clean_sheets"] = sheets
    STATE["clean_selected_sheet"] = pick_sheet(sheets, form.get("clean_selected_sheet"))

    # ... and is also pre-selected as the workbook for "Apply Blueprint".
    apply_preference = form.get("apply_selected_sheet") or STATE["output_sheet"]
    STATE["apply_workbook_path"] = saved_path
    STATE["apply_workbook_filename"] = filename
    STATE["apply_sheets"] = sheets
    STATE["apply_selected_sheet"] = pick_sheet(sheets, apply_preference)

    return render_page(message=f"Loaded {filename}.")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
@app.route("/remove-clean", methods=["POST"])
def remove_clean():
    """Forget the current cleaning workbook (STATE only; this handler touches no files).

    The apply workbook is reset as well when it points at the same path.
    """
    update_ui_state_from_form(request.form)

    removed_path = STATE["clean_path"]
    STATE.update(
        clean_path="",
        clean_filename="",
        clean_sheets=[],
        clean_selected_sheet="",
    )

    if STATE["apply_workbook_path"] == removed_path:
        STATE.update(
            apply_workbook_path="",
            apply_workbook_filename="",
            apply_sheets=[],
            apply_selected_sheet="",
        )

    return render_page(message="File removed.")
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
@app.route("/prepare-apply-workbook", methods=["POST"])
def prepare_apply_workbook():
    """Store an uploaded workbook as the target of "Apply Blueprint".

    Responds with JSON (HTTP 400 on failure) when the client asks for
    application/json, otherwise with the re-rendered page.
    """
    try:
        update_ui_state_from_form(request.form)
        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
        sheets = read_workbook_sheets(path)
    except Exception as exc:
        reason = str(exc)
        if not wants_json_response():
            return render_page(error=reason)
        return jsonify(ui_state_payload(error=reason)), 400

    STATE["apply_workbook_path"] = str(path)
    STATE["apply_workbook_filename"] = filename
    STATE["apply_sheets"] = sheets
    STATE["apply_selected_sheet"] = pick_sheet(sheets, request.form.get("apply_selected_sheet"))

    notice = f"Loaded apply workbook {filename}."
    if not wants_json_response():
        return render_page(message=notice)
    return jsonify(ui_state_payload(message=notice))
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@app.route("/prepare-apply-blueprint", methods=["POST"])
def prepare_apply_blueprint():
    """Store an uploaded blueprint workbook for "Apply Blueprint".

    Before saving the upload, the apply sheet list is re-read from the current
    apply workbook if that file still exists. Responds with JSON (HTTP 400 on
    failure) when the client asks for application/json, otherwise with the
    re-rendered page.
    """
    try:
        update_ui_state_from_form(request.form)

        current_workbook = STATE["apply_workbook_path"]
        if current_workbook and Path(current_workbook).is_file():
            STATE["apply_sheets"] = read_workbook_sheets(Path(STATE["apply_workbook_path"]))
            STATE["apply_selected_sheet"] = pick_sheet(
                STATE["apply_sheets"],
                request.form.get("apply_selected_sheet"),
            )

        filename, path = save_uploaded_excel(request.files.get("file"), UPLOAD_DIR)
    except Exception as exc:
        reason = str(exc)
        if not wants_json_response():
            return render_page(error=reason)
        return jsonify(ui_state_payload(error=reason)), 400

    STATE["apply_blueprint_path"] = str(path)
    STATE["apply_blueprint_filename"] = filename

    notice = f"Loaded blueprint {filename}."
    if not wants_json_response():
        return render_page(message=notice)
    return jsonify(ui_state_payload(message=notice))
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
@app.route("/models")
def models_endpoint():
    """Return the models chosen by select_groq_chat_models as JSON.

    At most len(PREFERRED_PRODUCTION_CHAT_MODELS) models are requested; any
    failure is reported as {"error": ...} with HTTP 500.
    """
    try:
        models = select_groq_chat_models(limit=len(PREFERRED_PRODUCTION_CHAT_MODELS))
    except Exception as exc:
        failure = {"error": str(exc)}
        return jsonify(failure), 500
    else:
        return jsonify({"models": models})
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
@app.route("/references/status")
def references_status():
    """Return the result of reference_sync_status() as JSON."""
    status = reference_sync_status()
    return jsonify(status)
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@app.route("/references/save", methods=["POST"])
def save_references():
    """Save the manual references via save_manual_references_to_hub and report the outcome.

    On success the helper's result mapping is merged into the JSON body next to
    a confirmation message; any failure yields {"error": ...} with HTTP 400.
    """
    try:
        result = save_manual_references_to_hub(APP_ROOT)
    except Exception as exc:
        return jsonify({"error": str(exc)}), 400

    body = {"message": "Manual references saved to Hugging Face.", **result}
    return jsonify(body)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
@app.route("/sheets")
def sheets_endpoint():
    """Return the sheet names of the workbook named by the ``path`` query argument.

    Responses:
        200 {"sheets": [...]}  on success.
        400 {"error": ...}     when resolve_allowed_path rejects the path with a
                               ValueError (same handling as /run and /apply).
        404 {"error": ...}     when the resolved path is not an existing file.
        500 {"error": ...}     for any other failure, e.g. an unreadable workbook.
    """
    try:
        workbook_path = resolve_allowed_path(request.args.get("path", ""), APP_ROOT, ALLOWED_FILE_ROOTS)
    except ValueError as exc:
        # A rejected client-supplied path is a client error, not a server fault.
        return jsonify({"error": str(exc)}), 400
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500

    try:
        if not workbook_path.is_file():
            return jsonify({"error": "Workbook is not available."}), 404
        return jsonify({"sheets": read_workbook_sheets(workbook_path)})
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
@app.route("/download-blueprint")
def download_blueprint():
    """Send the blueprint at <APP_ROOT>/<DATA_DIR>/Blueprint.xlsx as an attachment, or a JSON 404."""
    blueprint_path = APP_ROOT / DATA_DIR / "Blueprint.xlsx"
    if blueprint_path.exists():
        return send_file(blueprint_path, as_attachment=True, download_name="Blueprint.xlsx")
    return jsonify({"error": "Blueprint has not been generated yet."}), 404
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
@app.route("/download-cleaned-workbook")
def download_cleaned_workbook():
    """Send the workbook stored in STATE["clean_path"] as an attachment.

    The download name is the uploaded filename (or the file's own name)
    prefixed with "cleaned_". Returns a JSON 404 when no workbook is recorded
    or the recorded file no longer exists.
    """

    def _unavailable():
        return jsonify({"error": "Cleaned workbook is not available."}), 404

    stored_path = STATE["clean_path"]
    if not stored_path:
        return _unavailable()

    workbook_path = resolve_allowed_path(stored_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    if not workbook_path.is_file():
        return _unavailable()

    original_name = STATE["clean_filename"] or workbook_path.name
    return send_file(
        workbook_path,
        as_attachment=True,
        download_name=f"cleaned_{original_name}",
    )
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
@app.route("/download-applied-workbook")
def download_applied_workbook():
    """Send the workbook stored in STATE["apply_workbook_path"] as an attachment.

    The download name is the uploaded filename (or the file's own name)
    prefixed with "cleaned_". Returns a JSON 404 when no workbook is recorded
    or the recorded file no longer exists.
    """

    def _unavailable():
        return jsonify({"error": "Applied workbook is not available."}), 404

    stored_path = STATE["apply_workbook_path"]
    if not stored_path:
        return _unavailable()

    workbook_path = resolve_allowed_path(stored_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    if not workbook_path.is_file():
        return _unavailable()

    original_name = STATE["apply_workbook_filename"] or workbook_path.name
    return send_file(
        workbook_path,
        as_attachment=True,
        download_name=f"cleaned_{original_name}",
    )
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
@app.route("/run")
def run():
    """Launch main.py on the requested workbook and stream its events to the client.

    Query arguments: ``input`` and ``sheet`` (both required), ``output_sheet``
    (defaults to DEFAULT_OUTPUT_SHEET_NAME), ``models`` (optional, forwarded
    as --models) and ``job_id`` (a random hex id is generated when absent).
    Returns a JSON 400 when a required argument is missing or the input path
    is rejected; otherwise a text/event-stream response fed by stream_process.
    """
    args = request.args
    job_id = args.get("job_id")
    if job_id is None:
        job_id = uuid.uuid4().hex
    input_path = args.get("input", "")
    sheet = args.get("sheet", "")
    output_sheet = args.get("output_sheet", DEFAULT_OUTPUT_SHEET_NAME)
    model_list = args.get("models", "")

    if not (input_path and sheet):
        return jsonify({"error": "Input file and source sheet are required."}), 400

    try:
        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # "-u" keeps the child's output unbuffered so it can be relayed as it is produced.
    command = [sys.executable, "-u", str(APP_ROOT / "main.py")]
    command += ["--input", str(workbook_path)]
    command += ["--sheet", sheet]
    command += ["--output_sheet", output_sheet]
    if model_list:
        command += ["--models", model_list]

    events = stream_process(command, cwd=APP_ROOT, job_id=job_id)
    return Response(events, mimetype="text/event-stream")
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
@app.route("/stop", methods=["POST"])
def stop():
    """Ask stop_process to stop the run named by ``job_id``; JSON 404 when it reports failure."""
    job_id = request.args.get("job_id", "")
    if stop_process(job_id):
        return jsonify({"stopped": True})
    return jsonify({"stopped": False, "message": "No active run found."}), 404
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
@app.route("/apply")
def apply_blueprint():
    """Launch apply_blueprint.py and stream its events to the client.

    Query arguments: ``input`` (workbook path), ``blueprint`` (blueprint path)
    and ``sheet`` (defaults to DEFAULT_OUTPUT_SHEET_NAME). Returns a JSON 400
    when any of them is empty or a path is rejected; otherwise a
    text/event-stream response fed by stream_process.
    """
    args = request.args
    input_path = args.get("input", "")
    blueprint_path = args.get("blueprint", "")
    sheet = args.get("sheet", DEFAULT_OUTPUT_SHEET_NAME)

    if not (input_path and blueprint_path and sheet):
        return jsonify({"error": "Workbook, blueprint, and target sheet are required."}), 400

    try:
        workbook_path = resolve_allowed_path(input_path, APP_ROOT, ALLOWED_FILE_ROOTS)
        resolved_blueprint_path = resolve_allowed_path(blueprint_path, APP_ROOT, ALLOWED_FILE_ROOTS)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # "-u" keeps the child's output unbuffered so it can be relayed as it is produced.
    command = [sys.executable, "-u", str(APP_ROOT / "apply_blueprint.py")]
    command += ["--input", str(workbook_path)]
    command += ["--blueprint", str(resolved_blueprint_path)]
    command += ["--sheet", sheet]

    events = stream_process(command, cwd=APP_ROOT)
    return Response(events, mimetype="text/event-stream")
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
if __name__ == "__main__":
    # Threaded so other requests can be served while an event-stream response is open.
    app.run(threaded=True, debug=False)
|